Пример #1
0
    def test_unknown_uris(self):
        """
        A uri is reported as unknown on its first lookup and as known on any
        later lookup once it was added via `add_if_unknown=True`.
        """
        uri_filter = UniqueUriFilter('sha1')
        sample_uris = ("http://www.google.de", "http://www.yahoo.com")

        # first sighting: not known yet, but remembered as a side effect
        for uri in sample_uris:
            first_lookup = uri_filter.is_known(uri, add_if_unknown=True)
            self.assertFalse(first_lookup)

        # second sighting: the filter has to recognize both uris now
        for uri in sample_uris:
            second_lookup = uri_filter.is_known(uri)
            self.assertTrue(second_lookup)
Пример #2
0
    def __init__(self,
                 settings,
                 log_handler,
                 front_end_queues,
                 prioritizer,
                 unique_hash='sha1'):
        """
        Initialize the frontier on top of the given front end queues.

        By default the unique uri filter uses the `sha1` hash function. For
        very large crawls you might want to use a larger hash function
        (`sha512`, e.g.).

        :param settings: settings object; `LOG_LEVEL_MASTER`,
            `FRONTIER_CHECKPOINTING`, `FRONTIER_HEAP_SIZE`,
            `FRONTIER_HEAP_MIN` and `LOCAL_TIMEZONE` are read here and the
            object is also handed to the :class:`DnsCache`.
        :param log_handler: handler passed on to :class:`LoggingMixin`.
        :param front_end_queues: the front end queues; must provide
            `all_uris()` yielding every uri already stored in them.
        :param prioritizer: the prioritizer, stored for later use.
        :param unique_hash: name of the hash function handed to the
            :class:`UniqueUriFilter`.
        :raises AssertionError: if `all_uris()` yields a uri the filter
            already knows, i.e. a duplicate (only when assertions are
            enabled).
        """
        LoggingMixin.__init__(self, log_handler, settings.LOG_LEVEL_MASTER)
        # front end queue
        self._prioritizer = prioritizer
        self._front_end_queues = front_end_queues
        # checkpointing
        self._checkpoint_interval = settings.FRONTIER_CHECKPOINTING
        self._uris_added = 0

        # the heap
        self._heap = PriorityQueue(maxsize=settings.FRONTIER_HEAP_SIZE)
        self._heap_min_size = settings.FRONTIER_HEAP_MIN

        # a list of uris currently being crawled.
        self._current_uris = dict()
        # dns cache
        self._dns_cache = DnsCache(settings)
        # unique uri filter
        self._unique_uri = UniqueUriFilter(unique_hash)
        for url in self._front_end_queues.all_uris():
            # Seeding the filter is a required side effect, so it must not
            # live inside the `assert`: with `python -O` the whole assert
            # statement is dropped and the filter would stay empty.
            already_known = self._unique_uri.is_known(url,
                                                      add_if_unknown=True)
            assert not already_known, \
                "duplicate uri in front end queues: %s" % (url,)

        # the sinks
        self._sinks = []

        # timezone
        self._timezone = settings.LOCAL_TIMEZONE
        # `_logger` is not set in this method; presumably provided by
        # LoggingMixin.__init__ above -- verify against the mixin.
        self._logger.info("frontier::initialized")