def test_unknown_uris(self):
    """
    Unknown uris are reported as unknown once and as known afterwards.

    Each uri is first queried with `add_if_unknown=True` (must be unknown),
    then queried again without adding (must now be known).
    """
    uris = ("http://www.google.de", "http://www.yahoo.com")
    uri_filter = UniqueUriFilter("sha1")

    # first pass: nothing is known yet, but every uri gets remembered
    for uri in uris:
        first_seen = uri_filter.is_known(uri, add_if_unknown=True)
        self.assertFalse(first_seen)

    # second pass: the filter has to recognize both uris
    for uri in uris:
        seen_again = uri_filter.is_known(uri)
        self.assertTrue(seen_again)
def test_unknown_uris(self):
    """
    Check the add-then-lookup round trip of the unique uri filter.

    A uri queried with `add_if_unknown=True` is unknown the first time and
    must be known on every later lookup.
    """
    google = "http://www.google.de"
    yahoo = "http://www.yahoo.com"
    known_uris = UniqueUriFilter('sha1')

    # register both uris; neither may be known beforehand
    for candidate in (google, yahoo):
        self.assertFalse(
            known_uris.is_known(candidate, add_if_unknown=True))

    # plain lookups must now succeed for both
    for candidate in (google, yahoo):
        self.assertTrue(known_uris.is_known(candidate))
def __init__(self, settings, log_handler, front_end_queues, prioritizer,
    unique_hash='sha1'):
    """
    Initialize the frontier.

    `settings` provides `LOG_LEVEL_MASTER`, `FRONTIER_CHECKPOINTING`,
    `FRONTIER_HEAP_SIZE`, `FRONTIER_HEAP_MIN` and `LOCAL_TIMEZONE`; it is
    also handed to the :class:`DnsCache`.

    `log_handler` is passed on to the :class:`LoggingMixin`.

    `front_end_queues` is the queue storage used by the frontier. All uris
    it returns from `all_uris()` are registered in the unique uri filter.

    `prioritizer` is stored for calculating priorities later on.

    `unique_hash` is the name of the hash function for the unique uri
    filter. The default is `sha1`. For very large crawls you might want to
    use a larger hash function (`sha512`, e.g.)

    Raises an `AssertionError` if `all_uris()` yields the same uri twice
    (only when assertions are enabled).
    """
    LoggingMixin.__init__(self, log_handler, settings.LOG_LEVEL_MASTER)

    # front end queue
    self._prioritizer = prioritizer
    self._front_end_queues = front_end_queues

    # checkpointing
    self._checkpoint_interval = settings.FRONTIER_CHECKPOINTING
    self._uris_added = 0

    # the heap
    self._heap = PriorityQueue(maxsize=settings.FRONTIER_HEAP_SIZE)
    self._heap_min_size = settings.FRONTIER_HEAP_MIN

    # a list of uris currently being crawled.
    self._current_uris = dict()

    # dns cache
    self._dns_cache = DnsCache(settings)

    # unique uri filter
    self._unique_uri = UniqueUriFilter(unique_hash)
    for url in self._front_end_queues.all_uris():
        # The filter has to be filled *outside* of the assert statement:
        # assert statements are removed when running with `python -O` and
        # the filter would silently stay empty.
        already_known = self._unique_uri.is_known(url, add_if_unknown=True)
        assert not already_known, \
            "duplicate uri in the front end queues: %s" % (url,)

    # the sinks
    self._sinks = []

    # timezone
    self._timezone = settings.LOCAL_TIMEZONE
    self._logger.info("frontier::initialized")
# NOTE(review): listing `object` before `LoggingMixin` only yields a valid
# MRO if `LoggingMixin` is an old-style (Python 2) class -- confirm before
# porting this module.
class AbstractBaseFrontier(object, LoggingMixin):
    """
    A base class for implementing frontiers.

    Basically this class provides the different general methods and
    configuration parameters used for frontiers.
    """

    def __init__(self, settings, log_handler, front_end_queues, prioritizer,
        unique_hash='sha1'):
        """
        Initialize the frontier.

        `settings` provides `LOG_LEVEL_MASTER`, `FRONTIER_CHECKPOINTING`,
        `FRONTIER_HEAP_SIZE`, `FRONTIER_HEAP_MIN` and `LOCAL_TIMEZONE`; it is
        also handed to the :class:`DnsCache`.

        `front_end_queues` is the queue storage used by the frontier. All
        uris it returns from `all_uris()` are registered in the unique uri
        filter.

        The default frontier will use the `sha1` hash function for the
        unique uri filter. For very large crawls you might want to use a
        larger hash function (`sha512`, e.g.)

        Raises an `AssertionError` if `all_uris()` yields the same uri twice
        (only when assertions are enabled).
        """
        LoggingMixin.__init__(self, log_handler, settings.LOG_LEVEL_MASTER)

        # front end queue
        self._prioritizer = prioritizer
        self._front_end_queues = front_end_queues

        # checkpointing
        self._checkpoint_interval = settings.FRONTIER_CHECKPOINTING
        self._uris_added = 0

        # the heap
        self._heap = PriorityQueue(maxsize=settings.FRONTIER_HEAP_SIZE)
        self._heap_min_size = settings.FRONTIER_HEAP_MIN

        # a list of uris currently being crawled.
        self._current_uris = dict()

        # dns cache
        self._dns_cache = DnsCache(settings)

        # unique uri filter
        self._unique_uri = UniqueUriFilter(unique_hash)
        for url in self._front_end_queues.all_uris():
            # The filter has to be filled *outside* of the assert statement:
            # assert statements are removed when running with `python -O`
            # and the filter would silently stay empty.
            already_known = self._unique_uri.is_known(url,
                add_if_unknown=True)
            assert not already_known, \
                "duplicate uri in the front end queues: %s" % (url,)

        # the sinks
        self._sinks = []

        # timezone
        self._timezone = settings.LOCAL_TIMEZONE
        self._logger.info("frontier::initialized")

    def add_sink(self, sink):
        """
        Add a sink to the frontier. A sink will be responsible for the long
        term storage of the crawled contents.
        """
        self._sinks.append(sink)

    def add_uri(self, curi):
        """
        Add the specified :class:`CrawlUri` to the frontier.

        Uris that are already known to the unique uri filter are not added
        again; this is only logged on debug level.

        Note: time based crawling is never strict, it is generally used as
        some kind of prioritization.
        """
        if self._unique_uri.is_known(curi.url, add_if_unknown=True):
            # we already know this uri
            self._logger.debug("frontier::Trying to update a known uri... " + \
                    "(%s)" % (curi.url,))
            return

        self._logger.info("frontier::Adding '%s' to the frontier" % curi.url)
        self._front_end_queues.add_uri(self._uri_from_curi(curi))
        self._maybe_checkpoint()

    def update_uri(self, curi):
        """
        Update a given uri in the front end queues.
        """
        self._front_end_queues.update_uri(self._uri_from_curi(curi))
        self._maybe_checkpoint()

    def get_next(self):
        """
        Return the next uri scheduled for crawling.

        The heap is refilled via `_update_heap` when it holds less than the
        configured minimum of uris. Raises `Empty` if there is nothing to
        crawl right now.
        """
        if self._heap.qsize() < self._heap_min_size:
            self._update_heap()

        try:
            (_next_date, next_uri) = self._heap.get_nowait()
        except Empty:
            # heap is empty, there is nothing to crawl right now!
            # maybe log this in the future
            raise

        return self._crawluri_from_uri(next_uri)

    def close(self):
        """
        Checkpoint and close the underlying frontend queues.
        """
        self._front_end_queues.checkpoint()
        self._front_end_queues.close()

    def _add_to_heap(self, uri, next_date):
        """
        Add an URI to the heap that is ready to be crawled.

        `uri` is the uri tuple `(url, etag, mod_date, next_date, prio)`. It
        is also remembered as being currently crawled.
        """
        self._heap.put_nowait((next_date, uri))
        (url, _etag, _mod_date, _next_date, _prio) = uri
        self._current_uris[url] = uri
        self._logger.debug("frontier::Adding '%s' to the heap" % url)

    def _reschedule_uri(self, curi):
        """
        Return the tuple `(prio, next_crawl_date)` for :class:`CrawlUri`s.

        The prioritizer returns the priority and a delta that is added to
        the current time; the result is converted to a timestamp.
        """
        (prio, delta) = self._prioritizer.calculate_priority(curi)
        now = datetime.now(self._timezone)
        return (prio, time.mktime((now + delta).timetuple()))

    def _ignore_uri(self, curi):
        """
        Ignore a :class:`CrawlUri` from now on.
        """
        self._front_end_queues.ignore_uri(curi.url, curi.status_code)

    def _uri_from_curi(self, curi):
        """
        Create the uri tuple from the :class:`CrawlUri` and calculate the
        priority.

        Returns `(url, etag, mod_date, next_crawl_date, prio)`. `mod_date` is
        taken from the `Last-Modified` response header and falls back to the
        `Date` header.

        Overwrite this method in more specific frontiers.
        """
        etag = mod_date = None
        if curi.rep_header:
            if "Etag" in curi.rep_header:
                etag = curi.rep_header["Etag"]
            if "Last-Modified" in curi.rep_header:
                mod_date = time.mktime(deserialize_date_time(
                    curi.rep_header["Last-Modified"]).timetuple())
            if not mod_date and 'Date' in curi.rep_header:
                mod_date = time.mktime(deserialize_date_time(
                    curi.rep_header["Date"]).timetuple())

        if mod_date:
            # only reschedule if it has been crawled before
            (prio, next_crawl_date) = self._reschedule_uri(curi)
        else:
            (prio, next_crawl_date) = (1,
                time.mktime(datetime.now(self._timezone).timetuple()))

        return (curi.url, etag, mod_date, next_crawl_date, prio)

    def _crawluri_from_uri(self, uri):
        """
        Convert an URI tuple to a :class:`CrawlUri`.

        Replace the hostname with the real IP in order to cache DNS queries.
        Only the network location of the url is replaced; occurrences of the
        hostname in the path or the query string are left untouched.
        """
        (url, etag, mod_date, _next_date, prio) = uri

        parsed_url = urlparse(url)

        # dns resolution and caching
        port = parsed_url.port
        if not port:
            port = PROTOCOLS_DEFAULT_PORT[parsed_url.scheme]

        effective_netloc = self._dns_cache["%s:%s" % (parsed_url.hostname,
            port)]

        curi = CrawlUri(url)
        # The network location directly follows the first "//" of the url.
        # Restricting the replacement to that single occurrence keeps urls
        # such as "http://host/?ref=host" intact.
        (scheme_part, separator, remainder) = url.partition("//")
        curi.effective_url = scheme_part + separator + remainder.replace(
            parsed_url.netloc, "%s:%s" % effective_netloc, 1)
        curi.current_priority = prio
        curi.req_header = dict()
        if etag:
            curi.req_header["Etag"] = etag
        if mod_date:
            mod_date_time = datetime.fromtimestamp(mod_date)
            curi.req_header["Last-Modified"] = serialize_date_time(
                mod_date_time)

        curi.optional_vars = dict()
        if parsed_url.username and parsed_url.password:
            curi.optional_vars[CURI_SITE_USERNAME] = \
                parsed_url.username.encode()
            curi.optional_vars[CURI_SITE_PASSWORD] = \
                parsed_url.password.encode()

        return curi

    def _update_heap(self):
        """
        Abstract method. Implement this in the actual Frontier.

        The implementation should really only add uris to the heap if they
        can be downloaded right away.
        """
        pass

    def _maybe_checkpoint(self, force_checkpoint=False):
        """
        Periodically checkpoint the state db.

        Every call counts as one added uri. The front end queues are
        checkpointed once the counter exceeds the checkpoint interval or if
        `force_checkpoint` is set.
        """
        self._uris_added += 1
        if self._uris_added > self._checkpoint_interval or force_checkpoint:
            self._front_end_queues.checkpoint()
            self._uris_added = 0

    def process_successful_crawl(self, curi):
        """
        Called when an URI has been crawled successfully.

        `curi` is a :class:`CrawlUri`. The uri is updated, the newline
        separated extracted urls are added to the frontier and all sinks are
        notified.
        """
        self.update_uri(curi)

        if curi.optional_vars and CURI_EXTRACTED_URLS in curi.optional_vars:
            for url in curi.optional_vars[CURI_EXTRACTED_URLS].split("\n"):
                if len(url) > 5 and not self._unique_uri.is_known(url):
                    self.add_uri(CrawlUri(url))

        del self._current_uris[curi.url]

        for sink in self._sinks:
            sink.process_successful_crawl(curi)

    def process_not_found(self, curi):
        """
        Called when an URL was not found.

        This could mean, that the URL has been removed from the server. If
        so, do something about it!

        Override this method in the actual frontier implementation.
        """
        del self._current_uris[curi.url]
        self._ignore_uri(curi)

        for sink in self._sinks:
            sink.process_not_found(curi)

    def process_redirect(self, curi):
        """
        Called when there were too many redirects for an URL, or the site has
        not been updated since the last visit.

        In the latter case, update the internal uri and increase the priority
        level.
        """
        del self._current_uris[curi.url]
        if curi.status_code in [301, 302]:
            # simply ignore the URL. The URL that is being redirected to is
            # extracted and added in the processing
            self._ignore_uri(curi)

        if curi.status_code == 304:
            # the page has not been modified since the last visit! Update it
            # NOTE: prio increasing happens in the prioritizer
            self.update_uri(curi)

        for sink in self._sinks:
            sink.process_redirect(curi)

    def process_server_error(self, curi):
        """
        Called when there was some kind of server error.

        Override this method in the actual frontier implementation.
        """
        del self._current_uris[curi.url]
        self._ignore_uri(curi)

        for sink in self._sinks:
            sink.process_server_error(curi)
# NOTE(review): listing `object` before `LoggingMixin` only yields a valid
# MRO if `LoggingMixin` is an old-style (Python 2) class -- confirm before
# porting this module.
class AbstractBaseFrontier(object, LoggingMixin):
    """
    A base class for implementing frontiers.

    Basically this class provides the different general methods and
    configuration parameters used for frontiers.
    """

    def __init__(self, settings, log_handler, front_end_queues, prioritizer,
                 unique_hash='sha1'):
        """
        Initialize the frontier.

        `settings` provides `LOG_LEVEL_MASTER`, `FRONTIER_CHECKPOINTING`,
        `FRONTIER_HEAP_SIZE`, `FRONTIER_HEAP_MIN` and `LOCAL_TIMEZONE`; it is
        also handed to the :class:`DnsCache`.

        `front_end_queues` is the queue storage used by the frontier. All
        uris it returns from `all_uris()` are registered in the unique uri
        filter.

        The default frontier will use the `sha1` hash function for the
        unique uri filter. For very large crawls you might want to use a
        larger hash function (`sha512`, e.g.)

        Raises an `AssertionError` if `all_uris()` yields the same uri twice
        (only when assertions are enabled).
        """
        LoggingMixin.__init__(self, log_handler, settings.LOG_LEVEL_MASTER)

        # front end queue
        self._prioritizer = prioritizer
        self._front_end_queues = front_end_queues

        # checkpointing
        self._checkpoint_interval = settings.FRONTIER_CHECKPOINTING
        self._uris_added = 0

        # the heap
        self._heap = PriorityQueue(maxsize=settings.FRONTIER_HEAP_SIZE)
        self._heap_min_size = settings.FRONTIER_HEAP_MIN

        # a list of uris currently being crawled.
        self._current_uris = dict()

        # dns cache
        self._dns_cache = DnsCache(settings)

        # unique uri filter
        self._unique_uri = UniqueUriFilter(unique_hash)
        for url in self._front_end_queues.all_uris():
            # The filter has to be filled *outside* of the assert statement:
            # assert statements are removed when running with `python -O`
            # and the filter would silently stay empty.
            already_known = self._unique_uri.is_known(
                url, add_if_unknown=True)
            assert not already_known, \
                "duplicate uri in the front end queues: %s" % (url,)

        # the sinks
        self._sinks = []

        # timezone
        self._timezone = settings.LOCAL_TIMEZONE
        self._logger.info("frontier::initialized")

    def add_sink(self, sink):
        """
        Add a sink to the frontier. A sink will be responsible for the long
        term storage of the crawled contents.
        """
        self._sinks.append(sink)

    def add_uri(self, curi):
        """
        Add the specified :class:`CrawlUri` to the frontier.

        Uris that are already known to the unique uri filter are not added
        again; this is only logged on debug level.

        Note: time based crawling is never strict, it is generally used as
        some kind of prioritization.
        """
        if self._unique_uri.is_known(curi.url, add_if_unknown=True):
            # we already know this uri
            self._logger.debug("frontier::Trying to update a known uri... " + \
                               "(%s)" % (curi.url,))
            return

        self._logger.info("frontier::Adding '%s' to the frontier" % curi.url)
        self._front_end_queues.add_uri(self._uri_from_curi(curi))
        self._maybe_checkpoint()

    def update_uri(self, curi):
        """
        Update a given uri in the front end queues.
        """
        self._front_end_queues.update_uri(self._uri_from_curi(curi))
        self._maybe_checkpoint()

    def get_next(self):
        """
        Return the next uri scheduled for crawling.

        The heap is refilled via `_update_heap` when it holds less than the
        configured minimum of uris. Raises `Empty` if there is nothing to
        crawl right now.
        """
        if self._heap.qsize() < self._heap_min_size:
            self._update_heap()

        try:
            (_next_date, next_uri) = self._heap.get_nowait()
        except Empty:
            # heap is empty, there is nothing to crawl right now!
            # maybe log this in the future
            raise

        return self._crawluri_from_uri(next_uri)

    def close(self):
        """
        Checkpoint and close the underlying frontend queues.
        """
        self._front_end_queues.checkpoint()
        self._front_end_queues.close()

    def _crawl_now(self, uri):
        """
        Convenience method for crawling an uri right away.
        """
        # 3000 is used as the `next_date` sort key of the heap entry;
        # presumably it is meant to sort before any real timestamp --
        # TODO confirm.
        self._add_to_heap(uri, 3000)

    def _add_to_heap(self, uri, next_date):
        """
        Add an URI to the heap that is ready to be crawled.

        `uri` is the uri tuple `(url, etag, mod_date, next_date, prio)`. It
        is also remembered as being currently crawled.
        """
        self._heap.put_nowait((next_date, uri))
        (url, _etag, _mod_date, _next_date, _prio) = uri
        self._current_uris[url] = uri
        self._logger.debug("frontier::Adding '%s' to the heap" % url)

    def _reschedule_uri(self, curi):
        """
        Return the tuple `(prio, next_crawl_date)` for :class:`CrawlUri`s.

        The prioritizer returns the priority and a delta that is added to
        the current time; the result is converted to a timestamp.
        """
        (prio, delta) = self._prioritizer.calculate_priority(curi)
        now = datetime.now(self._timezone)
        return (prio, time.mktime((now + delta).timetuple()))

    def _ignore_uri(self, curi):
        """
        Ignore a :class:`CrawlUri` from now on.
        """
        self._front_end_queues.ignore_uri(curi.url, curi.status_code)

    def _uri_from_curi(self, curi):
        """
        Create the uri tuple from the :class:`CrawlUri` and calculate the
        priority.

        Returns `(url, etag, mod_date, next_crawl_date, prio)`. `mod_date` is
        taken from the `Last-Modified` response header and falls back to the
        `Date` header.

        Overwrite this method in more specific frontiers.
        """
        etag = mod_date = None
        if curi.rep_header:
            if "Etag" in curi.rep_header:
                etag = curi.rep_header["Etag"]
            if "Last-Modified" in curi.rep_header:
                mod_date = time.mktime(
                    deserialize_date_time(
                        curi.rep_header["Last-Modified"]).timetuple())
            if not mod_date and 'Date' in curi.rep_header:
                mod_date = time.mktime(
                    deserialize_date_time(curi.rep_header["Date"]).timetuple())

        if mod_date:
            # only reschedule if it has been crawled before
            (prio, next_crawl_date) = self._reschedule_uri(curi)
        else:
            (prio, next_crawl_date) = (1, time.mktime(
                datetime.now(self._timezone).timetuple()))

        return (curi.url, etag, mod_date, next_crawl_date, prio)

    def _crawluri_from_uri(self, uri):
        """
        Convert an URI tuple to a :class:`CrawlUri`.

        Replace the hostname with the real IP in order to cache DNS queries.
        Only the network location of the url is replaced; occurrences of the
        hostname in the path or the query string are left untouched.
        """
        (url, etag, mod_date, _next_date, prio) = uri

        parsed_url = urlparse(url)

        # dns resolution and caching
        port = parsed_url.port
        if not port:
            port = PROTOCOLS_DEFAULT_PORT[parsed_url.scheme]

        effective_netloc = self._dns_cache["%s:%s" % (parsed_url.hostname,
                                                      port)]

        curi = CrawlUri(url)
        # The network location directly follows the first "//" of the url.
        # Restricting the replacement to that single occurrence keeps urls
        # such as "http://host/?ref=host" intact.
        (scheme_part, separator, remainder) = url.partition("//")
        curi.effective_url = scheme_part + separator + remainder.replace(
            parsed_url.netloc, "%s:%s" % effective_netloc, 1)
        curi.current_priority = prio
        curi.req_header = dict()
        if etag:
            curi.req_header["Etag"] = etag
        if mod_date:
            mod_date_time = datetime.fromtimestamp(mod_date)
            curi.req_header["Last-Modified"] = serialize_date_time(
                mod_date_time)

        curi.optional_vars = dict()
        if parsed_url.username and parsed_url.password:
            curi.optional_vars[CURI_SITE_USERNAME] = \
                parsed_url.username.encode()
            curi.optional_vars[CURI_SITE_PASSWORD] = \
                parsed_url.password.encode()

        return curi

    def _update_heap(self):
        """
        Abstract method. Implement this in the actual Frontier.

        The implementation should really only add uris to the heap if they
        can be downloaded right away.
        """
        pass

    def _maybe_checkpoint(self, force_checkpoint=False):
        """
        Periodically checkpoint the state db.

        Every call counts as one added uri. The front end queues are
        checkpointed once the counter exceeds the checkpoint interval or if
        `force_checkpoint` is set.
        """
        self._uris_added += 1
        if self._uris_added > self._checkpoint_interval or force_checkpoint:
            self._front_end_queues.checkpoint()
            self._uris_added = 0

    def process_successful_crawl(self, curi):
        """
        Called when an URI has been crawled successfully.

        `curi` is a :class:`CrawlUri`. The uri is updated, the newline
        separated extracted urls are added to the frontier and all sinks are
        notified.
        """
        self.update_uri(curi)

        if curi.optional_vars and CURI_EXTRACTED_URLS in curi.optional_vars:
            for url in curi.optional_vars[CURI_EXTRACTED_URLS].split("\n"):
                if len(url) > 5 and not self._unique_uri.is_known(url):
                    self.add_uri(CrawlUri(url))

        del self._current_uris[curi.url]

        for sink in self._sinks:
            sink.process_successful_crawl(curi)

    def process_not_found(self, curi):
        """
        Called when an URL was not found.

        This could mean, that the URL has been removed from the server. If
        so, do something about it!

        Override this method in the actual frontier implementation.
        """
        del self._current_uris[curi.url]
        self._ignore_uri(curi)

        for sink in self._sinks:
            sink.process_not_found(curi)

    def process_redirect(self, curi):
        """
        Called when there were too many redirects for an URL, or the site has
        not been updated since the last visit.

        In the latter case, update the internal uri and increase the priority
        level.
        """
        del self._current_uris[curi.url]
        if curi.status_code in [301, 302]:
            # simply ignore the URL. The URL that is being redirected to is
            # extracted and added in the processing
            self._ignore_uri(curi)

        if curi.status_code == 304:
            # the page has not been modified since the last visit! Update it
            # NOTE: prio increasing happens in the prioritizer
            self.update_uri(curi)

        for sink in self._sinks:
            sink.process_redirect(curi)

    def process_server_error(self, curi):
        """
        Called when there was some kind of server error.

        Override this method in the actual frontier implementation.
        """
        del self._current_uris[curi.url]
        self._ignore_uri(curi)

        for sink in self._sinks:
            sink.process_server_error(curi)