import logging
import re
import time

# Processor, UrlTracker, UrlProduct, urlfetch and dump_url_content are
# expected to be provided by the surrounding project.


class UrlProcessor(Processor):
    """The processor fetches content from the network and dumps it to the
    local file system.
    """

    def __init__(self, name=None, inbox=None, outbox=None, concurrency=1,
                 settings=None):
        Processor.__init__(self, name, inbox, outbox, concurrency, settings)
        self.urltracker = UrlTracker()

    def precheck(self, material):
        """Check the material (mainly its url) before processing it.

        It returns True by default, or False if:
        1) the depth of the url exceeds settings.max_depth;
        2) the url has already been fetched;
        3) the interval since the last fetch from the same network location
           is shorter than settings.crawl_interval (in this case the
           processor resubmits the material after a proper delay).
        """
        url, depth = material.url, material.depth

        # check depth
        if depth > self.settings.max_depth:
            return False

        # check is_fetched, to avoid repeated fetching
        if self.urltracker.is_tracked(url):
            return False

        # check crawl interval; if the last fetch from this network location
        # was too recent, resubmit after the remainder of the interval
        netloc_time = self.urltracker.netloc_time(url)
        timedelta = int(time.time()) - netloc_time
        if timedelta < self.settings.crawl_interval:
            self.delay(material, self.settings.crawl_interval - timedelta)
            return False

        self.urltracker.track(url, depth)
        return True

    def process(self, material):
        """Fetch the content from a url in the material, dump it to a file,
        and return a product with the file name.

        After fetching the url, if the content mimetype is:
        1) text/html: the processor dumps the content and returns a product
           with the path to the dumped file.
        2) not text/html, but the url matches the target_url pattern: the
           processor dumps the content and returns None.

        Args:
            material: a UrlMaterial to be processed

        Returns:
            a UrlProduct, or None
        """
        logging.info("processing material %s", material)

        # settings
        dumpdir = self.settings.output_directory
        target_url = self.settings.target_url
        crawl_timeout = self.settings.crawl_timeout
        url, depth = material.url, material.depth

        # check the material
        if not self.precheck(material):
            return None

        # fetch the content
        fetched = urlfetch(url, crawl_timeout)
        if fetched is None:
            return None

        # dump the content
        content, encoding, mimetype = fetched
        if mimetype.startswith('text/html'):
            filepath = dump_url_content(url, content, dumpdir)
            return UrlProduct(processor=self.name, url=url,
                              encoding=encoding, depth=depth,
                              filepath=filepath)
        elif re.match(target_url, url):
            dump_url_content(url, content, dumpdir)
            return None
        else:
            return None
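
# The processor relies on two helpers, urlfetch and dump_url_content, defined
# elsewhere in the project. The sketch below is a minimal, hypothetical
# rendition of what they might look like, assuming urlfetch returns a
# (content, encoding, mimetype) tuple or None on failure, and that
# dump_url_content writes the content into dumpdir and returns the file path;
# the project's real implementations may differ.

import hashlib
import os
import urllib.error
import urllib.request


def urlfetch(url, timeout):
    """Fetch a url; return (content, encoding, mimetype), or None on error."""
    try:
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            content = resp.read()
            mimetype = resp.headers.get_content_type()
            encoding = resp.headers.get_content_charset() or 'utf-8'
            return content, encoding, mimetype
    except (urllib.error.URLError, OSError):
        return None


def dump_url_content(url, content, dumpdir):
    """Write content to a file named after the url's hash; return the path."""
    os.makedirs(dumpdir, exist_ok=True)
    filename = hashlib.md5(url.encode('utf-8')).hexdigest()
    filepath = os.path.join(dumpdir, filename)
    with open(filepath, 'wb') as f:
        f.write(content)
    return filepath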
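
# A minimal usage sketch. It assumes Processor.__init__ stores the settings
# object on self.settings; the settings object is modeled here with
# types.SimpleNamespace carrying the attributes the processor reads, and the
# material with a stand-in exposing .url and .depth. The real project types
# (Settings, UrlMaterial) may have different constructors.
if __name__ == '__main__':
    from types import SimpleNamespace

    settings = SimpleNamespace(max_depth=3,
                               crawl_interval=1,
                               crawl_timeout=10,
                               output_directory='./dump',
                               target_url=r'.*\.(gif|png|jpg|bmp)$')
    processor = UrlProcessor(name='url_processor', settings=settings)
    material = SimpleNamespace(url='http://example.com', depth=0)
    print(processor.process(material))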