Example
import logging
import re
import time

# Processor, UrlTracker, UrlProduct, urlfetch and dump_url_content come from
# the surrounding crawler project; their import paths are project-specific
# and omitted here.
class UrlProcessor(Processor):
    """This processor fetches content from the network and dumps it to the
    local file system.
    """

    def __init__(self, name=None, inbox=None, outbox=None, concurrency=1, settings=None):
        Processor.__init__(self, name, inbox, outbox, concurrency, settings)
        self.urltracker = UrlTracker()

    def precheck(self, material):
        ''' Check the material (mainly its url) before processing it.

        Returns True by default, or False if:
            1) the depth of the url exceeds settings.max_depth;
            2) the url has already been fetched;
            3) the interval since the last fetch from the same network
               location is shorter than settings.crawl_interval (in this
               case the processor resubmits the material after a proper
               delay).
        '''
        url, depth = material.url, material.depth
        # check depth
        if depth > self.settings.max_depth:
            return False
        # check whether the url was already fetched, to avoid repeats
        if self.urltracker.is_tracked(url):
            return False
        # check crawl interval
        netloc_time = self.urltracker.netloc_time(url)
        timedelta = int(time.time()) - netloc_time
        if timedelta < self.settings.crawl_interval:
            self.delay(material, timedelta)  # resubmit after a delay
            return False
        self.urltracker.track(url, depth)
        return True

    def process(self, material):
        """Fetch the content from a url in the material, dump it to a file,
        and return a product with the file name.

        After fetching the url, if the content mimetype is:
            1) text/html, the processor dump the content and return the product
               with the path to the dumpped file.
            2) not text/html, but match the target_url pattern, then dump the
               content and return None

        Args:
            material: a UrlMaterial to be processed

        Returns:
            a UrlProduct
        """
        logging.info("processing material %s", material)
        # settings
        dumpdir = self.settings.output_directory
        target_url = self.settings.target_url
        crawl_timeout = self.settings.crawl_timeout

        url, depth = material.url, material.depth

        # check the material before fetching
        if not self.precheck(material):
            return None

        # fetch the content
        fetched = urlfetch(url, crawl_timeout)
        if fetched is None:  # nothing fetched
            return None

        # dump the content
        content, encoding, mimetype = fetched
        if mimetype.startswith('text/html'):
            filepath = dump_url_content(url, content, dumpdir)
            return UrlProduct(processor=self.name, url=url,
                              encoding=encoding, depth=depth,
                              filepath=filepath)
        elif re.match(target_url, url):
            dump_url_content(url, content, dumpdir)
            return None
        else:
            return None
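
A minimal sketch of the precheck rules above. FakeUrlTracker and the
SimpleNamespace settings and material objects are hypothetical stand-ins for
the project's real Settings, UrlMaterial and UrlTracker classes; their
interfaces are assumed here from the way UrlProcessor uses them:

import time
from types import SimpleNamespace
from urllib.parse import urlparse

# Assumed settings fields, taken from the checks in precheck().
settings = SimpleNamespace(max_depth=2, crawl_interval=5)

class FakeUrlTracker:
    """Remembers fetched urls and the last fetch time per network location."""
    def __init__(self):
        self.depths = {}   # url -> depth at which it was fetched
        self.times = {}    # netloc -> unix time of the last fetch
    def is_tracked(self, url):
        return url in self.depths
    def netloc_time(self, url):
        return self.times.get(urlparse(url).netloc, 0)
    def track(self, url, depth):
        self.depths[url] = depth
        self.times[urlparse(url).netloc] = int(time.time())

tracker = FakeUrlTracker()
material = SimpleNamespace(url='http://example.com/a.html', depth=1)

# The same three checks precheck() performs, in the same order:
accepted = (material.depth <= settings.max_depth
            and not tracker.is_tracked(material.url)
            and int(time.time()) - tracker.netloc_time(material.url)
                >= settings.crawl_interval)
if accepted:
    tracker.track(material.url, material.depth)

print(accepted)                          # True: a first visit is accepted
print(tracker.is_tracked(material.url))  # True: a repeat would now be rejected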
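
The mimetype dispatch at the end of process() can be exercised in isolation.
classify below is a hypothetical helper that mirrors that branch, and the
target_url pattern is an assumed example:

import re

target_url = r'.*\.pdf$'   # assumed stand-in for settings.target_url

def classify(url, mimetype):
    """Mirror the final branch of process(): html content yields a product
    with the dumped file path; non-html content whose url matches the
    target_url pattern is dumped with no product; everything else is skipped.
    """
    if mimetype.startswith('text/html'):
        return 'product'
    if re.match(target_url, url):
        return 'dump, no product'
    return 'skipped'

print(classify('http://example.com/a.html', 'text/html'))           # product
print(classify('http://example.com/paper.pdf', 'application/pdf'))  # dump, no product
print(classify('http://example.com/logo.png', 'image/png'))         # skipped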