Exemplo n.º 1
0
class Goose(object):
    ''' Extract an article from a URL or from previously fetched HTML

        Args:
            config (Configuration, dict): A configuration object, or a dict
                whose keys that match existing configuration attributes
                override the defaults. Anything else yields the defaults.
        Raises:
            Exception: If image fetching is enabled together with a local
                storage path that is not a directory or is not writable '''

    def __init__(self, config=None):
        # Use the passed in configuration if it is of the right type, otherwise
        # use the default as a base
        if isinstance(config, Configuration):
            self.config = config
        else:
            self.config = Configuration()

        # if config was a passed in dict, copy the keys the configuration knows
        # about onto the stored configuration; unknown keys are ignored
        if isinstance(config, dict):
            for key, value in config.items():
                if hasattr(self.config, key):
                    setattr(self.config, key, value)

        # setup a single network connection
        self.fetcher = NetworkFetcher(self.config)
        # The finalizer callback must not own a reference to self: a bound
        # method of self (e.g. self.close) would keep the instance alive until
        # interpreter exit. Closing the fetcher directly lets the instance be
        # garbage collected and still releases the connection.
        self.finalizer = weakref.finalize(self, self.fetcher.close)

        # we don't need to go further if image extractor or local_storage is not set
        storage_path = self.config.local_storage_path
        if not storage_path or not self.config.enable_image_fetching:
            return

        # create the storage directory when missing; exist_ok avoids the race
        # between checking for the directory and creating it
        os.makedirs(storage_path, exist_ok=True)

        if not os.path.isdir(storage_path):
            msg = (
                '{} directory does not seem to exist, you need to set this for '
                'image processing downloads').format(storage_path)
            raise Exception(msg)

        # write a dummy file to the directory to check that it is writable;
        # mkstemp is inside the try because it is the call that fails when the
        # directory is read-only
        try:
            handle, path = mkstemp(dir=storage_path)
            os.close(handle)
            os.remove(path)
        except OSError as ex:
            msg = (
                '{} directory is not writeble, you need to set this for image '
                'processing downloads').format(storage_path)
            raise Exception(msg) from ex

    def __enter__(self):
        ''' Return the instance itself for use in a with statement '''
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        ''' Close the instance when the with block ends; exceptions propagate '''
        self.close()

    def close(self):
        ''' Close the network connection and disarm the automatic cleanup

            Safe to call more than once. '''
        if self.fetcher is not None:
            self.shutdown_network()
        # nothing is left to clean up, so the garbage collection / exit hook
        # must not fire later (detach is a no-op on a dead finalizer)
        self.finalizer.detach()

    def extract(self, url=None, raw_html=None):
        ''' Extract an article from a URL or from raw HTML

            Args:
                url: URL handed to the crawl candidate
                raw_html: HTML content handed to the crawl candidate
            Returns:
                Whatever the crawler returns for the candidate '''
        crawl_candidate = CrawlCandidate(self.config, url, raw_html)
        return self.crawl(crawl_candidate)

    def shutdown_network(self):
        ''' Close the fetcher and drop the reference to it '''
        self.fetcher.close()
        self.fetcher = None
        # the connection is closed already, so the finalizer must not close it
        # a second time when the instance is collected
        finalizer = getattr(self, 'finalizer', None)
        if finalizer is not None:
            finalizer.detach()

    def crawl(self, crawl_candidate):
        ''' Crawl the candidate, falling back to the other available parsers

            Each parser is tried at most once. When a fallback parser
            succeeds it stays selected in the configuration. When every parser
            fails, the originally configured parser is restored and the last
            error is re-raised.

            Args:
                crawl_candidate: The candidate to hand to the crawler
            Returns:
                The article produced by the crawler
            Raises:
                UnicodeDecodeError, ValueError: If all parsers fail '''
        configured_parser = self.config.parser_class
        fallbacks = [parser for parser in self.config.available_parsers
                     if parser != configured_parser]
        while True:
            try:
                crawler = Crawler(self.config, self.fetcher)
                return crawler.crawl(crawl_candidate)
            except (UnicodeDecodeError, ValueError):
                if not fallbacks:
                    self.config.parser_class = configured_parser
                    raise
                # consume the fallback so no parser is ever retried
                self.config.parser_class = fallbacks.pop(0)
Exemplo n.º 2
0
class Goose(object):
    ''' Extract most likely article content and additional metadata from a URL
        or previously fetched HTML document

        Args:
            config (Configuration, dict): A configuration file or dictionary \
            representation of the configuration file
        Returns:
            Goose: An instance of the goose extraction object
        Raises:
            Exception: If image fetching is enabled together with a local \
            storage path that is not a directory or is not writable '''
    def __init__(self, config=None):
        # Use the passed in configuration if it is of the right type, otherwise
        # use the default as a base
        if isinstance(config, Configuration):
            self.config = config
        else:
            self.config = Configuration()

        # if config was a passed in dict, copy the keys the configuration knows
        # about onto the stored configuration; unknown keys are ignored
        if isinstance(config, dict):
            for key, value in config.items():
                if hasattr(self.config, key):
                    setattr(self.config, key, value)

        # setup a single network connection
        self.fetcher = NetworkFetcher(self.config)
        # The finalizer callback must not own a reference to self: a bound
        # method of self (e.g. self.close) would keep the instance alive until
        # interpreter exit. Closing the fetcher directly lets the instance be
        # garbage collected and still releases the connection.
        self.finalizer = weakref.finalize(self, self.fetcher.close)

        # we don't need to go further if image extractor or local_storage is not set
        storage_path = self.config.local_storage_path
        if not storage_path or not self.config.enable_image_fetching:
            return

        # create the storage directory when missing; exist_ok avoids the race
        # between checking for the directory and creating it
        os.makedirs(storage_path, exist_ok=True)

        if not os.path.isdir(storage_path):
            msg = (
                '{} directory does not seem to exist, you need to set this for '
                'image processing downloads').format(storage_path)
            raise Exception(msg)

        # write a dummy file to the directory to check that it is writable;
        # mkstemp is inside the try because it is the call that fails when the
        # directory is read-only
        try:
            handle, path = mkstemp(dir=storage_path)
            os.close(handle)
            os.remove(path)
        except OSError as ex:
            msg = (
                '{} directory is not writeble, you need to set this for image '
                'processing downloads').format(storage_path)
            raise Exception(msg) from ex

    def __enter__(self):
        ''' Setup the context manager '''
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        ''' Define what to do when the context manager exits '''
        self.close()

    def close(self):
        ''' Close the network connection and perform any other required cleanup

            Note:
                Auto closed when using goose as a context manager or when garbage collected. \
                Safe to call more than once. '''
        if self.fetcher is not None:
            self.shutdown_network()
        # nothing is left to clean up, so the garbage collection / exit hook
        # must not fire later (detach is a no-op on a dead finalizer)
        self.finalizer.detach()

    def extract(self, url=None, raw_html=None):
        ''' Extract the most likely article content from the html page

            Args:
                url (str): URL to pull and parse
                raw_html (str): String representation of the HTML page
            Returns:
                Article: Representation of the article contents \
                including other parsed and extracted metadata '''
        crawl_candidate = CrawlCandidate(self.config, url, raw_html)
        return self.__crawl(crawl_candidate)

    def shutdown_network(self):
        ''' Close the network connection

            Note:
                Auto closed when using goose as a context manager or when garbage collected '''
        self.fetcher.close()
        self.fetcher = None
        # the connection is closed already, so the finalizer must not close it
        # a second time when the instance is collected
        finalizer = getattr(self, 'finalizer', None)
        if finalizer is not None:
            finalizer.detach()

    def __crawl(self, crawl_candidate):
        ''' Crawl the candidate, falling back to the other available parsers

            Each parser is tried at most once. When a fallback parser
            succeeds it stays selected in the configuration. When every parser
            fails, the originally configured parser is restored and the last
            error is re-raised.

            Args:
                crawl_candidate: The candidate to hand to the crawler
            Returns:
                The article produced by the crawler
            Raises:
                UnicodeDecodeError, ValueError: If all parsers fail '''
        configured_parser = self.config.parser_class
        fallbacks = [parser for parser in self.config.available_parsers
                     if parser != configured_parser]
        while True:
            try:
                crawler = Crawler(self.config, self.fetcher)
                return crawler.crawl(crawl_candidate)
            except (UnicodeDecodeError, ValueError):
                if not fallbacks:
                    self.config.parser_class = configured_parser
                    raise
                # consume the fallback so no parser is ever retried
                self.config.parser_class = fallbacks.pop(0)