Example #1
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()

        # catalogue
        self.catalogue = Catalogue()

        # metas extractor
        self.metas_extractor = self.get_metas_extractor()

        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)

        # TODO: use the log prefix
        self.log_prefix = "urlcrawler: "

        self.extractor = self.get_extractor()
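
The isinstance check above lets several crawler instances share one NetworkFetcher (and its underlying connection) while still falling back to a private fetcher when nothing usable is passed in. A minimal sketch of that reuse pattern, assuming `config` is a ready Configuration instance and using the class names from the examples:

# Sketch only: `config` is assumed to be a ready Configuration instance.
shared_fetcher = NetworkFetcher(config)
catalogue_crawler = CatalogueCrawler(config, fetcher=shared_fetcher)
article_crawler = Crawler(config, fetcher=shared_fetcher)
# Anything that is not a NetworkFetcher (including the default None) makes each
# crawler build its own NetworkFetcher(config) instead.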
Example #2
    def __init__(self, config=None):
        # Use the passed in configuration if it is of the right type, otherwise
        # use the default as a base
        if isinstance(config, Configuration):
            self.config = config
        else:
            self.config = Configuration()

        # if config was passed in as a dict, copy its values onto the stored configuration
        if isinstance(config, dict):
            for k, v in list(config.items()):
                if hasattr(self.config, k):
                    setattr(self.config, k, v)

        # setup a single network connection
        self.fetcher = NetworkFetcher(self.config)
        self.finalizer = weakref.finalize(self, self.close)

        # we don't need to go further if local_storage_path or image fetching is not set
        if not self.config.local_storage_path or not self.config.enable_image_fetching:
            return

        # create config.local_storage_path if it is not already a directory
        if not os.path.isdir(self.config.local_storage_path):
            os.makedirs(self.config.local_storage_path)

        if not os.path.isdir(self.config.local_storage_path):
            msg = (
                '{} directory does not seem to exist, you need to set this for '
                'image processing downloads').format(
                    self.config.local_storage_path)
            raise Exception(msg)

        # write a dummy file to the directory to check that the directory is writable
        level, path = mkstemp(dir=self.config.local_storage_path)
        try:
            with os.fdopen(level, "w"):
                pass
            os.remove(path)
        except IOError:
            msg = (
                '{} directory is not writable, you need to set this for image '
                'processing downloads').format(self.config.local_storage_path)
            raise Exception(msg)
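
The constructor accepts either a ready Configuration instance or a plain dict whose keys are copied onto a fresh Configuration via setattr(); anything else falls back to the defaults. A minimal sketch of the three call styles, with the attribute name taken from the checks in the example:

# Sketch only: enable_image_fetching is an attribute the example itself reads
# from the configuration.
g_default = Goose()                                   # plain defaults
g_object = Goose(Configuration())                     # explicit Configuration
g_dict = Goose({'enable_image_fetching': False})      # dict copied key by key
# Dict keys that Configuration does not already have are silently ignored
# because of the hasattr() guard.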
Example #3
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()

        # article
        self.article = Article()

        # init the extractor
        self.extractor = self.get_extractor()

        # init the document cleaner
        self.cleaner = self.get_cleaner()

        # init the output formatter
        self.formatter = self.get_formatter()

        # metas extractor
        self.metas_extractor = self.get_metas_extractor()

        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()

        # reportage news article extractor
        self.reportagenewsarticle_extractor = (
            self.get_reportagenewsarticle_extractor())

        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()

        # tags extractor
        self.tags_extractor = self.get_tags_extractor()

        # authors extractor
        self.authors_extractor = self.get_authors_extractor()

        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()

        # links extractor
        self.links_extractor = self.get_links_extractor()

        # video extractor
        self.video_extractor = self.get_video_extractor()

        # title extractor
        self.title_extractor = self.get_title_extractor()

        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)

        # attach extractor
        self.attach_extractor = self.get_attach_extractor()

        # image extractor
        self.image_extractor = self.get_image_extractor()

        self.custom_extractor = self.get_custom_extractor()

        # TODO: use the log prefix
        self.log_prefix = "crawler: "
Example #4
class Crawler(object):
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()

        # article
        self.article = Article()

        # init the extractor
        self.extractor = self.get_extractor()

        # init the document cleaner
        self.cleaner = self.get_cleaner()

        # init the output formatter
        self.formatter = self.get_formatter()

        # metas extractor
        self.metas_extractor = self.get_metas_extractor()

        # opengraph extractor
        self.opengraph_extractor = self.get_opengraph_extractor()

        # reportage news article extractor
        self.reportagenewsarticle_extractor = (
            self.get_reportagenewsarticle_extractor())

        # publishdate extractor
        self.publishdate_extractor = self.get_publishdate_extractor()

        # tags extractor
        self.tags_extractor = self.get_tags_extractor()

        # authors extractor
        self.authors_extractor = self.get_authors_extractor()

        # tweets extractor
        self.tweets_extractor = self.get_tweets_extractor()

        # links extractor
        self.links_extractor = self.get_links_extractor()

        # video extractor
        self.video_extractor = self.get_video_extractor()

        # title extractor
        self.title_extractor = self.get_title_extractor()

        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)

        # attach extractor
        self.attach_extractor = self.get_attach_extractor()

        # image extractor
        self.image_extractor = self.get_image_extractor()

        self.custom_extractor = self.get_custom_extractor()

        # TODO: use the log prefix
        self.log_prefix = "crawler: "

    def crawl(self, crawl_candidate):

        # parse candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)

        # raw html
        raw_html = self.get_html(crawl_candidate, parse_candidate)

        if raw_html is None:
            return self.article

        return self.process(raw_html, parse_candidate.url,
                            parse_candidate.link_hash,
                            parse_candidate.encoding)

    def process(self, raw_html, final_url, link_hash, encoding=None):

        # create document
        doc = self.get_document(raw_html, encoding)

        # article
        self.article._final_url = final_url or self.config.final_url
        self.article._link_hash = link_hash
        self.article._raw_html = raw_html
        self.article._doc = doc
        self.article._raw_doc = deepcopy(doc)

        # open graph
        self.article._opengraph = self.opengraph_extractor.extract()

        # schema (ReportageNewsArticle) https://pending.schema.org/ReportageNewsArticle
        self.article._schema = self.reportagenewsarticle_extractor.extract()

        if not self.article._final_url:
            if "url" in self.article.opengraph:
                self.article._final_url = self.article.opengraph["url"]
            elif self.article.schema and "url" in self.article.schema:
                self.article._final_url = self.article.schema["url"]

        # meta
        metas = self.metas_extractor.extract()
        # print(metas)
        self.article._meta_lang = metas['lang']
        self.article._meta_favicon = metas['favicon']
        self.article._meta_description = metas['description']
        self.article._meta_keywords = metas['keywords']
        self.article._meta_encoding = metas['encoding']
        self.article._canonical_link = metas['canonical']
        self.article._domain = metas['domain']

        # tags
        self.article._tags = self.tags_extractor.extract()

        # authors
        self.article._authors = self.authors_extractor.extract()

        # title
        self.article._title = self.title_extractor.extract()

        self.article._attaches = self.attach_extractor.extract()

        for k in self.config.custom_rule:
            if k not in ('title', 'author', 'pubtime', 'content', 'attaches'):
                self.article.add_additional_data(
                    k, self.custom_extractor.extract(k))

        # check for a known node to use as the content body
        # if we find one, force article.doc to be the found node
        # this keeps the cleaner from removing unwanted text content
        article_body = self.extractor.get_known_article_tags()
        if article_body is not None:
            doc = article_body

        # before we do any calcs on the body itself let's clean up the document
        if not isinstance(doc, list):
            doc = [self.cleaner.clean(doc)]
        else:
            doc = [self.cleaner.clean(deepcopy(x)) for x in doc]

        # big stuff
        self.article._top_node = self.extractor.custom_top_node(doc)

        if self.article._top_node is not None:
            self.article._doc = doc

            # publishdate
            self.article._publish_date = self.publishdate_extractor.extract()

            # article links
            self.article._links = self.links_extractor.extract()

            # tweets
            self.article._tweets = self.tweets_extractor.extract()

            # video handling
            self.article._movies = self.video_extractor.get_videos()

            # image handling
            if self.config.enable_image_fetching:
                self.get_image()

            self.article._top_node_html = self.parser.outerHtml(
                self.article._top_node)

            # clean_text
            self.article._cleaned_text = self.formatter.get_formatted_text()
        else:
            self.article._top_node = self.extractor.calculate_best_node(
                self.article._doc)

            # publishdate
            self.article._publish_date = self.publishdate_extractor.extract()

            # if we have a top node
            # let's process it
            if self.article._top_node is not None:

                # article links
                self.article._links = self.links_extractor.extract()

                # tweets
                self.article._tweets = self.tweets_extractor.extract()

                # video handling
                self.article._movies = self.video_extractor.get_videos()

                # image handling
                if self.config.enable_image_fetching:
                    self.get_image()

                # post cleanup
                self.article._top_node = self.extractor.post_cleanup()

                self.article._top_node_html = self.parser.outerHtml(
                    self.article._top_node)

                # clean_text
                self.article._cleaned_text = self.formatter.get_formatted_text()

        if not self.article._cleaned_text:
            self.article._cleaned_text = self.extractor.extract()

        # cleanup tmp file
        self.release_resources()

        # return the article
        return self.article

    @staticmethod
    def get_parse_candidate(crawl_candidate):
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url,
                                                   crawl_candidate.raw_html,
                                                   crawl_candidate.encoding)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def get_image(self):
        doc = self.article.raw_doc
        top_node = self.article.top_node
        self.article._top_image = self.image_extractor.get_best_image(
            doc, top_node)

    def get_html(self, crawl_candidate, parsing_candidate):
        # we already got raw_html
        # no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html

        # fetch HTML
        html = self.fetcher.fetch(parsing_candidate.url)
        return html

    def get_metas_extractor(self):
        return MetasExtractor(self.config, self.article)

    def get_publishdate_extractor(self):
        return PublishDateExtractor(self.config, self.article)

    def get_opengraph_extractor(self):
        return OpenGraphExtractor(self.config, self.article)

    def get_reportagenewsarticle_extractor(self):
        return ReportageNewsArticleExtractor(self.config, self.article)

    def get_tags_extractor(self):
        return TagsExtractor(self.config, self.article)

    def get_authors_extractor(self):
        return AuthorsExtractor(self.config, self.article)

    def get_attach_extractor(self):
        return AttachExtractor(self.fetcher, self.config, self.article)

    def get_tweets_extractor(self):
        return TweetsExtractor(self.config, self.article)

    def get_links_extractor(self):
        return LinksExtractor(self.config, self.article)

    def get_title_extractor(self):
        return TitleExtractor(self.config, self.article)

    def get_image_extractor(self):
        return ImageExtractor(self.fetcher, self.config, self.article)

    def get_video_extractor(self):
        return VideoExtractor(self.config, self.article)

    def get_formatter(self):
        return StandardOutputFormatter(self.config, self.article)

    def get_cleaner(self):
        return StandardDocumentCleaner(self.config, self.article)

    def get_document(self, raw_html, encoding=None):
        doc = self.parser.fromstring(raw_html, encoding)
        return doc

    def get_extractor(self):
        return StandardContentExtractor(self.config, self.article)

    def get_custom_extractor(self):
        return CustomExtractor(self.config, self.article)

    def release_resources(self):
        if not self.config.local_storage_path:
            return
        path = os.path.join(self.config.local_storage_path,
                            '%s_*' % self.article.link_hash)
        for fname in glob.glob(path):
            try:
                os.remove(fname)
            except OSError:
                # TODO: better log handling
                pass
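
crawl() expects a CrawlCandidate carrying the configuration, URL, optional raw HTML and encoding; Goose.__crawl() in Example #7 below builds it exactly that way before delegating to this class. A minimal stand-alone sketch under that assumption:

# Sketch only: the CrawlCandidate signature is taken from Example #7 below.
config = Configuration()
fetcher = NetworkFetcher(config)
candidate = CrawlCandidate(config, 'https://example.com/article.html', None, None)
article = Crawler(config, fetcher).crawl(candidate)
# crawl() returns the (possibly empty) Article even when fetching the HTML fails.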
Example #5
class CustomCrawler(object):
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()

        # catalogue
        self.catalogue = Catalogue()

        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)

        # TODO: use the log prefix
        self.log_prefix = "urlcrawler: "

        self.extractor = self.get_extractor()

    def crawl(self, crawl_candidate):

        # parse candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)

        # raw html
        raw_html = self.get_html(crawl_candidate, parse_candidate)

        if raw_html is None:
            return self.catalogue

        return self.process(raw_html, parse_candidate.url,
                            parse_candidate.link_hash,
                            parse_candidate.encoding)

    def get_document(self, raw_html, encoding=None):
        doc = self.parser.fromstring(raw_html, encoding)
        return doc

    def get_html(self, crawl_candidate, parsing_candidate):
        # we already got raw_html
        # no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html

        # fetch HTML
        html = self.fetcher.fetch(parsing_candidate.url)
        return html

    @staticmethod
    def get_parse_candidate(crawl_candidate):
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url,
                                                   crawl_candidate.raw_html,
                                                   crawl_candidate.encoding)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def get_extractor(self):
        return CustomExtractor(self.config, self.catalogue)

    def process(self, raw_html, final_url, link_hash, encoding=None):

        # create document
        doc = self.get_document(raw_html, encoding)

        # catalogue
        self.catalogue._final_url = final_url or self.config.final_url
        self.catalogue._link_hash = link_hash
        self.catalogue._raw_html = raw_html
        self.catalogue._doc = doc
        self.catalogue._raw_doc = deepcopy(doc)

        custom_rule = self.config.custom_rule
        if custom_rule:
            data = {}
            onlyOne = custom_rule.pop('onlyOne', 1)
            if 'item' in custom_rule and custom_rule['item']:
                if 'filter' in custom_rule and custom_rule['filter']:
                    doc = self.extractor.custom_match_elements(
                        custom_rule['filter'], doc=doc)
                self.catalogue._doc = doc
                for key, rule in custom_rule['item'].items():
                    parsed = self.extractor.extract(key, rule, onlyOne)
                    parsed = utils.patch_result(parsed, rule)
                    parsed = utils.extract_result(parsed, rule)
                    data[key] = parsed if isinstance(parsed, list) else [parsed]
                self.catalogue.data = utils.table2kvlist(data)
            else:
                for key, rule in custom_rule.items():
                    parsed = self.extractor.extract(key, rule)
                    parsed = utils.patch_result(parsed, rule)
                    parsed = utils.extract_result(parsed, rule)
                    data[key] = parsed
                self.catalogue.data = [data]
        return self.catalogue
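
process() drives the extraction from config.custom_rule: an optional 'filter' rule narrows the document, the 'item' mapping produces one column of results per key, and 'onlyOne' controls whether a single match is taken per rule. The concrete rule syntax is not visible in this example, so the values below are hypothetical placeholders; only the dict shape comes from the code above:

# Hypothetical rule dict: the keys ('onlyOne', 'filter', 'item') come from the
# example, the XPath-like values are placeholders, not the real rule syntax.
config.custom_rule = {
    'onlyOne': 0,
    'filter': '//ul[@class="news-list"]/li',
    'item': {
        'title': './/a/text()',
        'url': './/a/@href',
    },
}
catalogue = CustomCrawler(config).crawl(candidate)   # candidate as in the sketch above
print(catalogue.data)   # rows assembled by utils.table2kvlist()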
Example #6
class CatalogueCrawler(object):
    def __init__(self, config, fetcher=None):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()

        # catalogue
        self.catalogue = Catalogue()

        # metas extractor
        self.metas_extractor = self.get_metas_extractor()

        # html fetcher
        if isinstance(fetcher, NetworkFetcher):
            self.fetcher = fetcher
        else:
            self.fetcher = NetworkFetcher(self.config)

        # TODO: use the log prefix
        self.log_prefix = "urlcrawler: "

        self.extractor = self.get_extractor()

    def crawl(self, crawl_candidate):

        # parse candidate
        parse_candidate = self.get_parse_candidate(crawl_candidate)

        # raw html
        raw_html = self.get_html(crawl_candidate, parse_candidate)

        if raw_html is None:
            return self.catalogue

        return self.process(raw_html, parse_candidate.url,
                            parse_candidate.link_hash,
                            parse_candidate.encoding)

    def get_document(self, raw_html, encoding=None):
        doc = self.parser.fromstring(raw_html, encoding)
        return doc

    def get_html(self, crawl_candidate, parsing_candidate):
        # we already got raw_html
        # no need to fetch remote content
        if crawl_candidate.raw_html:
            return crawl_candidate.raw_html

        # fetch HTML
        html = self.fetcher.fetch(parsing_candidate.url)
        return html

    @staticmethod
    def get_parse_candidate(crawl_candidate):
        if crawl_candidate.raw_html:
            return RawHelper.get_parsing_candidate(crawl_candidate.url,
                                                   crawl_candidate.raw_html,
                                                   crawl_candidate.encoding)
        return URLHelper.get_parsing_candidate(crawl_candidate.url)

    def get_metas_extractor(self):
        return MetasExtractor(self.config, self.catalogue)

    def get_extractor(self):
        return CatalogueExtractor(self.config, self.catalogue)

    def process(self, raw_html, final_url, link_hash, encoding=None):

        # create document
        doc = self.get_document(raw_html, encoding)

        # catalogue
        self.catalogue._final_url = final_url or self.config.final_url
        self.catalogue._link_hash = link_hash
        self.catalogue._raw_html = raw_html
        self.catalogue._doc = doc
        self.catalogue._raw_doc = deepcopy(doc)

        metas = self.metas_extractor.extract()
        self.catalogue._meta_lang = metas['lang']
        self.catalogue._meta_favicon = metas['favicon']
        self.catalogue._meta_description = metas['description']
        self.catalogue._meta_keywords = metas['keywords']
        self.catalogue._canonical_link = metas['canonical']
        self.catalogue._domain = metas['domain']

        self.catalogue.data = self.extractor.extract()

        return self.catalogue
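
Compared with CustomCrawler, this class additionally runs the metas extractor and copies its result onto the Catalogue before the listing itself is extracted. A short sketch of what is available after a crawl, assuming Catalogue exposes the underscore fields as properties the way Article does in Example #4 (article.opengraph, article.raw_doc, and so on):

# Assumption: meta_lang and domain are read-only properties mirroring the
# _meta_lang / _domain fields set in process() above.
catalogue = CatalogueCrawler(config).crawl(candidate)
print(catalogue.meta_lang, catalogue.domain)
print(catalogue.data)   # whatever CatalogueExtractor.extract() returned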
Example #7
class Goose(object):
    ''' Extract the most likely article content and additional metadata from a URL
        or previously fetched HTML document

        Args:
            config (Configuration, dict): A Configuration object or a dict \
            representation of the configuration options
        Returns:
            Goose: An instance of the goose extraction object '''
    def __init__(self, config=None):
        # Use the passed in configuration if it is of the right type, otherwise
        # use the default as a base
        if isinstance(config, Configuration):
            self.config = config
        else:
            self.config = Configuration()

        # if config was passed in as a dict, copy its values onto the stored configuration
        if isinstance(config, dict):
            for k, v in list(config.items()):
                if hasattr(self.config, k):
                    setattr(self.config, k, v)

        # setup a single network connection
        self.fetcher = NetworkFetcher(self.config)
        self.finalizer = weakref.finalize(self, self.close)

        # we don't need to go further if local_storage_path or image fetching is not set
        if not self.config.local_storage_path or not self.config.enable_image_fetching:
            return

        # create config.local_storage_path if it is not already a directory
        if not os.path.isdir(self.config.local_storage_path):
            os.makedirs(self.config.local_storage_path)

        if not os.path.isdir(self.config.local_storage_path):
            msg = (
                '{} directory does not seem to exist, you need to set this for '
                'image processing downloads').format(
                    self.config.local_storage_path)
            raise Exception(msg)

        # write a dummy file to the directory to check that the directory is writable
        level, path = mkstemp(dir=self.config.local_storage_path)
        try:
            with os.fdopen(level, "w"):
                pass
            os.remove(path)
        except IOError:
            msg = (
                '{} directory is not writable, you need to set this for image '
                'processing downloads').format(self.config.local_storage_path)
            raise Exception(msg)

    def __enter__(self):
        ''' Setup the context manager '''
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        ''' Define what to do when the context manager exits '''
        self.close()

    def close(self):
        ''' Close the network connection and perform any other required cleanup

            Note:
                Auto closed when using goose as a context manager or when garbage collected '''
        if self.fetcher is not None:
            self.shutdown_network()
        self.finalizer.atexit = False  # turn off the garbage collection close

    def extract(self, url=None, raw_html=None, encoding=None):
        ''' Extract the most likely article content from the html page

            Args:
                url (str): URL to pull and parse
                raw_html (str): String representation of the HTML page
            Returns:
                Article: Representation of the article contents \
                including other parsed and extracted metadata '''
        crawl_candidate = CrawlCandidate(self.config, url, raw_html, encoding)
        return self.__crawl(crawl_candidate)

    def fetch(self, url=None, raw_html=None, encoding=None):
        crawl_candidate = CrawlCandidate(self.config, url, raw_html, encoding)
        return self.__fetch(crawl_candidate)

    def parse(self, url=None, raw_html=None, encoding=None):
        crawl_candidate = CrawlCandidate(self.config, url, raw_html, encoding)
        return self.__parse(crawl_candidate)

    def shutdown_network(self):
        ''' Close the network connection

            Note:
                Auto closed when using goose as a context manager or when garbage collected '''
        self.fetcher.close()
        self.fetcher = None

    def __crawl(self, crawl_candidate):
        ''' wrap the crawling functionality '''
        def crawler_wrapper(parser, parsers_lst, crawl_candidate):
            try:
                crawler = Crawler(self.config, self.fetcher)
                article = crawler.crawl(crawl_candidate)
            except (UnicodeDecodeError, ValueError) as ex:
                if parsers_lst:
                    parser = parsers_lst.pop(0)  # remove it also!
                    return crawler_wrapper(parser, parsers_lst,
                                           crawl_candidate)
                else:
                    raise ex
            return article

        # use the wrapper
        parsers = list(self.config.available_parsers)
        parsers.remove(self.config.parser_class)
        return crawler_wrapper(self.config.parser_class, parsers,
                               crawl_candidate)

    def __fetch(self, crawl_candidate):
        def crawler_wrapper(parser, parsers_lst, crawl_candidate):
            try:
                crawler = CatalogueCrawler(self.config, self.fetcher)
                article = crawler.crawl(crawl_candidate)
            except (UnicodeDecodeError, ValueError) as ex:
                if parsers_lst:
                    parser = parsers_lst.pop(0)  # remove it also!
                    return crawler_wrapper(parser, parsers_lst,
                                           crawl_candidate)
                else:
                    raise ex
            return article

        # use the wrapper
        parsers = list(self.config.available_parsers)
        parsers.remove(self.config.parser_class)
        return crawler_wrapper(self.config.parser_class, parsers,
                               crawl_candidate)

    def __parse(self, crawl_candidate):
        def crawler_wrapper(parser, parsers_lst, crawl_candidate):
            try:
                crawler = CustomCrawler(self.config, self.fetcher)
                article = crawler.crawl(crawl_candidate)
            except (UnicodeDecodeError, ValueError) as ex:
                if parsers_lst:
                    parser = parsers_lst.pop(0)  # remove it also!
                    return crawler_wrapper(parser, parsers_lst,
                                           crawl_candidate)
                else:
                    raise ex
            return article

        # use the wrapper
        parsers = list(self.config.available_parsers)
        parsers.remove(self.config.parser_class)
        return crawler_wrapper(self.config.parser_class, parsers,
                               crawl_candidate)
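
Because close() is wired to both __exit__ and the weakref finalizer, the natural way to use the class is as a context manager. A minimal end-to-end sketch, assuming Article exposes title and cleaned_text properties for the _title and _cleaned_text fields set in Example #4:

# Sketch only: the property names mirror the underscore fields assigned above.
with Goose({'enable_image_fetching': False}) as g:
    article = g.extract(url='https://example.com/some-article.html')
    print(article.title)
    print(article.cleaned_text[:200])
# The network connection is shut down automatically when the with-block exits.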