Пример #1
0
    def extract(cls, html, html_formated):
        """Collect candidate titles for a page and ask goose for its guess.

        Candidates gathered: the <title> tag (truncated at the first known
        split character, e.g. "Headline | Site"), then every <h1>/<h2>
        heading with non-empty text, then goose's own title extraction.

        :param html: raw HTML string of the page.
        :param html_formated: pre-parsed document handed to goose as its doc.
        """
        potential_titles = []
        soup = BeautifulSoup(html, 'html.parser')

        if soup.title:
            page_title = TitleExtractor.extract_text(soup.title)

            # keep only the part before the first separator character
            for split_char in TitleExtractor.SPLIT_CHARS:
                if split_char in page_title:
                    page_title = page_title.split(split_char)[0].strip()

            potential_titles.append(page_title)

        for heading_tag in (soup.find_all('h1') + soup.find_all('h2')):
            potential_title = TitleExtractor.extract_text(heading_tag)
            if potential_title:
                potential_titles.append(potential_title)

        # Extract article from goose
        article = Article()
        article.raw_html = html
        article.raw_doc = html_formated
        article.doc = article.raw_doc
        try:
            goose_title = TitleExtractorGoose(Configuration(),
                                              article).get_title()
        except AttributeError:
            # fix: "except AttributeError, e" is Python-2-only syntax and the
            # bound exception was never used; a bare handler works on 2 and 3.
            goose_title = None
        # NOTE(review): snippet appears truncated here -- no return visible.
Пример #2
0
    def extract(cls, html, html_formated):
        """Collect candidate titles for a page and ask goose for its guess.

        Candidates gathered: the <title> tag (truncated at the first known
        split character, e.g. "Headline | Site"), then every <h1>/<h2>
        heading with non-empty text, then goose's own title extraction.

        :param html: raw HTML string of the page.
        :param html_formated: pre-parsed document handed to goose as its doc.
        """
        potential_titles = []
        soup = BeautifulSoup(html, 'html.parser')

        if soup.title:
            page_title = TitleExtractor.extract_text(soup.title)

            # keep only the part before the first separator character
            for split_char in TitleExtractor.SPLIT_CHARS:
                if split_char in page_title:
                    page_title = page_title.split(split_char)[0].strip()

            potential_titles.append(page_title)

        for heading_tag in (soup.find_all('h1') + soup.find_all('h2')):
            potential_title = TitleExtractor.extract_text(heading_tag)
            if potential_title:
                potential_titles.append(potential_title)

        # Extract article from goose
        article = Article()
        article.raw_html = html
        article.raw_doc = html_formated
        article.doc = article.raw_doc
        try:
            goose_title = TitleExtractorGoose(Configuration(), article).get_title()
        except AttributeError:
            # fix: "except AttributeError, e" is Python-2-only syntax and the
            # bound exception was never used; a bare handler works on 2 and 3.
            goose_title = None
        # NOTE(review): snippet appears truncated here -- no return visible.
Пример #3
0
    def __init__(self, config):
        """Build the crawler and wire up its helper components from *config*."""
        self.config = config
        self.parser = self.config.get_parser()

        # the article object this crawler will populate
        self.article = Article()

        # content pipeline: extraction, DOM cleaning, output formatting
        self.extractor = self.get_extractor()
        self.cleaner = self.get_cleaner()
        self.formatter = self.get_formatter()

        # media extractors
        self.video_extractor = self.get_video_extractor()
        self.image_extractor = self.get_image_extractor()

        # network fetcher
        self.htmlfetcher = HtmlFetcher(self.config)

        # TODO: proper logging prefix handling
        self.logPrefix = "crawler:"
Пример #4
0
    def _goose_cleaned_text(cls, html, page_html):
        """Run goose's clean / extract / format pipeline over a page.

        Builds a goose ``Article`` from the raw HTML plus the already parsed
        *page_html* document, cleans the DOM, picks the best content node and,
        when one is found, renders ``article.cleaned_text``.

        :param html: raw HTML string of the page.
        :param page_html: pre-parsed document used as the article's doc.
        """
        article = Article()
        article.raw_html = html
        article.raw_doc = page_html
        article.doc = article.raw_doc

        goose_extractor = ContentExtractor(Configuration(), article)
        goose_cleaner = DocumentCleaner(Configuration(), article)
        goose_formatter = OutputFormatter(Configuration(), article)
        # goose_image_extractor = ImageExtractor(Configuration(), article) use

        try:
            article.doc = goose_cleaner.clean()
            article.top_node = goose_extractor.calculate_best_node()
            if article.top_node is not None:
                article.top_node = goose_extractor.post_cleanup()
                article.cleaned_text = goose_formatter.get_formatted_text()
        except UnicodeDecodeError:
            # fix: "except UnicodeDecodeError, e" is Python-2-only syntax and
            # the binding was unused; badly encoded pages just yield no node.
            article.top_node = None
    def __init__(self, config):
        """Build the crawler and wire up all extractors from *config*."""
        self.config = config
        self.parser = self.config.get_parser()

        # the article object this crawler will populate
        self.article = Article()

        # content pipeline: extraction, DOM cleaning, output formatting
        self.extractor = self.get_extractor()
        self.cleaner = self.get_cleaner()
        self.formatter = self.get_formatter()

        # metadata extractors
        self.metas_extractor = self.get_metas_extractor()
        self.publishdate_extractor = self.get_publishdate_extractor()
        self.opengraph_extractor = self.get_opengraph_extractor()
        self.tags_extractor = self.get_tags_extractor()
        self.authors_extractor = self.get_authors_extractor()
        self.tweets_extractor = self.get_tweets_extractor()
        self.links_extractor = self.get_links_extractor()
        self.title_extractor = self.get_title_extractor()

        # media extractors
        self.video_extractor = self.get_video_extractor()
        self.image_extractor = self.get_image_extractor()

        # network fetcher
        self.htmlfetcher = HtmlFetcher(self.config)

        # TODO: proper logging prefix handling
        self.logPrefix = "crawler:"
Пример #6
0
    def _goose_cleaned_text(cls, html, page_html):
        """Run goose's clean / extract / format pipeline over a page.

        Builds a goose ``Article`` from the raw HTML plus the already parsed
        *page_html* document, cleans the DOM, picks the best content node and,
        when one is found, renders ``article.cleaned_text``.

        :param html: raw HTML string of the page.
        :param page_html: pre-parsed document used as the article's doc.
        """
        article = Article()
        article.raw_html = html
        article.raw_doc = page_html
        article.doc = article.raw_doc

        goose_extractor = ContentExtractor(Configuration(), article)
        goose_cleaner = DocumentCleaner(Configuration(), article)
        goose_formatter = OutputFormatter(Configuration(), article)
        # goose_image_extractor = ImageExtractor(Configuration(), article) use

        try:
            article.doc = goose_cleaner.clean()
            article.top_node = goose_extractor.calculate_best_node()
            if article.top_node is not None:
                article.top_node = goose_extractor.post_cleanup()
                article.cleaned_text = goose_formatter.get_formatted_text()
        except UnicodeDecodeError:
            # fix: "except UnicodeDecodeError, e" is Python-2-only syntax and
            # the binding was unused; badly encoded pages just yield no node.
            article.top_node = None
Пример #7
0
    def crawl(self, crawl_candidate):
        """Fetch *crawl_candidate*, extract metadata, body text and media.

        Returns a populated ``Article``; if the page could not be fetched the
        article is returned empty.
        """
        article = Article()

        candidate = self.get_parse_candidate(crawl_candidate)
        raw_html = self.get_html(crawl_candidate, candidate)
        if raw_html is None:
            # nothing fetched -- hand back the empty article
            return article

        doc = self.get_document(raw_html)

        content_extractor = self.get_extractor()
        cleaner = self.get_document_cleaner()
        formatter = self.get_output_formatter()

        # basic bookkeeping
        article.final_url = candidate.url
        article.link_hash = candidate.link_hash
        article.raw_html = raw_html
        article.doc = doc
        article.raw_doc = deepcopy(doc)

        # metadata pulled from the raw document
        article.title = content_extractor.get_title(article)
        # TODO
        # article.publish_date = config.publishDateExtractor.extract(doc)
        # article.additional_data = config.get_additionaldata_extractor.extract(doc)
        article.meta_lang = content_extractor.get_meta_lang(article)
        article.meta_favicon = content_extractor.get_favicon(article)
        article.meta_description = content_extractor.get_meta_description(article)
        article.meta_keywords = content_extractor.get_meta_keywords(article)
        article.canonical_link = content_extractor.get_canonical_link(article)
        article.domain = content_extractor.get_domain(article.final_url)
        article.tags = content_extractor.extract_tags(article)

        # clean the DOM before any body-text computation
        article.doc = cleaner.clean(article)

        # locate the main content node, then derive media and text from it
        article.top_node = content_extractor.calculate_best_node(article)
        if article.top_node is not None:
            self.get_video_extractor(article).get_videos()
            if self.config.enable_image_fetching:
                image_extractor = self.get_image_extractor(article)
                article.top_image = image_extractor.get_best_image(
                    article.raw_doc, article.top_node)
            article.top_node = content_extractor.post_cleanup(article.top_node)
            article.cleaned_text = formatter.get_formatted_text(article)

        # release temporary resources (upstream method name is spelled
        # "relase_resources")
        self.relase_resources(article)

        # attach extra video information when available
        video_info = self.get_video_info_extractor(article).get_video_info()
        if video_info:
            article.additional_data['video_info'] = video_info

        return article
Пример #8
0
 def test_instance(self):
     """A freshly constructed object is an ``Article`` instance."""
     a = Article()
     # assertIsInstance is the idiomatic check and yields a clearer
     # failure message than assertEqual(isinstance(...), True)
     self.assertIsInstance(a, Article)
Пример #9
0
    def crawl(self, crawl_candidate):
        """Fetch, decode and extract an article from *crawl_candidate*.

        Adds charset handling on top of the usual goose pipeline: the raw
        bytes are decoded with the detected charset, Chinese stopwords are
        enabled when a GB* charset or CJK characters are seen, and the HTML
        is tag-stripped and charset-normalised before parsing.
        """
        article = Article()

        parse_candidate = self.get_parse_candidate(crawl_candidate)
        raw_html = self.get_html(crawl_candidate, parse_candidate)

        if raw_html is None:
            # fetch failed -- return the empty article
            return article
        # decode using the detected charset; undecodable bytes are dropped
        # ('ignore') rather than raising
        charset = get_charset(raw_html)
        raw_html = raw_html.decode(charset, 'ignore')
        # CJK unified ideographs range: any such character (or a GB*
        # charset) switches the config to the Chinese stopword list
        pattern = re.compile("[\u4e00-\u9fa5]")
        if 'GB2312 GBK GB18030'.find(charset.upper()) != -1 \
                or pattern.search(raw_html) is not None:
            self.config.stopwords_class = StopWordsChinese
            print("中文")
        # strip ad/script/style blocks before parsing
        raw_html = clean_tags(raw_html, ['SOHUADCODE', 'script', 'style'])
        if charset != 'utf-8':
            # NOTE(review): presumably rewrites the <meta charset> to match
            # the decoded text -- confirm against replace_meta_charset
            raw_html = replace_meta_charset(raw_html)
        raw_html = force_meta(raw_html)
        doc = self.get_document(parse_candidate.url, raw_html)
        extractor = self.get_extractor()
        document_cleaner = self.get_document_cleaner()
        output_formatter = self.get_output_formatter()

        # article
        article.final_url = parse_candidate.url
        article.link_hash = parse_candidate.link_hash
        article.raw_html = raw_html
        article.doc = doc
        article.raw_doc = deepcopy(doc)
        article.title = extractor.get_title(article)
        # TODO
        # article.publish_date = config.publishDateExtractor.extract(doc)
        # article.additional_data = config.get_additionaldata_extractor.extract(doc)
        article.meta_lang = extractor.get_meta_lang(article)
        article.meta_favicon = extractor.get_favicon(article)
        article.meta_description = extractor.get_meta_description(article)
        article.meta_keywords = extractor.get_meta_keywords(article)
        article.canonical_link = extractor.get_canonical_link(article)
        article.domain = extractor.get_domain(article.final_url)
        article.tags = extractor.extract_tags(article)
        # before we do any calcs on the body itself let's clean up the document
        article.doc = document_cleaner.clean(article)
        # import lxml.html
        # lxml.html.open_in_browser(article.doc)
        # big stuff
        article.top_node = extractor.calculate_best_node(article)
        if article.top_node is None:
            # fall back to the whole document when no best node was found
            article.top_node = doc
        if article.top_node is not None:
            # video handeling
            video_extractor = self.get_video_extractor(article)
            video_extractor.get_videos()
            # image handeling
            if self.config.enable_image_fetching:
                image_extractor = self.get_image_extractor(article)
                article.top_image = image_extractor.get_best_image(
                    article.raw_doc, article.top_node)
            # post cleanup
            # article.top_node = extractor.post_cleanup(article.top_node)
            # clean_text
            article.cleaned_text = output_formatter.get_formatted_text(article)
            # import lxml.html
            # lxml.html.open_in_browser(article.top_node)
            # article.cleaned_text = self.parser.nodeToString(article.top_node)
            # synthesise a description from the first 150 chars of the text
            if article.meta_description is None:
                article.meta_description = text_content(
                    article.cleaned_text)[:150]
        # cleanup tmp file -- method name is spelled "relase_resources" (sic)
        self.relase_resources(article)

        return article
Пример #10
0
    def crawl(self, crawl_candidate):
        """Fetch *crawl_candidate*, extract metadata, body text and media.

        Returns a populated ``Article``; if the page could not be fetched the
        article is returned empty.
        """
        article = Article()

        candidate = self.get_parse_candidate(crawl_candidate)
        raw_html = self.get_html(crawl_candidate, candidate)
        if raw_html is None:
            # nothing fetched -- hand back the empty article
            return article

        doc = self.get_document(raw_html)

        content_extractor = self.get_extractor()
        cleaner = self.get_document_cleaner()
        formatter = self.get_output_formatter()

        # basic bookkeeping
        article.final_url = candidate.url
        article.link_hash = candidate.link_hash
        article.raw_html = raw_html
        article.doc = doc
        article.raw_doc = deepcopy(doc)

        # metadata pulled from the raw document
        article.title = content_extractor.get_title(article)
        # TODO
        # article.publish_date = config.publishDateExtractor.extract(doc)
        # article.additional_data = config.get_additionaldata_extractor.extract(doc)
        article.meta_lang = content_extractor.get_meta_lang(article)
        article.meta_favicon = content_extractor.get_favicon(article)
        article.meta_description = content_extractor.get_meta_description(article)
        article.meta_keywords = content_extractor.get_meta_keywords(article)
        article.canonical_link = content_extractor.get_canonical_link(article)
        article.domain = content_extractor.get_domain(article.final_url)
        article.tags = content_extractor.extract_tags(article)

        # clean the DOM before any body-text computation
        article.doc = cleaner.clean(article)

        # locate the main content node, then derive media and text from it
        article.top_node = content_extractor.calculate_best_node(article)
        if article.top_node is not None:
            self.get_video_extractor(article).get_videos()
            if self.config.enable_image_fetching:
                image_extractor = self.get_image_extractor(article)
                article.top_image = image_extractor.get_best_image(
                    article.raw_doc, article.top_node)
            article.top_node = content_extractor.post_cleanup(article.top_node)
            article.cleaned_text = formatter.get_formatted_text(article)

        # release temporary resources
        self.release_resources(article)

        return article
Пример #11
0
    def crawl(self, crawl_candidate):
        """Fetch, decode and extract an article from *crawl_candidate*.

        Adds charset handling on top of the usual goose pipeline: the raw
        bytes are decoded with the detected charset, Chinese stopwords are
        enabled when a GB* charset or CJK characters are seen, and the HTML
        is tag-stripped and charset-normalised before parsing.
        """
        article = Article()

        parse_candidate = self.get_parse_candidate(crawl_candidate)
        raw_html = self.get_html(crawl_candidate, parse_candidate)

        if raw_html is None:
            # fetch failed -- return the empty article
            return article
        # decode using the detected charset; undecodable bytes are dropped
        # ('ignore') rather than raising
        charset = get_charset(raw_html)
        raw_html = raw_html.decode(charset, 'ignore')
        # CJK unified ideographs range: any such character (or a GB*
        # charset) switches the config to the Chinese stopword list
        pattern = re.compile("[\u4e00-\u9fa5]")
        if 'GB2312 GBK GB18030'.find(charset.upper()) != -1 \
                or pattern.search(raw_html) is not None:
            self.config.stopwords_class = StopWordsChinese
            print("中文")
        # strip ad/script/style blocks before parsing
        raw_html = clean_tags(raw_html, ['SOHUADCODE', 'script', 'style'])
        if charset != 'utf-8':
            # NOTE(review): presumably rewrites the <meta charset> to match
            # the decoded text -- confirm against replace_meta_charset
            raw_html = replace_meta_charset(raw_html)
        raw_html = force_meta(raw_html)
        doc = self.get_document(parse_candidate.url, raw_html)
        extractor = self.get_extractor()
        document_cleaner = self.get_document_cleaner()
        output_formatter = self.get_output_formatter()

        # article
        article.final_url = parse_candidate.url
        article.link_hash = parse_candidate.link_hash
        article.raw_html = raw_html
        article.doc = doc
        article.raw_doc = deepcopy(doc)
        article.title = extractor.get_title(article)
        # TODO
        # article.publish_date = config.publishDateExtractor.extract(doc)
        # article.additional_data = config.get_additionaldata_extractor.extract(doc)
        article.meta_lang = extractor.get_meta_lang(article)
        article.meta_favicon = extractor.get_favicon(article)
        article.meta_description = extractor.get_meta_description(article)
        article.meta_keywords = extractor.get_meta_keywords(article)
        article.canonical_link = extractor.get_canonical_link(article)
        article.domain = extractor.get_domain(article.final_url)
        article.tags = extractor.extract_tags(article)
        # before we do any calcs on the body itself let's clean up the document
        article.doc = document_cleaner.clean(article)
        # import lxml.html
        # lxml.html.open_in_browser(article.doc)
        # big stuff
        article.top_node = extractor.calculate_best_node(article)
        if article.top_node is None:
            # fall back to the whole document when no best node was found
            article.top_node = doc
        if article.top_node is not None:
            # video handeling
            video_extractor = self.get_video_extractor(article)
            video_extractor.get_videos()
            # image handeling
            if self.config.enable_image_fetching:
                image_extractor = self.get_image_extractor(article)
                article.top_image = image_extractor.get_best_image(
                    article.raw_doc, article.top_node)
            # post cleanup
            # article.top_node = extractor.post_cleanup(article.top_node)
            # clean_text
            article.cleaned_text = output_formatter.get_formatted_text(article)
            # import lxml.html
            # lxml.html.open_in_browser(article.top_node)
            # article.cleaned_text = self.parser.nodeToString(article.top_node)
            # synthesise a description from the first 150 chars of the text
            if article.meta_description is None:
                article.meta_description = text_content(
                    article.cleaned_text)[:150]
        # cleanup tmp file -- method name is spelled "relase_resources" (sic)
        self.relase_resources(article)

        return article