Пример #1
0
def clean(html_content):
    """Return the cleaned article text Goose extracts from an HTML string.

    :param html_content: raw HTML, handed to Goose as ``raw_html``.
    :return: the ``cleaned_text`` attribute of the extracted article.
    """
    # Image fetching is switched off before the extractor is built.
    settings = Configuration()
    settings.enable_image_fetching = False
    return Goose(config=settings).extract(raw_html=html_content).cleaned_text
Пример #2
0
 def getArticle(self, url, raw_html, language=None):
     """Extract a Goose article from ``raw_html``.

     :param url: accepted for the caller's convenience; not used here.
     :param raw_html: HTML source passed to ``Goose.extract``.
     :param language: when truthy, set as ``target_language`` and
         ``use_meta_language`` is turned off.
     :return: the article object produced by ``Goose.extract``.
     """
     settings = Configuration()
     settings.enable_image_fetching = False
     if language:
         settings.target_language = language
         settings.use_meta_language = False
     return Goose(config=settings).extract(raw_html=raw_html)
Пример #3
0
 def extend_config(self):
     """Replace a dict-valued ``self.config`` with a ``Configuration``.

     Only keys that name an existing attribute of a fresh
     ``Configuration`` are copied over; other keys are ignored.  When
     ``self.config`` is not a dict it is left untouched.
     """
     if not isinstance(self.config, dict):
         return
     settings = Configuration()
     for option, value in self.config.items():
         if not hasattr(settings, option):
             continue
         setattr(settings, option, value)
     self.config = settings
Пример #4
0
    def extract(cls, html, html_formated):
        """Gather candidate titles for a page.

        Candidates are collected from the ``<title>`` tag and from every
        ``<h1>`` and ``<h2>`` element; a further title is requested from
        Goose's title extractor.

        :param html: raw HTML; parsed here with BeautifulSoup's
            ``html.parser`` and stored as the Goose article's ``raw_html``.
        :param html_formated: value assigned to the Goose article's
            ``raw_doc`` and ``doc`` (presumably an already-parsed document;
            confirm against the caller).

        NOTE(review): the code visible here stops after ``goose_title`` is
        computed and returns nothing; any later use of ``potential_titles``
        and ``goose_title`` is outside this view.
        """

        potential_titles = []
        soup = BeautifulSoup(html, 'html.parser')

        if soup.title:
            page_title = TitleExtractor.extract_text(soup.title)

            # For every separator present, keep only the text before its
            # first occurrence; separators are applied one after another.
            for split_char in TitleExtractor.SPLIT_CHARS:
                if split_char in page_title:
                    page_title = page_title.split(split_char)[0].strip()

            potential_titles.append(page_title)

        # All <h1> headings first, then all <h2> headings; empty ones skipped.
        for heading_tag in (soup.find_all('h1') + soup.find_all('h2')):
            potential_title = TitleExtractor.extract_text(heading_tag)
            if potential_title:
                potential_titles.append(potential_title)

        # Extract article from goose
        article = Article()
        article.raw_html = html
        article.raw_doc = html_formated
        article.doc = article.raw_doc
        try:
            goose_title = TitleExtractorGoose(Configuration(),
                                              article).get_title()
        except AttributeError:
            # Best effort: carry on without a Goose title.  The exception
            # object was never used, so it is no longer bound (this form is
            # valid on both Python 2 and Python 3).
            goose_title = None
Пример #5
0
    def _goose_cleaned_text(cls, html, page_html):
        """Drive Goose's cleaner, extractor and formatter by hand.

        :param html: stored as the article's ``raw_html``.
        :param page_html: stored as the article's ``raw_doc`` and initial
            ``doc``.
        :return: the article's ``cleaned_text``; it is only (re)computed
            when a best node was found.
        """
        goose_article = Article()
        goose_article.raw_html = html
        goose_article.raw_doc = page_html
        goose_article.doc = goose_article.raw_doc

        # Each helper gets its own fresh Configuration instance.
        # (An ImageExtractor is deliberately not created here.)
        extractor = ContentExtractor(Configuration(), goose_article)
        cleaner = DocumentCleaner(Configuration(), goose_article)
        formatter = OutputFormatter(Configuration(), goose_article)

        goose_article.doc = cleaner.clean()
        goose_article.top_node = extractor.calculate_best_node()

        if goose_article.top_node is None:
            # No best node: return whatever cleaned_text the article has.
            return goose_article.cleaned_text

        goose_article.top_node = extractor.post_cleanup()
        goose_article.cleaned_text = formatter.get_formatted_text()
        return goose_article.cleaned_text
Пример #6
0
 def getConfig(self):
     """Return a new ``Configuration`` with image fetching disabled."""
     settings = Configuration()
     setattr(settings, 'enable_image_fetching', False)
     return settings
Пример #7
0
 def getArticle(self, url, raw_html, language=None):
     """Extract a Goose article for ``url`` from the supplied HTML.

     :param url: forwarded to ``Goose.extract`` as ``url``.
     :param raw_html: forwarded to ``Goose.extract`` as ``raw_html``.
     :param language: accepted but not used by this implementation.
     :return: the article object produced by ``Goose.extract``.
     """
     settings = Configuration()
     settings.enable_image_fetching = False
     extractor = Goose(config=settings)
     return extractor.extract(url=url, raw_html=raw_html)
Пример #8
0
 def __init__(self, config=None):
     """Store the configuration, then run the two set-up hooks.

     :param config: configuration to use; any falsy value (``None``, an
         empty dict, ...) is replaced by a default ``Configuration``.
     """
     if not config:
         config = Configuration()
     self.config = config
     # Both hooks are defined outside this snippet and run in this order.
     self.extend_config()
     self.initialize()
Пример #9
0
 def getConfig(self):
     """Return a new ``Configuration`` with image fetching enabled."""
     settings = Configuration()
     setattr(settings, 'enable_image_fetching', True)
     return settings
Пример #10
0
 def test_tmp_not_overwritten(self):
     # Pointing local_storage_path at a directory that does not exist is
     # expected to raise AttributeError.
     missing_dir = '/this/directory/does/not/exist/i/assume/'
     settings = Configuration()
     self.assertRaises(
         AttributeError, setattr, settings, 'local_storage_path', missing_dir)
Пример #11
0
 def getConfig(self):
     """Return a new ``Configuration`` that uses ``StopWordsArabic``."""
     settings = Configuration()
     setattr(settings, 'stopwords_class', StopWordsArabic)
     return settings
Пример #12
0
 def getConfig(self):
     """Return a new ``Configuration`` that uses ``StopWordsChinese``."""
     settings = Configuration()
     setattr(settings, 'stopwords_class', StopWordsChinese)
     return settings
Пример #13
0
import os,sys
import time
import subprocess
import signal
from httplib import IncompleteRead
from gevent.pool import Pool
import gevent.socket as socket
from gevent.event import Event
from goose import Goose
from goose.configuration import Configuration
from goose.text import StopWordsChinese
import chardet
import random

# Path constant and (host, port) pair used elsewhere in the file
# (presumably the URL input file and the endpoint for worker/master
# communication; confirm against usage).
url_file = '/data/algorithm/urlcontent'
address = ('192.168.32.5', 10888)

# Module-wide Goose settings: Chinese stop words, no image fetching, and a
# desktop Safari user agent string.
goose_config = Configuration()
goose_config.stopwords_class = StopWordsChinese
goose_config.enable_image_fetching = False
goose_config.browser_user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) "
    "AppleWebKit/534.52.7 (KHTML, like Gecko) "
    "Version/5.1.2 Safari/534.52.7"
)
# The alternative 'soup' parser is intentionally left disabled:
#goose_config.parser_class = 'soup'

# Single extractor instance built from the settings above.
g = Goose(config=goose_config)

class Worker(object):
    '''
    子进程运行的代码,通过起一个协程来和主进程通信
    包括接受任务分配请求,退出信号(零字节包),及反馈任务执行进度
Пример #14
0
def extractArticle(url):
    """Fetch and extract the article at ``url`` with Goose.

    Goose's ``local_storage_path`` is set to the module-level ``tmp_dir``
    (defined outside this snippet) before extraction.

    :param url: address passed to ``Goose.extract`` as ``url``.
    :return: the article object produced by ``Goose.extract``.
    """
    from goose.configuration import Configuration
    settings = Configuration()
    settings.local_storage_path = tmp_dir
    extractor = Goose(settings)
    return extractor.extract(url=url)