示例#1
0
def clean(html_content):
    """Return the cleaned article text Goose extracts from *html_content*.

    Image fetching is switched off on the configuration before the
    extraction runs.
    """
    settings = Configuration()
    settings.enable_image_fetching = False
    parsed = Goose(config=settings).extract(raw_html=html_content)
    return parsed.cleaned_text
示例#2
0
 def getArticle(self, url, raw_html, language=None):
     """Extract an article from ``raw_html`` with image fetching disabled.

     When ``language`` is truthy it is set as the target language and the
     meta-language lookup is switched off.  ``url`` is accepted but not
     used by this method.
     """
     settings = Configuration()
     settings.enable_image_fetching = False
     if language:
         settings.target_language = language
         settings.use_meta_language = False
     return Goose(config=settings).extract(raw_html=raw_html)
示例#3
0
 def extend_config(self):
     """Turn a dict-valued ``self.config`` into a ``Configuration`` object.

     Only keys that already exist as attributes on a fresh
     ``Configuration`` are copied over; other keys are ignored.  When
     ``self.config`` is not a dict it is left untouched.
     """
     overrides = self.config
     if not isinstance(overrides, dict):
         return
     settings = Configuration()
     for name, value in overrides.items():
         if hasattr(settings, name):
             setattr(settings, name, value)
     self.config = settings
示例#4
0
    def extract(cls, html, html_formated):
        """Collect candidate titles for a page.

        Candidates are gathered from the ``<title>`` element and from the
        text of every ``<h1>`` and ``<h2>`` element (all ``<h1>`` first,
        then all ``<h2>``).  A title is additionally requested from Goose's
        title extractor.

        :param html: raw HTML markup; parsed here with BeautifulSoup's
            ``html.parser`` and stored as the Goose article's ``raw_html``.
        :param html_formated: object stored as the Goose article's
            ``raw_doc`` and ``doc`` (presumably an already parsed document
            tree -- confirm against the caller).
        """

        potential_titles = []
        soup = BeautifulSoup(html, 'html.parser')

        if soup.title:
            page_title = TitleExtractor.extract_text(soup.title)

            # Successively keep only the part before each separator that
            # occurs in the (possibly already shortened) page title.
            for split_char in TitleExtractor.SPLIT_CHARS:
                if split_char in page_title:
                    page_title = page_title.split(split_char)[0].strip()

            potential_titles.append(page_title)

        for heading_tag in (soup.find_all('h1') + soup.find_all('h2')):
            potential_title = TitleExtractor.extract_text(heading_tag)
            # Headings without any text are not candidates.
            if potential_title:
                potential_titles.append(potential_title)

        # Extract article from goose
        article = Article()
        article.raw_html = html
        article.raw_doc = html_formated
        article.doc = article.raw_doc
        try:
            goose_title = TitleExtractorGoose(Configuration(),
                                              article).get_title()
        except AttributeError:
            # Treat an AttributeError from Goose as "no title found".
            goose_title = None
示例#5
0
    def _goose_cleaned_text(cls, html, page_html):
        """Run Goose's cleaning/extraction steps and return the cleaned text.

        An ``Article`` is populated with ``html`` (as ``raw_html``) and
        ``page_html`` (as ``raw_doc`` and ``doc``).  The document is cleaned,
        the best node is calculated and, when one is found, it is
        post-processed and formatted into ``cleaned_text``.  Whatever
        ``article.cleaned_text`` holds afterwards is returned.
        """
        doc_article = Article()
        doc_article.raw_html = html
        doc_article.raw_doc = page_html
        doc_article.doc = doc_article.raw_doc

        # Each helper gets its own default Configuration instance.
        extractor = ContentExtractor(Configuration(), doc_article)
        cleaner = DocumentCleaner(Configuration(), doc_article)
        formatter = OutputFormatter(Configuration(), doc_article)
        # goose_image_extractor = ImageExtractor(Configuration(), article) use

        doc_article.doc = cleaner.clean()
        doc_article.top_node = extractor.calculate_best_node()

        # Without a top node there is nothing to post-process or format.
        if doc_article.top_node is None:
            return doc_article.cleaned_text

        doc_article.top_node = extractor.post_cleanup()
        doc_article.cleaned_text = formatter.get_formatted_text()
        return doc_article.cleaned_text
示例#6
0
 def getConfig(self):
     """Return a fresh ``Configuration`` with image fetching disabled."""
     cfg = Configuration()
     cfg.enable_image_fetching = False
     return cfg
示例#7
0
 def getArticle(self, url, raw_html, language=None):
     """Extract an article from ``url``/``raw_html`` without image fetching.

     Both ``url`` and ``raw_html`` are forwarded to Goose's ``extract``.
     ``language`` is accepted but not used by this method.
     """
     settings = Configuration()
     settings.enable_image_fetching = False
     extractor = Goose(config=settings)
     return extractor.extract(url=url, raw_html=raw_html)
示例#8
0
 def __init__(self, config=None):
     """Store the configuration, then extend it and initialize the instance.

     A falsy ``config`` (including the default ``None``) is replaced by a
     new ``Configuration`` before ``extend_config`` and ``initialize`` run.
     """
     if not config:
         config = Configuration()
     self.config = config
     self.extend_config()
     self.initialize()
示例#9
0
 def getConfig(self):
     """Return a fresh ``Configuration`` with image fetching enabled."""
     cfg = Configuration()
     cfg.enable_image_fetching = True
     return cfg
示例#10
0
 def test_tmp_not_overwritten(self):
     """Assigning ``local_storage_path`` on a Configuration must raise.

     The test expects an ``AttributeError`` when the attribute is set to
     the path below.
     """
     bogus_path = '/this/directory/does/not/exist/i/assume/'
     settings = Configuration()
     # assertRaises invokes setattr(settings, 'local_storage_path', bogus_path).
     self.assertRaises(
         AttributeError, setattr, settings, 'local_storage_path', bogus_path)
示例#11
0
 def getConfig(self):
     """Return a fresh ``Configuration`` using ``StopWordsArabic``."""
     cfg = Configuration()
     cfg.stopwords_class = StopWordsArabic
     return cfg
示例#12
0
 def getConfig(self):
     """Return a fresh ``Configuration`` using ``StopWordsChinese``."""
     cfg = Configuration()
     cfg.stopwords_class = StopWordsChinese
     return cfg
示例#13
0
import os,sys
import time
import subprocess
import signal
from httplib import IncompleteRead
from gevent.pool import Pool
import gevent.socket as socket
from gevent.event import Event
from goose import Goose
from goose.configuration import Configuration
from goose.text import StopWordsChinese
import chardet
import random

# Shared Goose configuration: image fetching off, a fixed browser
# user agent, and Chinese stop words.
goose_config = Configuration()
goose_config.enable_image_fetching = False
goose_config.browser_user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) "
    "AppleWebKit/534.52.7 (KHTML, like Gecko) "
    "Version/5.1.2 Safari/534.52.7"
)
#goose_config.parser_class = 'soup'
goose_config.stopwords_class = StopWordsChinese

# Module-level Goose instance built from the configuration above.
g = Goose(config=goose_config)

# NOTE(review): how this path is used is outside the visible code.
url_file = '/data/algorithm/urlcontent'

# (host, port) tuple -- NOTE(review): its consumer is outside the visible code.
address = ('192.168.32.5', 10888)

class Worker(object):
    '''
    子进程运行的代码,通过起一个协程来和主进程通信
    包括接受任务分配请求,退出信号(零字节包),及反馈任务执行进度
示例#14
0
def extractArticle(url):
    """Run Goose's extraction for *url* and return the result.

    The configuration's ``local_storage_path`` is set to ``tmp_dir``, a
    name that must be provided by the surrounding module.
    """
    from goose.configuration import Configuration
    settings = Configuration()
    settings.local_storage_path = tmp_dir
    extractor = Goose(settings)
    return extractor.extract(url=url)