def clean(html_content):
    """Strip boilerplate from raw HTML and return the cleaned article text.

    :param html_content: raw HTML string of a web page.
    :returns: the article body text as extracted by goose.
    """
    configuration = Configuration()
    # Text-only extraction: do not download images referenced by the page.
    configuration.enable_image_fetching = False
    goose = Goose(config=configuration)
    return goose.extract(raw_html=html_content).cleaned_text
def getArticle(self, url, raw_html, language=None):
    """Extract an article from pre-fetched HTML.

    :param url: accepted for interface parity but not used here —
        extraction operates purely on ``raw_html``.
    :param raw_html: the page's HTML source.
    :param language: optional language code; when given, goose is forced
        to use it instead of detecting language from meta tags.
    :returns: the extracted goose Article object.
    """
    config = Configuration()
    if language:
        # Pin the target language and stop goose from trusting <meta>.
        config.target_language = language
        config.use_meta_language = False
    config.enable_image_fetching = False
    return Goose(config=config).extract(raw_html=raw_html)
def extend_config(self):
    """Normalise ``self.config`` into a Configuration instance.

    When ``self.config`` is a plain dict, build a fresh Configuration and
    copy over only the keys that Configuration already defines; unknown
    keys are silently ignored. A non-dict config is left untouched.
    """
    if not isinstance(self.config, dict):
        return
    merged = Configuration()
    for key, value in self.config.items():
        # Only accept attributes Configuration knows about.
        if hasattr(merged, key):
            setattr(merged, key, value)
    self.config = merged
def extract(cls, html, html_formated):
    """Collect candidate page titles from <title>, <h1>/<h2> headings,
    and goose's own title extractor.

    :param html: raw HTML string of the page.
    :param html_formated: parsed document for the same page — presumably
        an lxml doc produced by the caller; TODO confirm.
    """
    potential_titles = []
    soup = BeautifulSoup(html, 'html.parser')
    if soup.title:
        page_title = TitleExtractor.extract_text(soup.title)
        # Trim site-name suffixes: keep only the segment before the
        # first known separator character.
        for split_char in TitleExtractor.SPLIT_CHARS:
            if split_char in page_title:
                page_title = page_title.split(split_char)[0].strip()
        potential_titles.append(page_title)
    for heading_tag in (soup.find_all('h1') + soup.find_all('h2')):
        potential_title = TitleExtractor.extract_text(heading_tag)
        if potential_title:
            potential_titles.append(potential_title)
    # Extract article title via goose.
    article = Article()
    article.raw_html = html
    article.raw_doc = html_formated
    article.doc = article.raw_doc
    try:
        goose_title = TitleExtractorGoose(Configuration(), article).get_title()
    # Fixed: `except AttributeError, e:` is Python-2-only syntax and the
    # bound exception was never used; the bare form works on Py2.6+ and Py3.
    except AttributeError:
        # Goose can fail on malformed documents; fall back to no title.
        goose_title = None
def _goose_cleaned_text(cls, html, page_html):
    """Run goose's extraction pipeline on pre-parsed HTML and return the
    cleaned article body text.

    :param html: raw HTML string of the page.
    :param page_html: parsed document for the same page — presumably an
        lxml doc matching ``html``; TODO confirm against callers.
    :returns: text produced by goose's OutputFormatter; if no top node is
        found, ``article.cleaned_text`` is returned as whatever default
        Article gives it (not set here).
    """
    article = Article()
    article.raw_html = html
    article.raw_doc = page_html
    article.doc = article.raw_doc
    goose_extractor = ContentExtractor(Configuration(), article)
    goose_cleaner = DocumentCleaner(Configuration(), article)
    goose_formatter = OutputFormatter(Configuration(), article)
    # goose_image_extractor = ImageExtractor(Configuration(), article)  # unused
    # Order matters: clean the DOM first, then score for the best content
    # node, then post-clean and format — mirrors goose's own extract().
    article.doc = goose_cleaner.clean()
    article.top_node = goose_extractor.calculate_best_node()
    if article.top_node is not None:
        article.top_node = goose_extractor.post_cleanup()
        article.cleaned_text = goose_formatter.get_formatted_text()
    return article.cleaned_text
def getConfig(self):
    """Return a goose Configuration with image fetching disabled."""
    cfg = Configuration()
    cfg.enable_image_fetching = False
    return cfg
def getArticle(self, url, raw_html, language=None):
    """Extract an article from pre-fetched HTML, passing the URL as well.

    :param url: source URL, forwarded to goose alongside the HTML.
    :param raw_html: the page's HTML source.
    :param language: accepted for interface parity but not used here.
    :returns: the extracted goose Article object.
    """
    cfg = Configuration()
    cfg.enable_image_fetching = False
    extractor = Goose(config=cfg)
    return extractor.extract(url=url, raw_html=raw_html)
def __init__(self, config=None):
    """Initialise with the given config, or a default Configuration.

    Any falsy ``config`` (None, empty dict, ...) is replaced by a fresh
    Configuration; a dict config is then converted by extend_config().
    """
    self.config = config if config else Configuration()
    self.extend_config()
    self.initialize()
def getConfig(self):
    """Return a goose Configuration with image fetching enabled."""
    cfg = Configuration()
    cfg.enable_image_fetching = True
    return cfg
def test_tmp_not_overwritten(self):
    """Assigning local_storage_path on a Configuration must raise
    AttributeError (the attribute is read-only in this goose version)."""
    path = '/this/directory/does/not/exist/i/assume/'
    config = Configuration()

    def _assign():
        setattr(config, 'local_storage_path', path)

    self.assertRaises(AttributeError, _assign)
def getConfig(self):
    """Return a goose Configuration using the Arabic stop-words class."""
    cfg = Configuration()
    cfg.stopwords_class = StopWordsArabic
    return cfg
def getConfig(self):
    """Return a goose Configuration using the Chinese stop-words class."""
    cfg = Configuration()
    cfg.stopwords_class = StopWordsChinese
    return cfg
import os,sys import time import subprocess import signal from httplib import IncompleteRead from gevent.pool import Pool import gevent.socket as socket from gevent.event import Event from goose import Goose from goose.configuration import Configuration from goose.text import StopWordsChinese import chardet import random goose_config = Configuration() goose_config.enable_image_fetching = False goose_config.browser_user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.52.7 (KHTML, like Gecko) Version/5.1.2 Safari/534.52.7" #goose_config.parser_class = 'soup' goose_config.stopwords_class = StopWordsChinese g = Goose(config=goose_config) url_file = '/data/algorithm/urlcontent' address = ('192.168.32.5', 10888) class Worker(object): ''' 子进程运行的代码,通过起一个协程来和主进程通信 包括接受任务分配请求,退出信号(零字节包),及反馈任务执行进度
def extractArticle(url):
    """Fetch *url* and extract its article with goose, storing temporary
    files under the module-level ``tmp_dir``.

    NOTE(review): ``tmp_dir`` is a global defined elsewhere in this file —
    verify it exists before this is called.
    """
    from goose.configuration import Configuration
    cfg = Configuration()
    cfg.local_storage_path = tmp_dir
    return Goose(cfg).extract(url=url)