Python Configuration示例

编程语言: Python

命名空间/包名称: goose.configuration

类/类型: Configuration

hotexamples.com的示例: 14

Python Configuration - 已找到14个示例。这些是从开源项目中提取的最受好评的goose.configuration.Configuration真实Python示例。您可以为示例评分，以帮助我们提高示例质量。

常用方法

显示隐藏

Configuration(11)

enable_image_fetching(5)

browser_user_agent(1)

local_storage_path(1)

stopwords_class(1)

target_language(1)

use_meta_language(1)

示例#1

显示文件

文件： goose.py 项目： aniav/python-laundry

def clean(html_content):
    """Return the cleaned article text Goose extracts from raw HTML.

    Image fetching is switched off on the configuration before the
    extractor is built.
    """
    settings = Configuration()
    settings.enable_image_fetching = False

    parsed = Goose(config=settings).extract(raw_html=html_content)
    return parsed.cleaned_text

示例#2

显示文件

文件： tests.py 项目： gaybro8777/python-goose

 def getArticle(self, url, raw_html, language=None):
     """Extract an article from ``raw_html`` with image fetching disabled.

     When ``language`` is truthy it is set as the target language and the
     meta-language lookup is turned off. ``url`` is accepted but not used
     by this method.
     """
     settings = Configuration()
     if language:
         settings.target_language = language
         settings.use_meta_language = False
     settings.enable_image_fetching = False
     return Goose(config=settings).extract(raw_html=raw_html)

示例#3

显示文件

 def extend_config(self):
     """Replace a dict-valued ``self.config`` with a ``Configuration``.

     Only keys that already exist as attributes on a fresh
     ``Configuration`` are copied over; other keys are ignored. A
     non-dict ``self.config`` is left untouched.
     """
     if not isinstance(self.config, dict):
         return

     settings = Configuration()
     for name, value in self.config.items():
         if hasattr(settings, name):
             setattr(settings, name, value)
     self.config = settings

示例#4

显示文件

    def extract(cls, html, html_formated):
        """Collect candidate titles for a page.

        Candidates come from the ``<title>`` element, from every ``<h1>``
        and ``<h2>`` heading, and from Goose's own title extractor.

        ``html`` is the raw markup parsed with BeautifulSoup;
        ``html_formated`` is stored as the Goose article's ``raw_doc`` and
        ``doc``.

        NOTE(review): this listing stops right after the Goose title
        lookup, so what is done with ``potential_titles`` and
        ``goose_title`` afterwards is not visible here.
        """

        potential_titles = []
        soup = BeautifulSoup(html, 'html.parser')

        if soup.title:
            page_title = TitleExtractor.extract_text(soup.title)

            # Keep only the part before each separator that occurs; the
            # truncated title is re-checked against the next separator.
            for split_char in TitleExtractor.SPLIT_CHARS:
                if split_char in page_title:
                    page_title = page_title.split(split_char)[0].strip()

            potential_titles.append(page_title)

        for heading_tag in (soup.find_all('h1') + soup.find_all('h2')):
            potential_title = TitleExtractor.extract_text(heading_tag)
            if potential_title:
                potential_titles.append(potential_title)

        # Extract article from goose
        article = Article()
        article.raw_html = html
        article.raw_doc = html_formated
        article.doc = article.raw_doc
        try:
            goose_title = TitleExtractorGoose(Configuration(),
                                              article).get_title()
        except AttributeError:
            # Goose could not produce a title; fall back to "no title".
            # (The old ``except AttributeError, e`` form does not parse on
            # Python 3 and the bound exception was never used.)
            goose_title = None

示例#5

显示文件

    def _goose_cleaned_text(cls, html, page_html):
        """Run Goose's clean / extract / format steps and return the text.

        ``html`` is stored as the article's raw HTML and ``page_html`` as
        its document. If no top node is found, the article's
        ``cleaned_text`` is returned as the fresh ``Article`` left it.
        No image extractor is instantiated here.
        """
        doc_article = Article()
        doc_article.raw_html = html
        doc_article.raw_doc = page_html
        doc_article.doc = doc_article.raw_doc

        # Each Goose component receives its own fresh Configuration.
        extractor = ContentExtractor(Configuration(), doc_article)
        cleaner = DocumentCleaner(Configuration(), doc_article)
        formatter = OutputFormatter(Configuration(), doc_article)

        # Clean first, then pick the best node from the cleaned document.
        doc_article.doc = cleaner.clean()
        doc_article.top_node = extractor.calculate_best_node()

        if doc_article.top_node is None:
            return doc_article.cleaned_text

        doc_article.top_node = extractor.post_cleanup()
        doc_article.cleaned_text = formatter.get_formatted_text()
        return doc_article.cleaned_text

示例#6

显示文件

文件： base.py 项目： 437072341/python-goose

 def getConfig(self):
     """Return a fresh ``Configuration`` with image fetching disabled."""
     settings = Configuration()
     settings.enable_image_fetching = False
     return settings

示例#7

显示文件

文件： tests.py 项目： gaybro8777/python-goose

 def getArticle(self, url, raw_html, language=None):
     """Extract an article for ``url`` from ``raw_html`` without images.

     ``language`` is accepted but not used by this method.
     """
     settings = Configuration()
     settings.enable_image_fetching = False
     extractor = Goose(config=settings)
     return extractor.extract(url=url, raw_html=raw_html)

示例#8

显示文件

 def __init__(self, config=None):
     """Store ``config`` (or a default ``Configuration``) and initialize.

     A falsy ``config`` is replaced by a fresh ``Configuration``; then
     ``extend_config`` and ``initialize`` are called in that order.
     """
     if config:
         self.config = config
     else:
         self.config = Configuration()
     self.extend_config()
     self.initialize()

示例#9

显示文件

 def getConfig(self):
     """Return a fresh ``Configuration`` with image fetching enabled."""
     settings = Configuration()
     settings.enable_image_fetching = True
     return settings

示例#10

显示文件

 def test_tmp_not_overwritten(self):
     """Assigning this ``local_storage_path`` must raise AttributeError."""
     missing_dir = '/this/directory/does/not/exist/i/assume/'
     settings = Configuration()
     # Hand setattr and its arguments to assertRaises so the assignment
     # runs inside the assertion.
     self.assertRaises(AttributeError, setattr,
                       settings, 'local_storage_path', missing_dir)

示例#11

显示文件

文件： extractors.py 项目： pansuo/python-goose

 def getConfig(self):
     """Return a fresh ``Configuration`` using ``StopWordsArabic``."""
     settings = Configuration()
     settings.stopwords_class = StopWordsArabic
     return settings

示例#12

显示文件

文件： extractors.py 项目： pansuo/python-goose

 def getConfig(self):
     """Return a fresh ``Configuration`` using ``StopWordsChinese``."""
     settings = Configuration()
     settings.stopwords_class = StopWordsChinese
     return settings

示例#13

显示文件

import os,sys
import time
import subprocess
import signal
from httplib import IncompleteRead
from gevent.pool import Pool
import gevent.socket as socket
from gevent.event import Event
from goose import Goose
from goose.configuration import Configuration
from goose.text import StopWordsChinese
import chardet
import random

# Module-level Goose setup: one shared configuration and extractor are
# created at import time.
goose_config = Configuration()
# Do not fetch images during extraction.
goose_config.enable_image_fetching = False
# User-agent string assigned to the configuration (a desktop Safari UA).
goose_config.browser_user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.52.7 (KHTML, like Gecko) Version/5.1.2 Safari/534.52.7"
#goose_config.parser_class = 'soup'
# Use the Chinese stop-word class.
goose_config.stopwords_class = StopWordsChinese

# Shared extractor built from the configuration above.
g = Goose(config=goose_config)

# NOTE(review): presumably the file holding URL content to process —
# confirm against the code that reads it.
url_file = '/data/algorithm/urlcontent'

# NOTE(review): (host, port) pair; presumably the peer the worker talks
# to — confirm against the socket code that uses it.
address = ('192.168.32.5', 10888)

class Worker(object):
    '''
    子进程运行的代码,通过起一个协程来和主进程通信
    包括接受任务分配请求，退出信号(零字节包)，及反馈任务执行进度

示例#14

显示文件

def extractArticle(url):
    """Fetch ``url`` with Goose and return the extracted article.

    Goose's local storage path is set to ``tmp_dir``, a name that must be
    defined outside this function.
    """
    from goose.configuration import Configuration
    settings = Configuration()
    settings.local_storage_path = tmp_dir
    extractor = Goose(settings)
    return extractor.extract(url=url)