def extract(results):
    """Fetch and parse the article referenced by ``results["url"]``.

    Parameters
    ----------
    results : dict
        Must contain at least ``"url"`` and ``"title"`` keys.

    Returns
    -------
    tuple
        ``(url, title, text, publish_date)``; ``text`` and
        ``publish_date`` are ``None`` when fetching or parsing fails.
    """
    try:
        config = Configuration()
        config.fetch_images = False  # text-only scrape; skip image downloads
        req = urllib.request.Request(
            results["url"],
            headers={
                # Some sites reject the default urllib UA; spoof a browser.
                'User-Agent': "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.0.1) Gecko/20020919"
            })
        # Context manager closes the connection on every path (the original
        # left the socket open).  Decoding with errors='ignore' drops every
        # non-ASCII byte, exactly like the original per-character filter.
        with urllib.request.urlopen(req, timeout=10) as con:
            html = con.read().decode('ascii', errors='ignore')
        article = Article(url='', config=config)
        article.set_html(html)
        article.parse()
        text = ''.join(c if ord(c) < 128 else ' ' for c in str(article.text))
        if len(text) < 300:
            # A very short extraction usually means the default (English)
            # parser missed the body; retry with the Indonesian model.
            article = Article(url='', config=config, language="id")
            article.set_html(html)
            article.parse()
            text = ''.join(
                c if ord(c) < 128 else ' ' for c in str(article.text))
        # Flatten line/tab breaks so the stored text is a single line.
        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        print("=", end='', flush=True)  # progress tick for batch runs
        return (results["url"], results["title"], text, article.publish_date)
    except Exception as e:
        # Best-effort scraper: report and return placeholders, never raise.
        print(e)
        return (results["url"], results["title"], None, None)
def retrieve_article(url):
    """Download *url* and return its extracted body text.

    Parameters
    ----------
    url : str
        Address of the article to fetch.

    Returns
    -------
    str or bool
        ASCII-only, whitespace-flattened article text, or ``False`` on
        any failure (kept for backward compatibility with callers).
    """
    try:
        config = Configuration()
        config.fetch_images = False  # text-only scrape; skip image downloads
        req = urllib.request.Request(
            url,
            headers={
                # Some sites reject the default urllib UA; spoof a browser.
                'User-Agent': "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.0.1) Gecko/20020919"
            })
        # Context manager closes the connection on every path (the original
        # leaked the socket).  errors='ignore' drops non-ASCII bytes, which
        # matches the original per-character ord(x) < 128 filter.
        with urllib.request.urlopen(req, timeout=10) as con:
            html = con.read().decode('ascii', errors='ignore')
        article = Article(url='', config=config)
        article.set_html(html)
        article.parse()
        text = ''.join(c if ord(c) < 128 else ' ' for c in str(article.text))
        if len(text) < 300:
            # Short output suggests the default (English) parser missed the
            # body; retry with the Indonesian model.
            article = Article(url='', config=config, language="id")
            article.set_html(html)
            article.parse()
            text = ''.join(
                c if ord(c) < 128 else ' ' for c in str(article.text))
        # Flatten line/tab breaks so the returned text is a single line.
        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        return text
    except Exception as e:
        # Best-effort: report the error and signal failure with False.
        print(e)
        return False
def clean(html_content):
    """Return the readable body text extracted from raw HTML markup."""
    # TODO: allow URL passing
    cfg = Configuration()
    cfg.fetch_images = False  # only the text matters here
    doc = Article("http://example.com", config=cfg)
    doc.set_html(html_content)
    # Mark as downloaded so parse() works on the injected HTML.
    doc.is_downloaded = True
    doc.parse()
    return doc.text
from newspaper.configuration import Configuration
from difflib import SequenceMatcher
from urllib.parse import urlparse

# Name of the zip-output location (its consumer is not visible in this chunk).
ZIPS = 'zips'

# Source sites we refuse to scrape.
blocklisted = ['http://www.legacy.com/']

# Skeleton record copied for each stored article.
# NOTE(review): relies on `json` being imported elsewhere in this file.
with open('article_template.json') as file:
    article_template = json.load(file)

# Append-mode log handles, kept open for the life of the process.
alog = open('article_log1.log', 'a')
slog = open('sources_log1.log', 'a')

## TODO do not fetch images
config = Configuration()
config.fetch_images = False


def similar(a, b):
    """Return a 0..1 similarity ratio between strings *a* and *b*."""
    return SequenceMatcher(None, a, b).ratio()


def scrape_source(source):
    """Build a newspaper source from ``source['url']`` and store its articles.

    NOTE(review): depends on `newspaper` and `datetime` being imported
    elsewhere in this file.
    """
    try:
        news_source = newspaper.build(source['url'], config=config)
        store_articles(source, news_source)
    except Exception as e:
        # Best-effort crawl: log a timestamped failure and keep going.
        slog.write('\n' + datetime.now().isoformat() + '\t' + str(e))


def store_articles(source, news_source):
    # Tail of a blacklist whose opening bracket is above this chunk:
    # static-asset file extensions the crawler skips.
    '.js', '.css', '.png', '.jpg', '.jpeg', '.pdf', '.ico', '.gif', '.m4a',
    '.woff2'
]

# URL patterns that must never be crawled (e.g. sign-out links that
# would log the crawler out of a session).
BLACKLIST_REGEX = [
    'http[s]?://(.*)signout(.*)'
]

# Shared newspaper configuration: text only, no memoised article cache.
NEWSPAPER_CONFIG = Configuration()
NEWSPAPER_CONFIG.fetch_images = False
NEWSPAPER_CONFIG.memoize_articles = False


class BaseCrawler:
    # Base class for site crawlers; subclasses presumably override these
    # class attributes (class body continues beyond this chunk).

    # Crawler Identifier
    crawler_id = 'com.base'

    # Rate limit configuration
    requests_per_sec = 1

    # robots.txt url
    robots_url = None

    # URLs of pages to crawl
    # start from
    start_url = []