Example #1
import urllib.request

from newspaper import Article
from newspaper.configuration import Configuration


def extract(results):
    try:
        config = Configuration()
        config.fetch_images = False

        req = urllib.request.Request(
            results["url"],
            headers={
                'User-Agent':
                "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.0.1) Gecko/20020919"
            })
        con = urllib.request.urlopen(req, timeout=10)
        # Keep only ASCII bytes from the response to sidestep encoding issues
        html = ''.join([x for x in map(chr, con.read()) if ord(x) < 128])

        article = Article(url='', config=config)
        article.set_html(html)
        article.parse()
        text = ''.join([i if ord(i) < 128 else ' ' for i in str(article.text)])

        # If extraction produced too little text, retry with the Indonesian parser
        if len(text) < 300:
            article = Article(url='', config=config, language="id")
            article.set_html(html)
            article.parse()
            text = ''.join(
                [i if ord(i) < 128 else ' ' for i in str(article.text)])

        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')

        print("=", end='', flush=True)
        return (results["url"], results["title"], text, article.publish_date)
    except Exception as e:
        print(e)
        return (results["url"], results["title"], None, None)
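Since extract() emits one "=" progress marker per article and returns a (url, title, text, publish_date) tuple, it lends itself to batch use. A minimal driver sketch, assuming a list of result dicts with "url" and "title" keys; the results_list contents and the pool size are hypothetical:

from multiprocessing.dummy import Pool  # thread pool, so extract() and its arguments need no pickling

results_list = [
    {"url": "http://example.com/article-1", "title": "Article 1"},
    {"url": "http://example.com/article-2", "title": "Article 2"},
]

with Pool(4) as pool:
    rows = pool.map(extract, results_list)  # one (url, title, text, publish_date) tuple per input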
Example #2
import urllib.request

from newspaper import Article
from newspaper.configuration import Configuration


def retrieve_article(url):
    try:
        config = Configuration()
        config.fetch_images = False

        req = urllib.request.Request(
            url,
            headers={
                'User-Agent':
                "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.0.1) Gecko/20020919"
            })
        con = urllib.request.urlopen(req, timeout=10)
        html = ''.join([x for x in map(chr, con.read()) if ord(x) < 128])

        article = Article(url='', config=config)
        article.set_html(html)
        article.parse()
        text = ''.join([i if ord(i) < 128 else ' ' for i in str(article.text)])

        if len(text) < 300:
            article = Article(url='', config=config, language="id")
            article.set_html(html)
            article.parse()
            text = ''.join(
                [i if ord(i) < 128 else ' ' for i in str(article.text)])

        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        return text
    except Exception as e:
        print(e)
        return False
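Because retrieve_article() swallows exceptions and returns False on failure, callers should test the result before using it. A minimal sketch; the URL is hypothetical:

text = retrieve_article("http://example.com/some-article")
if text is False:
    print("extraction failed")
else:
    print(text[:200])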
Example #3
from newspaper import Article
from newspaper.configuration import Configuration


def clean(html_content):
    config = Configuration()
    config.fetch_images = False

    # TODO: allow URL passing
    article = Article("http://example.com", config=config)
    article.set_html(html_content)
    article.is_downloaded = True  # flag the injected HTML as already downloaded
    article.parse()

    return article.text
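A minimal usage sketch for clean(): fetch the page yourself and hand the raw HTML to the function. The URL and the decoding step are illustrative, not part of the original example:

import urllib.request

with urllib.request.urlopen("http://example.com", timeout=10) as con:
    raw_html = con.read().decode("utf-8", errors="replace")

print(clean(raw_html))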
Example #4
import json
from datetime import datetime
from difflib import SequenceMatcher
from urllib.parse import urlparse

import newspaper
from newspaper.configuration import Configuration

ZIPS = 'zips'
blocklisted = ['http://www.legacy.com/']

with open('article_template.json') as file:
    article_template = json.load(file)

alog = open('article_log1.log', 'a')
slog = open('sources_log1.log', 'a')

## TODO do not fetch images
config = Configuration()
config.fetch_images = False


def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()


def scrape_source(source):
    try:
        news_source = newspaper.build(source['url'], config=config)
        store_articles(source, news_source)
    except Exception as e:
        slog.write('\n' + datetime.now().isoformat() + '\t' + str(e))


def store_articles(source, news_source):
Example #5
    '.js',
    '.css',
    '.png',
    '.jpg',
    '.jpeg',
    '.pdf',
    '.ico',
    '.gif',
    '.m4a',
    '.woff2'
]
BLACKLIST_REGEX = [
    'http[s]?://(.*)signout(.*)'
]
NEWSPAPER_CONFIG = Configuration()
NEWSPAPER_CONFIG.fetch_images = False      # skip image downloads
NEWSPAPER_CONFIG.memoize_articles = False  # do not skip articles seen in earlier runs

class BaseCrawler:
    # Crawler Identifier
    crawler_id = 'com.base'

    # Rate limit configuration
    requests_per_sec = 1

    # robots.txt url
    robots_url = None

    # URLs of the pages to start crawling from
    start_url = []
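A sketch of how a concrete crawler might reuse the shared NEWSPAPER_CONFIG with newspaper.build(); the NewsSiteCrawler subclass and its start_url value are hypothetical:

import newspaper

class NewsSiteCrawler(BaseCrawler):
    crawler_id = 'com.example.news'
    requests_per_sec = 2
    start_url = ['http://example.com/news']

# build() crawls the site's category and feed pages and returns a Source object;
# the shared config disables image fetching and article memoization.
source = newspaper.build(NewsSiteCrawler.start_url[0], config=NEWSPAPER_CONFIG)
print(len(source.articles))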