def extract(results):
    """Fetch and parse the article referenced by ``results["url"]``.

    Parameters
    ----------
    results : dict
        Must contain at least ``"url"`` and ``"title"`` keys.

    Returns
    -------
    tuple
        ``(url, title, text, publish_date)``; ``text`` and
        ``publish_date`` are ``None`` when fetching or parsing fails.
    """
    try:
        config = Configuration()
        config.fetch_images = False  # text-only scrape; skip image downloads
        req = urllib.request.Request(
            results["url"],
            headers={
                # Some sites reject the default urllib UA; spoof a browser.
                'User-Agent': "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.0.1) Gecko/20020919"
            })
        # Context manager closes the connection on every path (the original
        # left the socket open).  Decoding with errors='ignore' drops every
        # non-ASCII byte, exactly like the original per-character filter.
        with urllib.request.urlopen(req, timeout=10) as con:
            html = con.read().decode('ascii', errors='ignore')
        article = Article(url='', config=config)
        article.set_html(html)
        article.parse()
        text = ''.join(c if ord(c) < 128 else ' ' for c in str(article.text))
        if len(text) < 300:
            # A very short extraction usually means the default (English)
            # parser missed the body; retry with the Indonesian model.
            article = Article(url='', config=config, language="id")
            article.set_html(html)
            article.parse()
            text = ''.join(
                c if ord(c) < 128 else ' ' for c in str(article.text))
        # Flatten line/tab breaks so the stored text is a single line.
        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        print("=", end='', flush=True)  # progress tick for batch runs
        return (results["url"], results["title"], text, article.publish_date)
    except Exception as e:
        # Best-effort scraper: report and return placeholders, never raise.
        print(e)
        return (results["url"], results["title"], None, None)
def retrieve_article(url):
    """Download *url* and return its extracted body text.

    Parameters
    ----------
    url : str
        Address of the article to fetch.

    Returns
    -------
    str or bool
        ASCII-only, whitespace-flattened article text, or ``False`` on
        any failure (kept for backward compatibility with callers).
    """
    try:
        config = Configuration()
        config.fetch_images = False  # text-only scrape; skip image downloads
        req = urllib.request.Request(
            url,
            headers={
                # Some sites reject the default urllib UA; spoof a browser.
                'User-Agent': "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.0.1) Gecko/20020919"
            })
        # Context manager closes the connection on every path (the original
        # leaked the socket).  errors='ignore' drops non-ASCII bytes, which
        # matches the original per-character ord(x) < 128 filter.
        with urllib.request.urlopen(req, timeout=10) as con:
            html = con.read().decode('ascii', errors='ignore')
        article = Article(url='', config=config)
        article.set_html(html)
        article.parse()
        text = ''.join(c if ord(c) < 128 else ' ' for c in str(article.text))
        if len(text) < 300:
            # Short output suggests the default (English) parser missed the
            # body; retry with the Indonesian model.
            article = Article(url='', config=config, language="id")
            article.set_html(html)
            article.parse()
            text = ''.join(
                c if ord(c) < 128 else ' ' for c in str(article.text))
        # Flatten line/tab breaks so the returned text is a single line.
        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        return text
    except Exception as e:
        # Best-effort: report the error and signal failure with False.
        print(e)
        return False
def clean(html_content):
    """Return the readable body text extracted from raw HTML markup."""
    # TODO: allow URL passing
    cfg = Configuration()
    cfg.fetch_images = False  # only the text matters here
    doc = Article("http://example.com", config=cfg)
    doc.set_html(html_content)
    # Mark as downloaded so parse() works on the injected HTML.
    doc.is_downloaded = True
    doc.parse()
    return doc.text
from newspaper.configuration import Configuration
from difflib import SequenceMatcher
from urllib.parse import urlparse

# Name of the zip-output location (its consumer is not visible in this chunk).
ZIPS = 'zips'

# Source sites we refuse to scrape.
blocklisted = ['http://www.legacy.com/']

# Skeleton record copied for each stored article.
# NOTE(review): relies on `json` being imported elsewhere in this file.
with open('article_template.json') as file:
    article_template = json.load(file)

# Append-mode log handles, kept open for the life of the process.
alog = open('article_log1.log', 'a')
slog = open('sources_log1.log', 'a')

## TODO do not fetch images
config = Configuration()
config.fetch_images = False


def similar(a, b):
    """Return a 0..1 similarity ratio between strings *a* and *b*."""
    return SequenceMatcher(None, a, b).ratio()


def scrape_source(source):
    """Build a newspaper source from ``source['url']`` and store its articles.

    NOTE(review): depends on `newspaper` and `datetime` being imported
    elsewhere in this file.
    """
    try:
        news_source = newspaper.build(source['url'], config=config)
        store_articles(source, news_source)
    except Exception as e:
        # Best-effort crawl: log a timestamped failure and keep going.
        slog.write('\n' + datetime.now().isoformat() + '\t' + str(e))


def store_articles(source, news_source):
    # Tail of a blacklist whose opening bracket is above this chunk:
    # static-asset file extensions the crawler skips.
    '.js', '.css', '.png', '.jpg', '.jpeg', '.pdf', '.ico', '.gif', '.m4a',
    '.woff2'
]

# URL patterns that must never be crawled (e.g. sign-out links that
# would log the crawler out of a session).
BLACKLIST_REGEX = [
    'http[s]?://(.*)signout(.*)'
]

# Shared newspaper configuration: text only, no memoised article cache.
NEWSPAPER_CONFIG = Configuration()
NEWSPAPER_CONFIG.fetch_images = False
NEWSPAPER_CONFIG.memoize_articles = False


class BaseCrawler:
    # Base class for site crawlers; subclasses presumably override these
    # class attributes (class body continues beyond this chunk).

    # Crawler Identifier
    crawler_id = 'com.base'

    # Rate limit configuration
    requests_per_sec = 1

    # robots.txt url
    robots_url = None

    # URLs of pages to crawl
    # start from
    start_url = []