def test_download_works(self):
    """Smoke-test multi-threaded downloading of three news sources.

    Builds the Slate, TechCrunch and ESPN sources with article memoization
    disabled, prints how many articles each source reports, hands the three
    sources to ``news_pool`` with two threads per source, waits on
    ``news_pool.join()`` and finally prints the HTML length of one article
    from each source.

    Nothing is asserted: the test only fails if one of the calls raises
    (for example ``IndexError`` when a source has too few articles).
    """
    config = Configuration()
    config.memoize_articles = False

    slate_paper = newspaper.build('http://slate.com', config=config)
    tc_paper = newspaper.build('http://techcrunch.com', config=config)
    espn_paper = newspaper.build('http://espn.com', config=config)

    print('Slate has %d articles TC has %d articles ESPN has %d articles' %
          (slate_paper.size(), tc_paper.size(), espn_paper.size()))

    papers = [slate_paper, tc_paper, espn_paper]
    news_pool.set(papers, threads_per_source=2)
    news_pool.join()

    # A single pre-formatted argument makes print(...) emit the same text
    # under both Python 2 and Python 3 (the old statement form is a
    # SyntaxError on Python 3).
    print('Downloaded Slate mthread len %d' %
          len(slate_paper.articles[0].html))
    print('Downloaded ESPN mthread len %d' %
          len(espn_paper.articles[-1].html))
    print('Downloaded TC mthread len %d' %
          len(tc_paper.articles[1].html))
def test_download_works(self):
    """Smoke-test multi-threaded downloading of three news sources.

    Builds the slate, techcrunch and espn sources with article memoization
    disabled, prints how many articles each source reports, hands the three
    sources to ``news_pool`` with two threads per source, waits on
    ``news_pool.join()`` and finally prints the HTML length of one article
    from each source.

    Nothing is asserted: the test only fails if one of the calls raises
    (for example ``IndexError`` when a source has too few articles).
    """
    config = Configuration()
    config.memoize_articles = False

    slate_paper = newspaper.build('http://slate.com', config=config)
    tc_paper = newspaper.build('http://techcrunch.com', config=config)
    espn_paper = newspaper.build('http://espn.com', config=config)

    print('slate has %d articles tc has %d articles espn has %d articles' %
          (slate_paper.size(), tc_paper.size(), espn_paper.size()))

    papers = [slate_paper, tc_paper, espn_paper]
    news_pool.set(papers, threads_per_source=2)
    news_pool.join()

    # A single pre-formatted argument makes print(...) emit the same text
    # under both Python 2 and Python 3 (the old statement form is a
    # SyntaxError on Python 3).
    print('Downloaded slate mthread len %d' %
          len(slate_paper.articles[0].html))
    print('Downloaded espn mthread len %d' %
          len(espn_paper.articles[-1].html))
    print('Downloaded tc mthread len %d' %
          len(tc_paper.articles[1].html))
__author__ = "Vishal Jasrotia"
__copyright__ = ""
__credits__ = ["Vishal Jasrotia"]
__license__ = ""
__version__ = "1.0.0"
__maintainer__ = "Vishal Jasrotia"
__email__ = "*****@*****.**"
__status__ = "Testing"

from newsly.Builder import NewsBuilder
from newspaper.configuration import Configuration

if __name__ == "__main__":
    # Script entry point: configure newspaper, run the NewsBuilder, then
    # print its source-vs-article-URL report.
    config = Configuration()
    config.memoize_articles = True  # True in production
    config.MAX_AUTHORS = 2
    config.MIN_WORD_COUNT = 300
    # NOTE: config.MAX_SUMMARY (e.g. 900) is deliberately not set here; per
    # the original author's note it is applied to the text, not the summary.
    # TODO: Have a separate ArticleConfig and SourceConfig extend this!

    builder = NewsBuilder(config)
    builder.build()
    builder.print_source_vs_article_url()
'.css', '.png', '.jpg', '.jpeg', '.pdf', '.ico', '.gif', '.m4a', '.woff2'
]

# Regular expressions describing blacklisted URLs. The single pattern matches
# an http/https URL that contains "signout" somewhere after the scheme.
# NOTE(review): where and how these patterns are applied is not visible in
# this chunk -- confirm against the code that consumes BLACKLIST_REGEX.
BLACKLIST_REGEX = [
    'http[s]?://(.*)signout(.*)'
]

# Module-level newspaper Configuration with image fetching and article
# memoization both switched off.
NEWSPAPER_CONFIG = Configuration()
NEWSPAPER_CONFIG.fetch_images = False
NEWSPAPER_CONFIG.memoize_articles = False


class BaseCrawler:
    """Holds default crawler settings as class attributes.

    NOTE(review): presumably meant to be subclassed, with each subclass
    overriding these attributes; the rest of the class may lie beyond the
    visible part of the file -- confirm.
    """

    # Crawler identifier
    crawler_id = 'com.base'

    # Rate limit configuration (presumably the maximum number of requests
    # issued per second -- confirm against the code that reads it)
    requests_per_sec = 1

    # robots.txt URL; None by default
    robots_url = None

    # URLs of pages to start crawling from.
    # NOTE(review): this is a mutable class-level list, so it is shared by
    # every subclass/instance that does not rebind it.
    start_url = []