def test_download_file_success(self):
    """Downloading an existing local file through a ``file://`` URL succeeds.

    The URL is built from ``HTML_FN`` and ``cnn_article.html``. After
    ``download()`` the test checks the download state, that no exception
    message was recorded, and the length of the downloaded HTML.
    """
    fixture_path = os.path.join(HTML_FN, "cnn_article.html")
    article = Article(url="file://" + fixture_path)
    article.download()

    self.assertEqual(article.download_state, ArticleDownloadState.SUCCESS)
    # assertIsNone checks identity with None and produces a clearer failure
    # message than assertEqual(..., None).
    self.assertIsNone(article.download_exception_msg)
    self.assertEqual(75406, len(article.html))
def check_url(args):
    """
    Parse one mocked article and report which extraction checks failed.

    The HTML and the expected full text are both loaded through
    ``mock_resource_with`` using ``res_filename``.

    :param (basestr, basestr) args: a ``(url, res_filename)`` pair
    :return: (pubdate_failed, fulltext_failed)
    """
    url, res_filename = args
    html = mock_resource_with(res_filename, 'html')

    try:
        a = Article(url)
        a.download(html)
        a.parse()
        pubdate_failed = a.publish_date is None
    except Exception:
        # Any failure while downloading/parsing counts as both checks failing.
        print('<< URL: %s parse ERROR >>' % url)
        traceback.print_exc()
        return True, True

    correct_text = mock_resource_with(res_filename, 'txt')
    text_matches = a.text == correct_text
    if text_matches:
        return pubdate_failed, False

    # `correct_text` holds the reason of failure if failure
    print('%s -- %s -- %s' % ('Fulltext failed',
                              res_filename, correct_text.strip()))
    # TODO: a mismatch is only reported, not asserted, for full-text
    # extraction tests because we are constantly tweaking the
    # algorithm and improving
    return pubdate_failed, True
def test_meta_refresh_no_url_redirect(self):
    """With ``follow_meta_refresh`` enabled, the ``ap_meta_refresh`` fixture
    parses to the title 'News from The Associated Press'.
    """
    refresh_config = Configuration()
    refresh_config.follow_meta_refresh = True

    article = Article('', config=refresh_config)
    article.download(input_html=mock_resource_with('ap_meta_refresh', 'html'))
    article.parse()

    self.assertEqual(article.title, 'News from The Associated Press')
def test_thai_fulltext_extract(self):
    """Thai full-text extraction matches the stored ``thai`` text resource,
    both through ``Article`` and through the ``fulltext`` helper.
    """
    article = Article(
        url='https://prachatai.com/journal/2019/01/80642',
        language='th',
    )
    article.download(mock_resource_with('thai_article', 'html'))
    article.parse()

    expected = mock_resource_with('thai', 'txt')
    self.assertEqual(expected, article.text)
    self.assertEqual(expected, fulltext(article.html, 'th'))
def test_japanese_fulltext_extract2(self):
    """Japanese full-text extraction (second fixture) matches the stored
    ``japanese2`` text resource, both through ``Article`` and ``fulltext``.
    """
    article = Article(
        url='http://www.afpbb.com/articles/-/3178894',
        language='ja',
    )
    article.download(mock_resource_with('japanese_article2', 'html'))
    article.parse()

    expected = mock_resource_with('japanese2', 'txt')
    self.assertEqual(expected, article.text)
    self.assertEqual(expected, fulltext(article.html, 'ja'))
def test_japanese_fulltext_extract(self):
    """Japanese full-text extraction matches the stored ``japanese`` text
    resource, both through ``Article`` and through ``fulltext``.
    """
    article = Article(
        url='https://www.nikkei.com/article/DGXMZO31897660Y8A610C1000000/?n_cid=DSTPCS001',
        language='ja',
    )
    article.download(mock_resource_with('japanese_article', 'html'))
    article.parse()

    expected = mock_resource_with('japanese', 'txt')
    self.assertEqual(expected, article.text)
    self.assertEqual(expected, fulltext(article.html, 'ja'))
def test_chinese_fulltext_extract(self):
    """Chinese full-text extraction matches the stored ``chinese`` text
    resource, both through ``Article`` and through ``fulltext``.
    """
    article = Article(
        url='http://news.sohu.com/20050601/n225789219.shtml',
        language='zh',
    )
    article.download(mock_resource_with('chinese_article', 'html'))
    article.parse()

    expected = mock_resource_with('chinese', 'txt')
    self.assertEqual(expected, article.text)
    self.assertEqual(expected, fulltext(article.html, 'zh'))
def test_download_file_failure(self):
    """Downloading a ``file://`` URL built from ``does_not_exist.html``
    leaves the HTML empty and records a failed state with an error message.
    """
    missing_path = os.path.join(HTML_FN, "does_not_exist.html")
    article = Article(url="file://" + missing_path)
    article.download()

    self.assertEqual(0, len(article.html))
    self.assertEqual(article.download_state,
                     ArticleDownloadState.FAILED_RESPONSE)
    self.assertEqual(article.download_exception_msg,
                     "No such file or directory")
def test_spanish_fulltext_extract(self):
    """Spanish full-text extraction matches the stored ``spanish`` text
    resource, both through ``Article`` and through ``fulltext``.
    """
    article_url = ('http://ultimahora.es/mallorca/noticia/noticias/local/fiscal'
                   'ia-anticorrupcion-estudia-recurre-imputacion-infanta.html')
    article = Article(url=article_url, language='es')
    article.download(mock_resource_with('spanish_article', 'html'))
    article.parse()

    expected = mock_resource_with('spanish', 'txt')
    self.assertEqual(expected, article.text)
    self.assertEqual(expected, fulltext(article.html, 'es'))
def test_meta_refresh_redirect(self):
    """With ``follow_meta_refresh`` enabled, the ``google_meta_refresh``
    fixture ends up with the title 'Example Domain'.
    """
    # TODO: We actually hit example.com in this unit test ... which is bad
    # Figure out how to mock an actual redirect
    refresh_config = Configuration()
    refresh_config.follow_meta_refresh = True

    article = Article('', config=refresh_config)
    article.download(
        input_html=mock_resource_with('google_meta_refresh', 'html'))
    article.parse()

    self.assertEqual(article.title, 'Example Domain')
def test_arabic_fulltext_extract(self):
    """Arabic full-text extraction matches the stored ``arabic`` text
    resource; no language is passed to ``Article`` and ``meta_lang`` is
    expected to be 'ar' after parsing.
    """
    article_url = ('http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/'
                   'index.html')
    article = Article(url=article_url)
    article.download(mock_resource_with('arabic_article', 'html'))
    article.parse()

    self.assertEqual('ar', article.meta_lang)

    expected = mock_resource_with('arabic', 'txt')
    self.assertEqual(expected, article.text)
    self.assertEqual(expected, fulltext(article.html, 'ar'))
def test_article_pdf_ignoring(self):
    """After ``download()``, the article's html equals the placeholder that
    was configured for the PDF content types.
    """
    # NOTE(review): download() is called without input HTML, so this
    # presumably performs a real network request -- confirm.
    empty_pdf = "%PDF-"  # empty PDF constant
    pdf_content_types = (
        "application/pdf",
        "application/x-pdf",
        "application/x-bzpdf",
        "application/x-gzpdf",
    )
    ignored_defaults = {
        content_type: empty_pdf for content_type in pdf_content_types
    }
    pdf_url = ('http://www.technik-medien.at/ePaper_Download/'
               'IoT4Industry+Business_2018-10-31_2018-03.pdf')

    a = Article(url=pdf_url, ignored_content_types_defaults=ignored_defaults)
    a.download()

    self.assertEqual(empty_pdf, a.html)
def get_news(url):
    """Download, parse and run NLP on the article at ``url``.

    The article is created with ``language="hi"``. Returns a dict with the
    keys ``date``, ``title``, ``keywords``, ``summary``, ``text``,
    ``img_url``, ``video`` and ``url``. If an ``ArticleException`` is raised
    by any of the three steps, it is logged and ``None`` is returned.
    """
    article = Article(url, language="hi")

    try:
        article.download()
        article.parse()
        article.nlp()
    except ArticleException:
        logger.exception(f"Error: Download timeout: {url}")
        return None

    data = dict(
        date=article.publish_date,
        title=article.title,
        keywords=article.keywords,
        summary=article.summary,
        text=article.text,
        img_url=article.top_image,
        video=article.movies,
        url=url,
    )
    logger.info(f"Got news:{prettify(data)}")
    return data
def test_article_pdf_fetching(self):
    """Downloading a PDF URL without custom ignore defaults does not leave
    the bare '%PDF-' placeholder as the article's html.
    """
    # NOTE(review): download() is called without input HTML, so this
    # presumably performs a real network request -- confirm.
    pdf_url = 'https://www.adobe.com/pdf/pdfs/ISO32000-1PublicPatentLicense.pdf'
    a = Article(url=pdf_url)
    a.download()

    self.assertNotEqual('%PDF-', a.html)
class ArticleTestCase(unittest.TestCase):
    """Tests for ``Article`` driven mostly by the mocked ``cnn_article``
    HTML resource.
    """

    def setup_stage(self, stage_name):
        """Run every pipeline stage that comes *before* ``stage_name``.

        Stages are executed in order (initial, download, parse, meta, nlp)
        and execution stops as soon as ``stage_name`` is reached, so the
        named stage itself is NOT run.

        :param stage_name: one of the stage keys defined below
        :raises AssertionError: if ``stage_name`` is not a known stage
        """
        stages = OrderedDict([
            ('initial', lambda: None),
            ('download', lambda: self.article.download(
                mock_resource_with('cnn_article', 'html'))),
            ('parse', lambda: self.article.parse()),
            # 'meta' has no action of its own, so stopping at 'meta' or at
            # 'nlp' runs the same preceding stages (alias for nlp).
            ('meta', lambda: None),
            ('nlp', lambda: self.article.nlp())
        ])
        # Explicit raise instead of a bare `assert`, so the check is not
        # stripped when Python runs with -O.
        if stage_name not in stages:
            raise AssertionError('Unknown stage: %r' % (stage_name,))
        for name, action in stages.items():
            if name == stage_name:
                break
            action()

    def setUp(self):
        """Create a fresh ``Article`` before each test method runs
        """
        self.article = Article(
            url='http://www.cnn.com/2013/11/27/travel/weather-'
                'thanksgiving/index.html?iref=allsearch')

    @print_test
    def test_url(self):
        """The article keeps the URL it was constructed with
        """
        self.assertEqual(
            'http://www.cnn.com/2013/11/27/travel/weather-'
            'thanksgiving/index.html?iref=allsearch',
            self.article.url)

    @print_test
    def test_download_html(self):
        """Downloading from mocked HTML succeeds and stores the HTML
        """
        # Stopping at 'download' only runs the no-op 'initial' stage; the
        # download itself is performed explicitly below.
        self.setup_stage('download')
        html = mock_resource_with('cnn_article', 'html')
        self.article.download(html)
        self.assertEqual(self.article.download_state,
                         ArticleDownloadState.SUCCESS)
        self.assertIsNone(self.article.download_exception_msg)
        self.assertEqual(75406, len(self.article.html))

    @print_test
    def test_meta_refresh_redirect(self):
        """With ``follow_meta_refresh`` enabled, the ``google_meta_refresh``
        fixture ends up with the title 'Example Domain'
        """
        # TODO: We actually hit example.com in this unit test ... which is bad
        # Figure out how to mock an actual redirect
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        html = mock_resource_with('google_meta_refresh', 'html')
        article.download(input_html=html)
        article.parse()
        self.assertEqual(article.title, 'Example Domain')

    @print_test
    def test_meta_refresh_no_url_redirect(self):
        """With ``follow_meta_refresh`` enabled, the ``ap_meta_refresh``
        fixture parses to the title 'News from The Associated Press'
        """
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        html = mock_resource_with('ap_meta_refresh', 'html')
        article.download(input_html=html)
        article.parse()
        self.assertEqual(article.title, 'News from The Associated Press')

    @print_test
    def test_pre_download_parse(self):
        """Calling `parse()` before `download()` should yield an error
        """
        article = Article(self.article.url)
        self.assertRaises(ArticleException, article.parse)

    @print_test
    def test_parse_html(self):
        """Parsing the mocked CNN article yields the expected text, authors,
        title, images and meta fields
        """
        # Runs 'initial' and 'download'; parse() is called explicitly below.
        self.setup_stage('parse')

        AUTHORS = [
            'Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey',
            'Tom Watkins'
        ]
        TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
        LEN_IMGS = 46
        META_LANG = 'en'
        META_SITE_NAME = 'CNN'

        self.article.parse()
        self.article.nlp()

        text = mock_resource_with('cnn', 'txt')
        self.assertEqual(text, self.article.text)
        self.assertEqual(text, fulltext(self.article.html))

        # NOTE: top_img extraction requires an internet connection
        # unlike the rest of this test file
        TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
                   '01-weather-1128-story-top.jpg')
        self.assertEqual(TOP_IMG, self.article.top_img)

        self.assertCountEqual(AUTHORS, self.article.authors)
        self.assertEqual(TITLE, self.article.title)
        self.assertEqual(LEN_IMGS, len(self.article.imgs))
        self.assertEqual(META_LANG, self.article.meta_lang)
        self.assertEqual(META_SITE_NAME, self.article.meta_site_name)
        self.assertEqual('2013-11-27 00:00:00',
                         str(self.article.publish_date))

    @print_test
    def test_meta_type_extraction(self):
        """The extractor reports the meta type 'article' for the parsed doc
        """
        self.setup_stage('meta')
        meta_type = self.article.extractor.get_meta_type(
            self.article.clean_doc)
        self.assertEqual('article', meta_type)

    @print_test
    def test_meta_extraction(self):
        """The extractor's meta data for the parsed doc equals the expected
        nested dictionary
        """
        self.setup_stage('meta')
        meta = self.article.extractor.get_meta_data(self.article.clean_doc)
        META_DATA = defaultdict(
            dict, {
                'medium': 'news',
                'googlebot': 'noarchive',
                'pubdate': '2013-11-27T08:36:32Z',
                'title': 'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com',
                'og': {
                    'site_name': 'CNN',
                    'description': 'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.',
                    'title': 'After storm, forecasters see smooth sailing for Thanksgiving',
                    'url': 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
                    'image': 'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg',
                    'type': 'article'
                },
                'section': 'travel',
                'author': 'Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN',
                'robots': 'index,follow',
                'vr': {
                    'canonical': 'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'
                },
                'source': 'CNN',
                'fb': {
                    'page_id': 18793419640,
                    'app_id': 80401312489
                },
                'keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm',
                'article': {
                    'publisher': 'https://www.facebook.com/cnninternational'
                },
                'lastmod': '2013-11-28T02:03:23Z',
                'twitter': {
                    'site': {
                        'identifier': '@CNNI',
                        'id': 2097571
                    },
                    'card': 'summary',
                    'creator': {
                        'identifier': '@cnntravel',
                        'id': 174377718
                    }
                },
                'viewport': 'width=1024',
                'news_keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm'
            })

        self.assertDictEqual(META_DATA, meta)

        # if the value for a meta key is another dict, that dict ought to be
        # filled with keys and values
        dict_values = [v for v in meta.values() if isinstance(v, dict)]
        self.assertTrue(all(len(d) > 0 for d in dict_values))

        # there are exactly 5 top-level "og:type" type keys
        self.assertEqual(5, len(dict_values))

        # there are exactly 12 top-level "pubdate" type keys
        string_values = [v for v in meta.values() if isinstance(v, str)]
        self.assertEqual(12, len(string_values))

    @print_test
    def test_pre_download_nlp(self):
        """Test running NLP algos before even downloading the article
        """
        self.setup_stage('initial')
        new_article = Article(self.article.url)
        self.assertRaises(ArticleException, new_article.nlp)

    @print_test
    def test_pre_parse_nlp(self):
        """Test running NLP algos before parsing the article
        """
        # Stopping at 'parse' downloads the article but does not parse it.
        self.setup_stage('parse')
        self.assertRaises(ArticleException, self.article.nlp)

    @print_test
    def test_nlp_body(self):
        """NLP on the parsed CNN article yields the expected summary and
        keywords
        """
        # Runs download and parse; nlp() is called explicitly below.
        self.setup_stage('nlp')
        self.article.nlp()
        KEYWORDS = [
            'balloons', 'delays', 'flight', 'forecasters', 'good', 'sailing',
            'smooth', 'storm', 'thanksgiving', 'travel', 'weather', 'winds',
            'york'
        ]
        SUMMARY = mock_resource_with('cnn_summary', 'txt')
        self.assertEqual(SUMMARY, self.article.summary)
        self.assertCountEqual(KEYWORDS, self.article.keywords)