def check_url(args):
    """
    Parse one mocked article and report which extraction checks failed.

    :param args: a ``(url, res_filename)`` pair; ``res_filename`` names both
        the mocked ``html`` input and the ``txt`` resource holding the
        expected full text.
    :return: a ``(pubdate_failed, fulltext_failed)`` tuple of booleans.
    """
    url, res_filename = args
    pubdate_failed = False
    fulltext_failed = False
    html = mock_resource_with(res_filename, 'html')

    try:
        article = Article(url)
        article.download(html)
        article.parse()
        if article.publish_date is None:
            pubdate_failed = True
            print(f"BAD_PUBDATE={url}")
    except Exception:
        # Any failure while building/parsing the article counts as a failure
        # of both checks; the traceback is printed rather than re-raised.
        print('<< URL: %s parse ERROR >>' % url)
        traceback.print_exc()
        return True, True

    expected_text = mock_resource_with(res_filename, 'txt')
    if article.text != expected_text:
        # `expected_text` holds the reason of failure if failure
        print('%s -- %s -- %s' % ('Fulltext failed', res_filename,
                                  expected_text.strip()))
        fulltext_failed = True

    # TODO: a hard assertion on the full text is deliberately not made here
    # because the extraction algorithm is constantly being tweaked and
    # improved; mismatches are only reported through the returned flag.
    return pubdate_failed, fulltext_failed
def test_thai_fulltext_extract():
    """Thai fixture: parsed text and ``fulltext`` output must match 'thai.txt'."""
    page = mock_resource_with('thai_article', 'html')
    doc = Article(
        url='https://prachatai.com/journal/2019/01/80642',
        language='th',
    )
    doc.download(page)
    doc.parse()

    expected = mock_resource_with('thai', 'txt')
    assert expected == doc.text
    assert expected == fulltext(doc.html, 'th')
def test_japanese_fulltext_extract2():
    """Second Japanese fixture: extracted text must equal the 'japanese2' text."""
    page = mock_resource_with('japanese_article2', 'html')
    doc = Article(
        url='http://www.afpbb.com/articles/-/3178894',
        language='ja',
    )
    doc.download(page)
    doc.parse()

    expected = mock_resource_with('japanese2', 'txt')
    assert expected == doc.text
    assert expected == fulltext(doc.html, 'ja')
def test_japanese_fulltext_extract():
    """Japanese fixture: extracted text must equal the 'japanese' text resource."""
    page = mock_resource_with('japanese_article', 'html')
    doc = Article(
        url='https://www.nikkei.com/article/DGXMZO31897660Y8A610C1000000/?n_cid=DSTPCS001',
        language='ja',
    )
    doc.download(page)
    doc.parse()

    expected = mock_resource_with('japanese', 'txt')
    assert expected == doc.text
    assert expected == fulltext(doc.html, 'ja')
def test_chinese_fulltext_extract():
    """Chinese fixture: extracted text must equal the 'chinese' text resource."""
    page = mock_resource_with('chinese_article', 'html')
    doc = Article(
        url='http://news.sohu.com/20050601/n225789219.shtml',
        language='zh',
    )
    doc.download(page)
    doc.parse()

    expected = mock_resource_with('chinese', 'txt')
    assert expected == doc.text
    assert expected == fulltext(doc.html, 'zh')
def test_spanish_fulltext_extract():
    """Spanish fixture: extracted text must equal the 'spanish' text resource."""
    source_url = (
        'http://ultimahora.es/mallorca/noticia/noticias/local/fiscal'
        'ia-anticorrupcion-estudia-recurre-imputacion-infanta.html'
    )
    page = mock_resource_with('spanish_article', 'html')
    doc = Article(url=source_url, language='es')
    doc.download(page)
    doc.parse()

    expected = mock_resource_with('spanish', 'txt')
    assert expected == doc.text
    assert expected == fulltext(doc.html, 'es')
def test_arabic_fulltext_extract():
    """Arabic fixture: no language is passed in, so ``meta_lang`` is checked too."""
    source_url = (
        'http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/'
        'index.html'
    )
    page = mock_resource_with('arabic_article', 'html')
    doc = Article(url=source_url)
    doc.download(page)
    doc.parse()

    # The article is built without a ``language`` argument; the parsed
    # metadata is expected to report Arabic.
    assert 'ar' == doc.meta_lang

    expected = mock_resource_with('arabic', 'txt')
    assert expected == doc.text
    assert expected == fulltext(doc.html, 'ar')
def test_parse_html(self):
    """Parse the staged article and compare each extracted field to known values."""
    self.setup_stage('parse')
    self.article.parse()
    self.article.nlp()

    expected_text = mock_resource_with('cnn', 'txt')
    # Show the complete diff if the long text comparison fails.
    self.maxDiff = None
    self.assertEqual(expected_text.strip(), self.article.text)
    self.assertEqual(expected_text, fulltext(self.article.html))

    # NOTE: top_img extraction requires an internet connection
    # unlike the rest of this test file
    expected_top_img = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
                        '01-weather-1128-story-top.jpg')
    self.assertEqual(expected_top_img, self.article.top_img)

    expected_authors = [
        'Chien-Ming Wang',
        'Dana A. Ford',
        'James S.A. Corey',
        'Tom Watkins',
    ]
    self.assertCountEqual(expected_authors, self.article.authors)
    self.assertEqual(
        'After storm, forecasters see smooth sailing for Thanksgiving',
        self.article.title)
    self.assertEqual(46, len(self.article.imgs))
    self.assertEqual('en', self.article.meta_lang)
    self.assertEqual('CNN', self.article.meta_site_name)
    self.assertEqual('2013-11-27', str(self.article.publish_date))
def test_download_html(self):
    """Download the mocked 'cnn_article' HTML and check the article's state.

    Verifies that the workflow records the ``DOWNLOADED`` marker, that no
    download exception message was stored, and that the stored HTML has the
    expected length.
    """
    self.setup_stage('download')
    html = mock_resource_with('cnn_article', 'html')
    self.article.download(html)
    # assertIn / assertIsNone report the offending values on failure,
    # unlike assertTrue(x in y) / assertEqual(x, None).
    self.assertIn(DOWNLOADED, self.article.workflow)
    self.assertIsNone(self.article.download_exception_msg)
    # Expected length of the HTML held by the article after download.
    self.assertEqual(75406, len(self.article.html))
def test_meta_refresh_no_url_redirect(self):
    """With meta-refresh following on, the 'ap_meta_refresh' page keeps its title."""
    cfg = Configuration()
    cfg.follow_meta_refresh = True

    page = mock_resource_with('ap_meta_refresh', 'html')
    doc = Article('', config=cfg)
    doc.download(input_html=page)
    doc.parse()

    self.assertEqual(doc.title, 'News from The Associated Press')
def test_meta_refresh_redirect(self):
    """With meta-refresh following on, the redirect target's title is parsed."""
    # TODO: We actually hit example.com in this unit test ... which is bad
    # Figure out how to mock an actual redirect
    cfg = Configuration()
    cfg.follow_meta_refresh = True

    page = mock_resource_with('google_meta_refresh', 'html')
    doc = Article('', config=cfg)
    doc.download(input_html=page)
    doc.parse()

    self.assertEqual(doc.title, 'Example Domain')
def setup_stage(self, stage_name):
    """Run every pipeline stage that comes before ``stage_name``.

    The stages are ordered; each stage listed ahead of ``stage_name`` has its
    action executed in turn, and ``stage_name`` itself is not executed.

    :param stage_name: one of 'initial', 'download', 'parse', 'meta', 'nlp'.
    :raises AssertionError: if ``stage_name`` is not a known stage.
    """
    def _noop():
        return None

    def _download():
        return self.article.download(mock_resource_with('cnn_article', 'html'))

    def _parse():
        return self.article.parse()

    def _nlp():
        return self.article.nlp()

    pipeline = OrderedDict([
        ('initial', _noop),
        ('download', _download),
        ('parse', _parse),
        ('meta', _noop),  # Alias for nlp
        ('nlp', _nlp),
    ])
    assert stage_name in pipeline

    for current, run in pipeline.items():
        if current == stage_name:
            # Reached the requested stage: stop without running it.
            return
        run()
def test_nlp_body(self):
    """Run NLP on the staged article and check its keywords and summary."""
    self.setup_stage('nlp')
    self.article.nlp()

    expected_keywords = (
        'storm weather new york flight balloons '
        'roads delays parade people winds snow'
    ).split()
    # Order-insensitive comparison of the extracted keywords.
    self.assertCountEqual(expected_keywords, self.article.keywords)

    expected_summary = mock_resource_with('cnn_summary', 'txt')
    self.assertEqual(expected_summary, self.article.summary)