Пример #1
0
    def test_parse_html(self):
        # Check the text and metadata extracted from self.article, after the
        # 'parse' stage has been set up, against the values expected for the
        # 'cnn' resource.
        self.setup_stage('parse')
        self.article.parse()
        self.article.nlp()

        # The body text must equal the text returned by
        # mock_resource_with('cnn', 'txt'), both on the parsed article and
        # through the standalone fulltext() helper.
        expected_text = mock_resource_with('cnn', 'txt')
        self.assertEqual(expected_text, self.article.text)
        self.assertEqual(expected_text, fulltext(self.article.html))

        # NOTE: unlike the rest of this test file, top_img extraction
        # requires an internet connection.
        expected_top_img = (
            'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
            '01-weather-1128-story-top.jpg'
        )
        self.assertEqual(expected_top_img, self.article.top_img)

        # assertCountEqual: the authors are compared without regard to order.
        expected_authors = [
            'Chien-Ming Wang',
            'Dana A. Ford',
            'James S.A. Corey',
            'Tom Watkins',
        ]
        self.assertCountEqual(expected_authors, self.article.authors)

        expected_title = (
            'After storm, forecasters see smooth sailing for Thanksgiving'
        )
        self.assertEqual(expected_title, self.article.title)

        expected_img_count = 46
        self.assertEqual(expected_img_count, len(self.article.imgs))

        expected_lang = 'en'
        self.assertEqual(expected_lang, self.article.meta_lang)

        expected_site_name = 'CNN'
        self.assertEqual(expected_site_name, self.article.meta_site_name)

        # publish_date is compared through its str() form.
        self.assertEqual('2013-11-27 00:00:00', str(self.article.publish_date))
Пример #2
0
 def test_thai_fulltext_extract(self):
     # Thai article: the text produced by Article.parse() and by fulltext()
     # must both equal the text from mock_resource_with('thai', 'txt').
     article_url = 'https://prachatai.com/journal/2019/01/80642'
     thai_article = Article(url=article_url, language='th')
     # The HTML is handed straight to download() (presumably so nothing is
     # fetched over the network -- confirm against Article.download).
     thai_article.download(mock_resource_with('thai_article', 'html'))
     thai_article.parse()

     expected = mock_resource_with('thai', 'txt')
     self.assertEqual(expected, thai_article.text)
     self.assertEqual(expected, fulltext(thai_article.html, 'th'))
Пример #3
0
 def test_japanese_fulltext_extract2(self):
     # Second Japanese article: the text produced by Article.parse() and by
     # fulltext() must both equal the text from
     # mock_resource_with('japanese2', 'txt').
     article_url = 'http://www.afpbb.com/articles/-/3178894'
     ja_article = Article(url=article_url, language='ja')
     # The HTML is handed straight to download() (presumably so nothing is
     # fetched over the network -- confirm against Article.download).
     ja_article.download(mock_resource_with('japanese_article2', 'html'))
     ja_article.parse()

     expected = mock_resource_with('japanese2', 'txt')
     self.assertEqual(expected, ja_article.text)
     self.assertEqual(expected, fulltext(ja_article.html, 'ja'))
Пример #4
0
 def test_japanese_fulltext_extract(self):
     # Japanese article: the text produced by Article.parse() and by
     # fulltext() must both equal the text from
     # mock_resource_with('japanese', 'txt').
     article_url = (
         'https://www.nikkei.com/article/'
         'DGXMZO31897660Y8A610C1000000/?n_cid=DSTPCS001'
     )
     ja_article = Article(url=article_url, language='ja')
     # The HTML is handed straight to download() (presumably so nothing is
     # fetched over the network -- confirm against Article.download).
     ja_article.download(mock_resource_with('japanese_article', 'html'))
     ja_article.parse()

     expected = mock_resource_with('japanese', 'txt')
     self.assertEqual(expected, ja_article.text)
     self.assertEqual(expected, fulltext(ja_article.html, 'ja'))
Пример #5
0
 def test_chinese_fulltext_extract(self):
     # Chinese article: the text produced by Article.parse() and by
     # fulltext() must both equal the text from
     # mock_resource_with('chinese', 'txt').
     article_url = 'http://news.sohu.com/20050601/n225789219.shtml'
     zh_article = Article(url=article_url, language='zh')
     # The HTML is handed straight to download() (presumably so nothing is
     # fetched over the network -- confirm against Article.download).
     zh_article.download(mock_resource_with('chinese_article', 'html'))
     zh_article.parse()

     expected = mock_resource_with('chinese', 'txt')
     self.assertEqual(expected, zh_article.text)
     self.assertEqual(expected, fulltext(zh_article.html, 'zh'))
Пример #6
0
 def test_spanish_fulltext_extract(self):
     # Spanish article: the text produced by Article.parse() and by
     # fulltext() must both equal the text from
     # mock_resource_with('spanish', 'txt').
     article_url = (
         'http://ultimahora.es/mallorca/noticia/noticias/local/fiscal'
         'ia-anticorrupcion-estudia-recurre-imputacion-infanta.html'
     )
     es_article = Article(url=article_url, language='es')
     # The HTML is handed straight to download() (presumably so nothing is
     # fetched over the network -- confirm against Article.download).
     es_article.download(mock_resource_with('spanish_article', 'html'))
     es_article.parse()

     expected = mock_resource_with('spanish', 'txt')
     self.assertEqual(expected, es_article.text)
     self.assertEqual(expected, fulltext(es_article.html, 'es'))
Пример #7
0
 def test_arabic_fulltext_extract(self):
     # Arabic article: no language is passed to Article here; after parsing,
     # meta_lang must be 'ar' and the extracted text must equal the text
     # from mock_resource_with('arabic', 'txt').
     article_url = (
         'http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/'
         'index.html'
     )
     ar_article = Article(url=article_url)
     # The HTML is handed straight to download() (presumably so nothing is
     # fetched over the network -- confirm against Article.download).
     ar_article.download(mock_resource_with('arabic_article', 'html'))
     ar_article.parse()

     # The language check runs before the expected text is loaded.
     self.assertEqual('ar', ar_article.meta_lang)

     expected = mock_resource_with('arabic', 'txt')
     self.assertEqual(expected, ar_article.text)
     self.assertEqual(expected, fulltext(ar_article.html, 'ar'))