Exemplo n.º 1
0
 def test_download_file_success(self):
     """Download a local fixture through a ``file://`` URL and verify it.

     Checks that the download state is ``SUCCESS``, that no exception
     message was recorded, and that the loaded HTML has the expected length.
     """
     fixture_path = os.path.join(HTML_FN, "cnn_article.html")
     article = Article(url="file://" + fixture_path)
     article.download()
     self.assertEqual(article.download_state, ArticleDownloadState.SUCCESS)
     # Identity check: the message must be exactly None on success.
     self.assertIsNone(article.download_exception_msg)
     # 75406 is the character count of the stored cnn_article.html fixture.
     self.assertEqual(75406, len(article.html))
Exemplo n.º 2
0
 def check_url(args):
     """Run publish-date and full-text extraction for one mocked article.

     :param args: a ``(url, res_filename)`` pair; ``res_filename`` names the
         mocked ``html`` resource and its matching ``txt`` reference text
     :return: ``(pubdate_failed, fulltext_failed)`` as booleans
     """
     url, res_filename = args
     html = mock_resource_with(res_filename, 'html')
     try:
         article = Article(url)
         article.download(html)
         article.parse()
         pubdate_failed = article.publish_date is None
     except Exception:
         # Any failure while downloading/parsing counts as both checks failed.
         print('<< URL: %s parse ERROR >>' % url)
         traceback.print_exc()
         return True, True

     expected_text = mock_resource_with(res_filename, 'txt')
     fulltext_failed = article.text != expected_text
     if fulltext_failed:
         # print('Diff: ', simplediff.diff(correct_text, a.text))
         # The reference text holds the reason of failure if failure
         print('%s -- %s -- %s' %
               ('Fulltext failed', res_filename, expected_text.strip()))
         # TODO: assert statements are commented out for full-text
         # extraction tests because we are constantly tweaking the
         # algorithm and improving
         # assert a.text == correct_text
     return pubdate_failed, fulltext_failed
Exemplo n.º 3
0
 def test_meta_refresh_no_url_redirect(self):
     """Parse the ``ap_meta_refresh`` fixture with meta-refresh following on.

     Only the resulting title is asserted; the fixture name suggests a
     meta-refresh tag without a target URL (not verified here).
     """
     cfg = Configuration()
     cfg.follow_meta_refresh = True
     article = Article('', config=cfg)
     article.download(
         input_html=mock_resource_with('ap_meta_refresh', 'html'))
     article.parse()
     self.assertEqual(article.title, 'News from The Associated Press')
Exemplo n.º 4
0
 def test_thai_fulltext_extract(self):
     """Compare Thai text extraction with the stored reference text.

     Both ``Article.text`` and the standalone ``fulltext`` helper must
     reproduce the ``thai.txt`` resource exactly.
     """
     article = Article(url='https://prachatai.com/journal/2019/01/80642',
                       language='th')
     # Feed the mocked HTML instead of fetching the URL.
     article.download(mock_resource_with('thai_article', 'html'))
     article.parse()
     expected = mock_resource_with('thai', 'txt')
     self.assertEqual(expected, article.text)
     self.assertEqual(expected, fulltext(article.html, 'th'))
Exemplo n.º 5
0
 def test_japanese_fulltext_extract2(self):
     """Compare Japanese text extraction (second fixture) with its reference.

     Both ``Article.text`` and the standalone ``fulltext`` helper must
     reproduce the ``japanese2.txt`` resource exactly.
     """
     article = Article(url='http://www.afpbb.com/articles/-/3178894',
                       language='ja')
     # Feed the mocked HTML instead of fetching the URL.
     article.download(mock_resource_with('japanese_article2', 'html'))
     article.parse()
     expected = mock_resource_with('japanese2', 'txt')
     self.assertEqual(expected, article.text)
     self.assertEqual(expected, fulltext(article.html, 'ja'))
Exemplo n.º 6
0
 def test_japanese_fulltext_extract(self):
     """Compare Japanese text extraction with the stored reference text.

     Both ``Article.text`` and the standalone ``fulltext`` helper must
     reproduce the ``japanese.txt`` resource exactly.
     """
     article_url = ('https://www.nikkei.com/article/'
                    'DGXMZO31897660Y8A610C1000000/?n_cid=DSTPCS001')
     article = Article(url=article_url, language='ja')
     # Feed the mocked HTML instead of fetching the URL.
     article.download(mock_resource_with('japanese_article', 'html'))
     article.parse()
     expected = mock_resource_with('japanese', 'txt')
     self.assertEqual(expected, article.text)
     self.assertEqual(expected, fulltext(article.html, 'ja'))
Exemplo n.º 7
0
 def test_chinese_fulltext_extract(self):
     """Compare Chinese text extraction with the stored reference text.

     Both ``Article.text`` and the standalone ``fulltext`` helper must
     reproduce the ``chinese.txt`` resource exactly.
     """
     article = Article(url='http://news.sohu.com/20050601/n225789219.shtml',
                       language='zh')
     # Feed the mocked HTML instead of fetching the URL.
     article.download(mock_resource_with('chinese_article', 'html'))
     article.parse()
     expected = mock_resource_with('chinese', 'txt')
     self.assertEqual(expected, article.text)
     self.assertEqual(expected, fulltext(article.html, 'zh'))
Exemplo n.º 8
0
 def test_download_file_failure(self):
     """Downloading a non-existent ``file://`` target records a failure.

     The article must end up with empty HTML, the ``FAILED_RESPONSE``
     state, and the expected error message.
     """
     missing_path = os.path.join(HTML_FN, "does_not_exist.html")
     article = Article(url="file://" + missing_path)
     article.download()
     self.assertEqual(0, len(article.html))
     self.assertEqual(
         article.download_state, ArticleDownloadState.FAILED_RESPONSE)
     self.assertEqual(
         article.download_exception_msg, "No such file or directory")
Exemplo n.º 9
0
 def test_spanish_fulltext_extract(self):
     """Compare Spanish text extraction with the stored reference text.

     Both ``Article.text`` and the standalone ``fulltext`` helper must
     reproduce the ``spanish.txt`` resource exactly.
     """
     article_url = (
         'http://ultimahora.es/mallorca/noticia/noticias/local/fiscal'
         'ia-anticorrupcion-estudia-recurre-imputacion-infanta.html')
     article = Article(url=article_url, language='es')
     # Feed the mocked HTML instead of fetching the URL.
     article.download(mock_resource_with('spanish_article', 'html'))
     article.parse()
     expected = mock_resource_with('spanish', 'txt')
     self.assertEqual(expected, article.text)
     self.assertEqual(expected, fulltext(article.html, 'es'))
Exemplo n.º 10
0
 def test_meta_refresh_redirect(self):
     """Parse the ``google_meta_refresh`` fixture with refresh following on.

     The expected title is that of the redirect target, not of the fixture.
     """
     # TODO: We actually hit example.com in this unit test ... which is bad
     # Figure out how to mock an actual redirect
     cfg = Configuration()
     cfg.follow_meta_refresh = True
     article = Article('', config=cfg)
     article.download(
         input_html=mock_resource_with('google_meta_refresh', 'html'))
     article.parse()
     self.assertEqual(article.title, 'Example Domain')
Exemplo n.º 11
0
 def test_arabic_fulltext_extract(self):
     """Compare Arabic text extraction with the stored reference text.

     No language is passed to ``Article``; the test first checks that
     ``meta_lang`` comes out as ``'ar'``, then that both ``Article.text``
     and ``fulltext`` reproduce the ``arabic.txt`` resource.
     """
     article_url = (
         'http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/'
         'index.html')
     article = Article(url=article_url)
     # Feed the mocked HTML instead of fetching the URL.
     article.download(mock_resource_with('arabic_article', 'html'))
     article.parse()
     self.assertEqual('ar', article.meta_lang)
     expected = mock_resource_with('arabic', 'txt')
     self.assertEqual(expected, article.text)
     self.assertEqual(expected, fulltext(article.html, 'ar'))
Exemplo n.º 12
0
 def test_article_pdf_ignoring(self):
     """A PDF URL must yield the configured placeholder as ``html``.

     Every PDF content type is mapped to the same placeholder through
     ``ignored_content_types_defaults``.
     NOTE(review): ``download()`` is called without mocked HTML, so this
     presumably performs a real HTTP request - confirm.
     """
     empty_pdf = "%PDF-"  # empty PDF constant
     pdf_content_types = (
         "application/pdf",
         "application/x-pdf",
         "application/x-bzpdf",
         "application/x-gzpdf",
     )
     pdf_url = ('http://www.technik-medien.at/ePaper_Download/'
                'IoT4Industry+Business_2018-10-31_2018-03.pdf')
     placeholders = {ctype: empty_pdf for ctype in pdf_content_types}
     a = Article(url=pdf_url, ignored_content_types_defaults=placeholders)
     a.download()
     self.assertEqual(empty_pdf, a.html)
Exemplo n.º 13
0
def get_news(url):
    """Fetch an article with ``language="hi"`` and return its extracted fields.

    The article is downloaded, parsed and run through ``nlp()``.

    :param url: address of the article to fetch
    :return: dict with ``date``, ``title``, ``keywords``, ``summary``,
        ``text``, ``img_url``, ``video`` and ``url`` keys, or ``None`` when
        an ``ArticleException`` was raised (the error is logged)
    """
    article = Article(url, language="hi")
    try:
        article.download()
        article.parse()
        article.nlp()
    except ArticleException:
        # NOTE: the message says "timeout", but any ArticleException raised
        # by download(), parse() or nlp() ends up here.
        logger.exception(f"Error: Download timeout: {url}")
        return None

    data = dict(
        date=article.publish_date,
        title=article.title,
        keywords=article.keywords,
        summary=article.summary,
        text=article.text,
        img_url=article.top_image,
        video=article.movies,
        url=url,
    )
    logger.info(f"Got news:{prettify(data)}")
    return data
Exemplo n.º 14
0
 def test_article_pdf_fetching(self):
     """Without ignore defaults, a PDF download must not be the placeholder.

     NOTE(review): ``download()`` is called without mocked HTML, so this
     presumably performs a real HTTP request - confirm.
     """
     pdf_url = (
         'https://www.adobe.com/pdf/pdfs/ISO32000-1PublicPatentLicense.pdf')
     a = Article(url=pdf_url)
     a.download()
     self.assertNotEqual('%PDF-', a.html)
Exemplo n.º 15
0
class ArticleTestCase(unittest.TestCase):
    """Stage-by-stage tests of ``Article`` against the mocked CNN fixture."""

    def setup_stage(self, stage_name):
        """Run every pipeline stage that precedes ``stage_name``.

        The stages are ordered (initial, download, parse, meta, nlp). All
        actions *before* ``stage_name`` are executed on ``self.article``;
        the action of ``stage_name`` itself is deliberately not run, so the
        calling test can exercise it.

        :param stage_name: one of the stage keys listed below
        """
        stages = OrderedDict([
            ('initial', lambda: None),
            ('download', lambda: self.article.download(
                mock_resource_with('cnn_article', 'html'))),
            ('parse', lambda: self.article.parse()),
            ('meta', lambda: None),  # Alias for nlp
            ('nlp', lambda: self.article.nlp())
        ])
        assert stage_name in stages
        for name, action in stages.items():
            if name == stage_name:
                break
            action()

    def setUp(self):
        """Create a fresh ``Article`` for the CNN URL before each test method.
        """
        self.article = Article(
            url='http://www.cnn.com/2013/11/27/travel/weather-'
            'thanksgiving/index.html?iref=allsearch')

    @print_test
    def test_url(self):
        """The article keeps the URL it was constructed with."""
        self.assertEqual(
            'http://www.cnn.com/2013/11/27/travel/weather-'
            'thanksgiving/index.html?iref=allsearch', self.article.url)

    @print_test
    def test_download_html(self):
        """Feeding mocked HTML to ``download`` marks the article as fetched."""
        self.setup_stage('download')
        html = mock_resource_with('cnn_article', 'html')
        self.article.download(html)
        self.assertEqual(self.article.download_state,
                         ArticleDownloadState.SUCCESS)
        # Identity check: the message must be exactly None on success.
        self.assertIsNone(self.article.download_exception_msg)
        self.assertEqual(75406, len(self.article.html))

    @print_test
    def test_meta_refresh_redirect(self):
        """Following a meta refresh yields the title of the redirect target."""
        # TODO: We actually hit example.com in this unit test ... which is bad
        # Figure out how to mock an actual redirect
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        html = mock_resource_with('google_meta_refresh', 'html')
        article.download(input_html=html)
        article.parse()
        self.assertEqual(article.title, 'Example Domain')

    @print_test
    def test_meta_refresh_no_url_redirect(self):
        """Parse the ``ap_meta_refresh`` fixture with refresh following on."""
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        html = mock_resource_with('ap_meta_refresh', 'html')
        article.download(input_html=html)
        article.parse()
        self.assertEqual(article.title, 'News from The Associated Press')

    @print_test
    def test_pre_download_parse(self):
        """Calling `parse()` before `download()` should yield an error
        """
        article = Article(self.article.url)
        self.assertRaises(ArticleException, article.parse)

    @print_test
    def test_parse_html(self):
        """Parse the CNN fixture and verify text, image, authors and metadata.
        """
        self.setup_stage('parse')

        AUTHORS = [
            'Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey',
            'Tom Watkins'
        ]
        TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
        LEN_IMGS = 46
        META_LANG = 'en'
        META_SITE_NAME = 'CNN'

        self.article.parse()
        self.article.nlp()

        text = mock_resource_with('cnn', 'txt')
        self.assertEqual(text, self.article.text)
        self.assertEqual(text, fulltext(self.article.html))

        # NOTE: top_img extraction requires an internet connection
        # unlike the rest of this test file
        TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
                   '01-weather-1128-story-top.jpg')
        self.assertEqual(TOP_IMG, self.article.top_img)

        # Author order is not significant, hence assertCountEqual.
        self.assertCountEqual(AUTHORS, self.article.authors)
        self.assertEqual(TITLE, self.article.title)
        self.assertEqual(LEN_IMGS, len(self.article.imgs))
        self.assertEqual(META_LANG, self.article.meta_lang)
        self.assertEqual(META_SITE_NAME, self.article.meta_site_name)
        self.assertEqual('2013-11-27 00:00:00', str(self.article.publish_date))

    @print_test
    def test_meta_type_extraction(self):
        """The extractor reports the fixture's meta type as ``'article'``."""
        self.setup_stage('meta')
        meta_type = self.article.extractor.get_meta_type(
            self.article.clean_doc)
        self.assertEqual('article', meta_type)

    @print_test
    def test_meta_extraction(self):
        """The extractor returns the full expected meta-data mapping."""
        self.setup_stage('meta')
        meta = self.article.extractor.get_meta_data(self.article.clean_doc)
        META_DATA = defaultdict(dict, {
            'medium': 'news',
            'googlebot': 'noarchive',
            'pubdate': '2013-11-27T08:36:32Z',
            'title':
            'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com',
            'og': {
                'site_name': 'CNN',
                'description':
                'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.',
                'title':
                'After storm, forecasters see smooth sailing for Thanksgiving',
                'url':
                'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
                'image':
                'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg',
                'type': 'article'
            },
            'section': 'travel',
            'author':
            'Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN',
            'robots': 'index,follow',
            'vr': {
                'canonical':
                'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'
            },
            'source': 'CNN',
            'fb': {
                'page_id': 18793419640,
                'app_id': 80401312489
            },
            'keywords':
            'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm',
            'article': {
                'publisher': 'https://www.facebook.com/cnninternational'
            },
            'lastmod': '2013-11-28T02:03:23Z',
            'twitter': {
                'site': {
                    'identifier': '@CNNI',
                    'id': 2097571
                },
                'card': 'summary',
                'creator': {
                    'identifier': '@cnntravel',
                    'id': 174377718
                }
            },
            'viewport': 'width=1024',
            'news_keywords':
            'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm'
        })

        self.assertDictEqual(META_DATA, meta)

        # if the value for a meta key is another dict, that dict ought to be
        # filled with keys and values
        nested_values = [v for v in meta.values() if isinstance(v, dict)]
        self.assertTrue(all(len(d) > 0 for d in nested_values))

        # exactly 5 top-level keys hold a nested dict (namespaced tags in the
        # style of "og:type")
        self.assertEqual(5, len(nested_values))

        # exactly 12 top-level keys hold a plain string (tags in the style of
        # "pubdate")
        string_values = [v for v in meta.values() if isinstance(v, str)]
        self.assertEqual(12, len(string_values))

    @print_test
    def test_pre_download_nlp(self):
        """Test running NLP algos before even downloading the article
        """
        self.setup_stage('initial')
        new_article = Article(self.article.url)
        self.assertRaises(ArticleException, new_article.nlp)

    @print_test
    def test_pre_parse_nlp(self):
        """Test running NLP algos before parsing the article
        """
        # Stops before the 'parse' action: downloaded but not yet parsed.
        self.setup_stage('parse')
        self.assertRaises(ArticleException, self.article.nlp)

    @print_test
    def test_nlp_body(self):
        """``nlp()`` produces the expected summary and keyword set."""
        self.setup_stage('nlp')
        self.article.nlp()
        KEYWORDS = [
            'balloons', 'delays', 'flight', 'forecasters', 'good', 'sailing',
            'smooth', 'storm', 'thanksgiving', 'travel', 'weather', 'winds',
            'york'
        ]
        SUMMARY = mock_resource_with('cnn_summary', 'txt')
        self.assertEqual(SUMMARY, self.article.summary)
        # Keyword order is not significant, hence assertCountEqual.
        self.assertCountEqual(KEYWORDS, self.article.keywords)