Python Article.nlp 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: newspaper_wrapper

클래스/타입: Article

메소드/함수: nlp

hotexamples.com에 등록된 예제 수: 2

Python Article.nlp - 2개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python newspaper_wrapper.Article.nlp의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제의 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

Article(19)

download(15)

parse(11)

nlp(2)

자주 사용되는 메소드들

Article (19)

download (15)

parse (11)

nlp (2)

예제 #1

파일 보기

def get_news(url):
    """Download, parse and run NLP on the article at ``url``.

    The article is created with ``language="hi"``. On success the collected
    fields are logged and returned as a dict with the keys ``date``,
    ``title``, ``keywords``, ``summary``, ``text``, ``img_url``, ``video``
    and ``url``. If any of the three pipeline steps raises
    ``ArticleException`` the exception is logged and ``None`` is returned.
    """
    story = Article(url, language="hi")
    try:
        story.download()
        story.parse()
        story.nlp()
    except ArticleException:
        logger.exception(f"Error: Download timeout: {url}")
        return None
    else:
        news = dict(
            date=story.publish_date,
            title=story.title,
            keywords=story.keywords,
            summary=story.summary,
            text=story.text,
            img_url=story.top_image,
            video=story.movies,
            url=url,
        )
        logger.info(f"Got news:{prettify(news)}")
        return news

예제 #2

파일 보기

파일: unit_tests.py 프로젝트: pykancha/newspaper3k_wrapper

class ArticleTestCase(unittest.TestCase):
    def setup_stage(self, stage_name):
        """Bring ``self.article`` to the point just before ``stage_name``.

        Every stage listed ahead of ``stage_name`` is executed in order. The
        named stage itself is NOT run, which leaves the calling test free to
        perform that step itself. Fails with ``AssertionError`` when
        ``stage_name`` is not a known stage.
        """
        pipeline = OrderedDict((
            ('initial', lambda: None),
            ('download', lambda: self.article.download(
                mock_resource_with('cnn_article', 'html'))),
            ('parse', lambda: self.article.parse()),
            # No-op stage: reaching 'meta' runs the same real steps that
            # reaching 'nlp' does (download and parse).
            ('meta', lambda: None),
            ('nlp', lambda: self.article.nlp()),
        ))
        assert stage_name in pipeline

        ordered_names = list(pipeline)
        # Only the stages strictly before the requested one are executed.
        for earlier in ordered_names[:ordered_names.index(stage_name)]:
            pipeline[earlier]()

    def setUp(self):
        """Create the ``Article`` under test and store it as ``self.article``.

        The article points at the CNN Thanksgiving weather story URL; nothing
        is downloaded or parsed here.
        """
        cnn_url = ('http://www.cnn.com/2013/11/27/travel/weather-'
                   'thanksgiving/index.html?iref=allsearch')
        self.article = Article(url=cnn_url)

    @print_test
    def test_url(self):
        """The article exposes the URL it was constructed with."""
        expected_url = ('http://www.cnn.com/2013/11/27/travel/weather-'
                        'thanksgiving/index.html?iref=allsearch')
        self.assertEqual(expected_url, self.article.url)

    @print_test
    def test_download_html(self):
        """Download from the mocked CNN HTML and check the resulting state.

        Verifies the download state, that no exception message was recorded,
        and the length of the stored HTML.
        """
        self.setup_stage('download')
        cnn_html = mock_resource_with('cnn_article', 'html')
        self.article.download(cnn_html)

        downloaded = self.article
        self.assertEqual(downloaded.download_state,
                         ArticleDownloadState.SUCCESS)
        self.assertEqual(downloaded.download_exception_msg, None)
        self.assertEqual(75406, len(downloaded.html))

    @print_test
    def test_meta_refresh_redirect(self):
        """With ``follow_meta_refresh`` enabled, the parsed title is that of
        the redirect target ('Example Domain').
        """
        # TODO: We actually hit example.com in this unit test ... which is bad
        # Figure out how to mock an actual redirect
        cfg = Configuration()
        cfg.follow_meta_refresh = True
        redirecting = Article('', config=cfg)

        refresh_page = mock_resource_with('google_meta_refresh', 'html')
        redirecting.download(input_html=refresh_page)
        redirecting.parse()

        self.assertEqual(redirecting.title, 'Example Domain')

    @print_test
    def test_meta_refresh_no_url_redirect(self):
        """With ``follow_meta_refresh`` enabled, the 'ap_meta_refresh' fixture
        still parses to the Associated Press title.
        """
        cfg = Configuration()
        cfg.follow_meta_refresh = True
        ap_article = Article('', config=cfg)

        refresh_page = mock_resource_with('ap_meta_refresh', 'html')
        ap_article.download(input_html=refresh_page)
        ap_article.parse()

        self.assertEqual(ap_article.title, 'News from The Associated Press')

    @print_test
    def test_pre_download_parse(self):
        """Calling `parse()` on an article that was never downloaded must
        raise `ArticleException`.
        """
        never_downloaded = Article(self.article.url)
        with self.assertRaises(ArticleException):
            never_downloaded.parse()

    @print_test
    def test_parse_html(self):
        """Parse the mocked CNN article and verify the extracted fields.

        Checks body text (also through `fulltext`), top image, authors,
        title, image count, meta language, meta site name and publish date.
        """
        self.setup_stage('parse')

        expected_authors = [
            'Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey',
            'Tom Watkins'
        ]
        expected_title = (
            'After storm, forecasters see smooth sailing for Thanksgiving')
        expected_img_count = 46
        expected_lang = 'en'
        expected_site_name = 'CNN'
        # NOTE: top_img extraction requires an internet connection
        # unlike the rest of this test file
        expected_top_img = (
            'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
            '01-weather-1128-story-top.jpg')

        self.article.parse()
        self.article.nlp()
        parsed = self.article

        body = mock_resource_with('cnn', 'txt')
        self.assertEqual(body, parsed.text)
        self.assertEqual(body, fulltext(parsed.html))

        self.assertEqual(expected_top_img, parsed.top_img)

        self.assertCountEqual(expected_authors, parsed.authors)
        self.assertEqual(expected_title, parsed.title)
        self.assertEqual(expected_img_count, len(parsed.imgs))
        self.assertEqual(expected_lang, parsed.meta_lang)
        self.assertEqual(expected_site_name, parsed.meta_site_name)
        self.assertEqual('2013-11-27 00:00:00', str(parsed.publish_date))

    @print_test
    def test_meta_type_extraction(self):
        """The extractor reports the meta type of the cleaned document as
        'article'.
        """
        self.setup_stage('meta')
        extractor = self.article.extractor
        found_type = extractor.get_meta_type(self.article.clean_doc)
        self.assertEqual('article', found_type)

    @print_test
    def test_meta_extraction(self):
        """The extractor's meta data for the cleaned document matches the
        expected mapping exactly, including its nested groups.
        """
        self.setup_stage('meta')
        meta = self.article.extractor.get_meta_data(self.article.clean_doc)

        # 'keywords' and 'news_keywords' carry the same value.
        storm_keywords = 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm'
        expected_fields = {
            'medium': 'news',
            'googlebot': 'noarchive',
            'pubdate': '2013-11-27T08:36:32Z',
            'title': 'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com',
            'og': {
                'site_name': 'CNN',
                'description': 'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.',
                'title': 'After storm, forecasters see smooth sailing for Thanksgiving',
                'url': 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
                'image': 'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg',
                'type': 'article',
            },
            'section': 'travel',
            'author': 'Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN',
            'robots': 'index,follow',
            'vr': {
                'canonical': 'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
            },
            'source': 'CNN',
            'fb': {'page_id': 18793419640, 'app_id': 80401312489},
            'keywords': storm_keywords,
            'article': {
                'publisher': 'https://www.facebook.com/cnninternational',
            },
            'lastmod': '2013-11-28T02:03:23Z',
            'twitter': {
                'site': {'identifier': '@CNNI', 'id': 2097571},
                'card': 'summary',
                'creator': {'identifier': '@cnntravel', 'id': 174377718},
            },
            'viewport': 'width=1024',
            'news_keywords': storm_keywords,
        }
        expected_meta = defaultdict(dict, expected_fields)

        self.assertDictEqual(expected_meta, meta)

        # if the value for a meta key is another dict, that dict ought to be
        # filled with keys and values
        nested_groups = [
            value for value in meta.values() if isinstance(value, dict)
        ]
        self.assertTrue(all(len(group) > 0 for group in nested_groups))

        # exactly 5 top-level keys hold a nested dict (the kind of key that
        # "og:type" is grouped under)
        self.assertEqual(5, len(nested_groups))

        # exactly 12 top-level keys hold a plain string (like "pubdate")
        plain_strings = [
            value for value in meta.values() if isinstance(value, str)
        ]
        self.assertEqual(12, len(plain_strings))

    @print_test
    def test_pre_download_nlp(self):
        """Running `nlp()` on an article that was never downloaded must
        raise `ArticleException`.
        """
        self.setup_stage('initial')
        never_downloaded = Article(self.article.url)
        with self.assertRaises(ArticleException):
            never_downloaded.nlp()

    @print_test
    def test_pre_parse_nlp(self):
        """Running `nlp()` on a downloaded but not yet parsed article must
        raise `ArticleException`.
        """
        # setup_stage stops just before 'parse', so only the download ran.
        self.setup_stage('parse')
        with self.assertRaises(ArticleException):
            self.article.nlp()

    @print_test
    def test_nlp_body(self):
        """After `nlp()`, the summary matches the 'cnn_summary' fixture and
        the keywords match the expected set (order-insensitive).
        """
        self.setup_stage('nlp')
        self.article.nlp()

        expected_keywords = [
            'balloons', 'delays', 'flight', 'forecasters', 'good', 'sailing',
            'smooth', 'storm', 'thanksgiving', 'travel', 'weather', 'winds',
            'york'
        ]
        expected_summary = mock_resource_with('cnn_summary', 'txt')

        processed = self.article
        self.assertEqual(expected_summary, processed.summary)
        self.assertCountEqual(expected_keywords, processed.keywords)