def get_article(article_link, summary_length=5):
    '''Extract an article and summarize it.'''
    article = Article(article_link)
    # Assumes newspaper3k, whose Configuration exposes MAX_SUMMARY_SENT:
    # it caps the number of sentences nlp() puts into article.summary,
    # so the previously unused summary_length parameter now takes effect.
    article.config.MAX_SUMMARY_SENT = summary_length
    article.build()
    article.nlp()  # populate article.summary and article.keywords
    return article
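# Usage sketch for get_article() above. The URL is illustrative and the
# snippet assumes newspaper3k is installed and the network is reachable.
if __name__ == '__main__':
    art = get_article('http://www.bbc.com/news/world-europe-35828810',
                      summary_length=3)
    print(art.title)
    print(art.summary)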
def test_chinese_fulltext_extract(self):
    url = 'http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml'
    article = Article(url=url, language='zh')
    article.build()
    # assert isinstance(article.stopwords_class, StopWordsChinese)
    with codecs.open(os.path.join(TEXT_FN, 'chinese_text_1.txt'), 'r', 'utf8') as f:
        assert article.text == f.read()
from urllib.request import urlopen

from bs4 import BeautifulSoup as soup


class NewsSearch:
    def __init__(self, searchterm):
        self.searchterm = searchterm
        self.texts = []
        self.links = []
        self.date = []
        self.art = []
        self.GoogleSearch()

    def GoogleSearch(self):
        self.url = ("https://news.google.com/rss/search?q=" + self.searchterm
                    + "&hl=en-US&gl=US&ceid=US%3Aen")
        Client = urlopen(self.url)
        xml_page = Client.read()
        Client.close()
        soup_page = soup(xml_page, "xml")
        news_list = soup_page.findAll("item")
        # Collect each item's title, url and publish date
        for news in news_list:
            self.texts.append(news.title.text)
            self.links.append(news.link.text)
            self.date.append(news.pubDate.text)
        for link in self.links:
            self.article = Article(link)
            try:
                self.article.build()
                self.article.nlp()
            except Exception:
                # Keep the (possibly unbuilt) article so self.art stays
                # aligned with self.links.
                pass
            finally:
                self.art.append(self.article)
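# Usage sketch for the NewsSearch class above. The search term is
# illustrative; assumes bs4, lxml and newspaper3k are installed.
search = NewsSearch("climate change")
for title, link in zip(search.texts, search.links):
    print(title, '->', link)
print(len(search.art), "articles built")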
def test_chinese_fulltext_extract(self):
    url = 'http://news.sohu.com/20050601/n225789219.shtml'
    mock_response_with(url, 'chinese_article')
    article = Article(url=url, language='zh')
    article.build()
    with codecs.open(os.path.join(TEXT_FN, 'chinese.txt'), 'r', 'utf8') as f:
        assert article.text == f.read()
def get_article_from_url(url):
    new_article = NewsItem(url=url)
    # build() downloads and parses itself, so a separate download()
    # call would fetch the page twice.
    new_article.build()
    new_article.nlp()
    # print(new_article.__dict__)
    # print(new_article)
    return new_article
def test_arabic_fulltext_extract(self):
    url = 'http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/index.html'
    article = Article(url=url)
    article.build()
    assert article.meta_lang == 'ar'
    # assert isinstance(article.stopwords_class, StopWordsArabic)
    with codecs.open(os.path.join(TEXT_FN, 'arabic_text_1.txt'), 'r', 'utf8') as f:
        assert article.text == f.read()
def test():
    url = 'http://www.bbc.com/news/world-europe-35828810'
    # url = 'http://www.bbc.com/hindi/sport/2016/02/160227_heart_change_for_kohli_fan_dil'
    try:
        a = Article(url)
        a.build()
        process_and_save_article(a, 'bbc')
    except Exception as e:
        # A bare except would also swallow KeyboardInterrupt; report
        # the error instead of hiding it.
        print("error detected:", e)
def test_spanish_fulltext_extract(self):
    url = ('http://ultimahora.es/mallorca/noticia/noticias/local/fiscal'
           'ia-anticorrupcion-estudia-recurre-imputacion-infanta.html')
    mock_response_with(url, 'spanish_article')
    article = Article(url=url, language='es')
    article.build()
    with codecs.open(os.path.join(TEXT_FN, 'spanish.txt'), 'r', 'utf8') as f:
        assert article.text == f.read()
def test_arabic_fulltext_extract(self):
    url = ('http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/'
           'index.html')
    mock_response_with(url, 'arabic_article')
    article = Article(url=url)
    article.build()
    assert article.meta_lang == 'ar'
    with codecs.open(os.path.join(TEXT_FN, 'arabic.txt'), 'r', 'utf8') as f:
        assert article.text == f.read()
import sys


def _retrive_content(url):
    article = Article(url)
    success = False
    try:
        article.build()
        success = True
    except ArticleException as e:
        # The original wrote to an undefined name `sterr`; write the
        # exception text to stderr instead.
        sys.stderr.write(str(e))
    # Returning here (not in a finally block) lets unexpected
    # exceptions propagate instead of being silently swallowed.
    return article, success
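# Usage sketch for _retrive_content() above (hypothetical URL): the
# helper always returns an (article, success) pair, so callers should
# check the flag before trusting the article's fields.
article, ok = _retrive_content('http://www.bbc.com/news/world-europe-35828810')
if ok:
    print(article.title)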
def article_list(self):
    for link in self.links:
        art = Article(link)
        try:
            art.build()
            art.nlp()
        except Exception:
            # Append the article even if build/nlp failed so that
            # self.art_list stays aligned with self.links.
            pass
        finally:
            self.art_list.append(art)
def test():
    # url = 'http://money.cnn.com/2016/02/26/investing/warren-buffett-berkshire-hathaway-annual-shareholder-letter/index.html?section=money_topstories'
    # url = 'http://www.bbc.com/hindi/sport/2016/02/160227_heart_change_for_kohli_fan_dil'
    url = 'http://www.bbc.com/news/world-europe-35828810'
    # url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    a = Article(url)
    a.build()
    loc = get_news_location(a, num_of_location=3)
    print(loc)
    try:
        print(detect(a.text))
    except lang_detect_exception.LangDetectException:
        print("Not English")
def test_save_article_function():
    from newspaper import Article
    # Equivalent to datetime.fromtimestamp(time.time())
    today = datetime.datetime.now()
    url = 'http://www.bbc.com/news/world-europe-35828810'
    # url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    a = Article(url)
    a.build()
    # print(a.title, a.publish_date)
    # If the article has no publish_date, fall back to today
    if a.publish_date is None:
        a.publish_date = today
    path_to_save = get_path_to_save(a)
    data_a = get_serialized_article_obj(a)
    create_file(path_to_save, data=data_a)
def get_actual_url(x):
    global count
    count += 1
    print(count)
    try:
        a = Article(x)
        a.build()
    except Exception:
        return x
    else:
        # meta_data is a defaultdict, so a missing 'url' key yields a
        # falsy empty dict rather than a KeyError.
        if a.meta_data['url']:
            return a.meta_data['url']
        else:
            try:
                return a.meta_data['og']['url']
            except KeyError:
                return x
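# Usage sketch for get_actual_url() above (hypothetical URL). The
# function relies on a module-level counter, so define it first.
count = 0
print(get_actual_url('http://www.bbc.com/news/world-europe-35828810'))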
def build_articles(self, links):
    '''Create the document, skipping links that previously failed.'''
    with open(self.name, "w") as document:
        with open(self.bad_links, 'a+') as bad_links:
            # 'a+' opens positioned at the end of the file; rewind so
            # the known bad links can actually be read.
            bad_links.seek(0)
            known_bad = set(line.strip() for line in bad_links)
            # Iterate over a copy so removing from `links` is safe.
            for link in list(links):
                if link in known_bad:
                    continue
                try:
                    article = Article(link)
                    article.build()
                    self.content(article, document)
                    self.success += 1
                except Exception:
                    self.error += 1
                    links.remove(link)
                    bad_links.write(link + '\n')
    print(self.count)
    print(self.x)
    return links
def getArticle(url, company, sec_code):
    try:
        print(company, sec_code)
        article = Article(url)
        # build() already downloads and parses, so the separate
        # download()/parse() calls in the original were redundant.
        article.build()
        article.nlp()
        ans = {}
        hsh = hashlib.md5(article.title.encode()).hexdigest()
        ans['_id'] = str(hsh)
        ans['title'] = str(article.title)
        ans['summary'] = str(article.summary).replace('\n', '')
        if article.publish_date is None:
            ans['publish_date'] = str(datetime.now().date())
            ans['publish_time'] = str(datetime.now().time())
        else:
            ans['publish_date'] = str(article.publish_date.date())
            ans['publish_time'] = str(article.publish_date.time())
        ans['authors'] = article.authors
        ans['source'] = str(article.source_url)
        ans['company'] = company
        ans['Security'] = sec_code
        ans['category'] = 'news'
        ans['keywords'] = article.keywords
        sd = []
        st = []
        try:
            matches = datefinder.find_dates(article.summary)
            for match in set(matches):
                sd.append(str(match.date()))
                st.append(str(match.time()))
        except Exception:
            pass
        ans['story_dates'] = sd
        ans['story_time'] = st
        news.append(ans)
        insert_into_db(ans)
    except Exception as e:
        # Report failures instead of printing "Success" unconditionally,
        # as the original did.
        print("Failed -", url, e)
        return
    print("Success -", url)
def newspaper_parser(self, sleep_time=0):
    print('running newspaper_parser()...')
    results = []
    count = 0
    # print(self.links)
    for l in self.links:
        article = Article(url=l)
        try:
            article.build()
            print(article.summary)
        except Exception:
            # Back off for a minute before moving to the next link.
            time.sleep(60)
            continue
        data = {
            'title': article.title,
            'date_published': article.publish_date,
            'news_outlet': self.newspaper,
            'authors': article.authors,
            'feature_img': article.top_image,
            'article_link': article.canonical_link,
            'keywords': article.keywords,
            'movies': article.movies,
            'summary': article.summary,
            'text': article.text,
            'html': article.html
        }
        print(data['title'])
        # print(data['date_published'])
        # print(data['text'])
        print("")
        results.append(data)
        count += 1
        # print(count)
        time.sleep(sleep_time)
    return results
def test():
    # url = 'http://money.cnn.com/2016/02/26/investing/warren-buffett-berkshire-hathaway-annual-shareholder-letter/index.html?section=money_topstories'
    # url = 'http://www.bbc.com/hindi/sport/2016/02/160227_heart_change_for_kohli_fan_dil'
    # url = 'http://www.bbc.com/news/world-europe-35828810'
    # url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    url = 'http://www.nytimes.com/2016/03/19/world/europe/dubai-airliner-crashes-while-trying-to-land-at-russian-airport.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news&_r=1'
    print("building:", url)
    a = Article(url)
    a.build()
    process_and_save_article(a)
    print("first paragraph")
    print(a.text.split('\n')[0])
    print("Summary:")
    print(a.summary)
    try:
        print(detect(a.text))
    except lang_detect_exception.LangDetectException:
        print("Not English")
def build_news_article_from_url(source_url, sNLP):
    """Build a news article object from a source url.
    Returns None if the build fails.
    """
    try:
        print('starting to scrape url:', source_url)
        # Pre-process the news with the newspaper3k and boilerpipe libraries.
        article = Article(source_url, keep_article_html=True)
        article.build()
        article.nlp()
        # Renamed from `e` so it no longer shadows the exception below.
        extractor = Extractor(extractor='DefaultExtractor', html=article.html)
        article.text = extractor.getText()
        article.article_html = extractor.getHTML()
        news_article = NewsArticle(article, sNLP)
        print('successfully scraped url:', source_url)
        return news_article
    except Exception as e:
        print('failed to scrape url:', source_url)
        print('reason:', e)
        return None
def aggregate():
    ArticleRec.objects.filter(
        article_published__lte=datetime.datetime.today() -
        datetime.timedelta(days=7)).delete()
    # random.shuffle() works in place and returns None, so shuffle a
    # list copy of the queryset before iterating over it.
    feeds = list(FeedRec.objects.all())
    shuffle(feeds)
    for f in feeds:
        u = f.feed_url
        print(u)
        article_list = grab_rss(f)
        x = 0
        for a in article_list:
            x += 1
            print("Checking article: " + str(x))
            article = Article(url=a.url)
            try:
                article.build()
            except (ArticleException, UnicodeDecodeError, ValueError):
                print("Error: ArticleException")
                continue
            a.content = parser.parse(article.text)['text']
            print(len(a.content))
            if len(a.content) < 50:
                print("Error: Too short")
                continue
            a.tag = clf.predict([article.text])[0]
            width, height = get_image_size(article.top_image)
            if width > 100 or height > 100:
                a.img = article.top_image
            add_article(a)
def test():
    # url = 'http://money.cnn.com/2016/02/26/investing/warren-buffett-berkshire-hathaway-annual-shareholder-letter/index.html?section=money_topstories'
    # url = 'http://www.bbc.com/hindi/sport/2016/02/160227_heart_change_for_kohli_fan_dil'
    # url = 'http://www.bbc.com/news/world-europe-35828810'
    # url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    urls = []
    # urls.append('')
    urls.append('http://www.bbc.com/news/world-australia-35800175')
    # urls.append('http://edition.cnn.com/2016/03/21/politics/bernie-sanders-wins-democrats-abroad/index.html')
    # urls.append('http://www.huffingtonpost.com/jonathan-greenberg/three-reasons-bernie-sand_b_9538508.html')
    # urls.append('http://ewn.co.za/2016/03/25/Nigeria-targets-300-army-officers-and-firms-in-widening-corruption-probe')
    for url in urls:
        print("building:", url)
        a = Article(url)
        a.build()
        process_and_save_article(a)
        try:
            print(detect(a.text))
        except lang_detect_exception.LangDetectException:
            print("Not English")
def link_parser(link):
    parsed_uri = urlparse(link)
    source = '{uri.netloc}'.format(uri=parsed_uri)
    domain = extract(link).domain
    article = Article(link)
    article.build()
    article.nlp()  # ensure keywords and summary are populated
    try:
        full_text = article.text
    except Exception:
        full_text = None
    image = article.top_image
    keywords = article.keywords
    summary = article.summary
    title = article.title
    try:
        published_at = extractArticlePublishedDate(link)
    except Exception as e:
        published_at = None
        print(e)
        print("\n\n\n")
    try:
        language = article.meta_lang
    except Exception:
        language = None
    try:
        author = article.authors
    except Exception:
        author = None
    """
    places = get_location.get_place_context(text=description)
    location = {
        "countries": places.countries,
        "country_mentions": places.country_mentions,
        "cities": places.cities,
        "city_mentions": places.city_mentions
    }
    """
    if image != "" and full_text != "" and title != "":
        dic = {
            'url': link,
            'im': image,
            'title': title,
            'domain': domain,
            'full_text': full_text,
            'summary': summary,
            'keywords': keywords,
            'source': source,
            'published_at': published_at,
            'language': language,
            'author': author
        }
        print('done')
        return dic
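# Usage sketch for link_parser() above (hypothetical URL). A dict is
# returned only when an image, text and title were all extracted.
info = link_parser('http://www.bbc.com/news/world-europe-35828810')
if info:
    print(info['title'], info['published_at'])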
class ArticleTestCase(unittest.TestCase):
    def runTest(self):
        self.test_url()
        self.test_download_html()
        self.test_pre_download_parse()
        self.test_parse_html()
        self.test_meta_type_extraction()
        self.test_meta_extraction()
        self.test_pre_parse_nlp()
        self.test_nlp_body()

    def setUp(self):
        """Called before the first test case of this unit begins
        """
        self.article = Article(
            url='http://www.cnn.com/2013/11/27/travel/weather-'
                'thanksgiving/index.html?iref=allsearch')

    def tearDown(self):
        """Called after all test cases of this unit finish
        """
        pass

    @print_test
    def test_url(self):
        assert self.article.url == (
            u'http://www.cnn.com/2013/11/27/travel/weather-'
            'thanksgiving/index.html')

    @print_test
    @responses.activate
    def test_download_html(self):
        mock_response_with(self.article.url, 'cnn_article')
        self.article.download()
        assert len(self.article.html) == 75244

    @print_test
    def test_pre_download_parse(self):
        """You should not parse an article before downloading it!
        """
        article = Article(self.article.url)

        def failfunc():
            article.parse()
        self.assertRaises(ArticleException, failfunc)

    @print_test
    @responses.activate
    def test_parse_html(self):
        TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
                   '01-weather-1128-story-top.jpg')
        DOMAIN = 'www.cnn.com'
        SCHEME = 'http'
        AUTHORS = ['Dana Ford', 'Tom Watkins']
        TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
        LEN_IMGS = 46
        META_LANG = 'en'
        mock_response_with(self.article.url, 'cnn_article')
        self.article.build()
        with open(os.path.join(TEXT_FN, 'cnn.txt'), 'r') as f:
            assert self.article.text == f.read()
        assert self.article.top_img == TOP_IMG
        assert self.article.authors == AUTHORS
        assert self.article.title == TITLE
        assert len(self.article.imgs) == LEN_IMGS
        assert self.article.meta_lang == META_LANG

    @print_test
    @responses.activate
    def test_meta_type_extraction(self):
        mock_response_with(self.article.url, 'cnn_article')
        self.article.build()
        meta_type = self.article.extractor.get_meta_type(
            self.article.clean_doc)
        assert 'article' == meta_type

    @print_test
    @responses.activate
    def test_meta_extraction(self):
        mock_response_with(self.article.url, 'cnn_article')
        self.article.build()
        meta = self.article.extractor.get_meta_data(self.article.clean_doc)
        META_DATA = defaultdict(dict, {
            'medium': 'news',
            'googlebot': 'noarchive',
            'pubdate': '2013-11-27T08:36:32Z',
            'title': 'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com',
            'og': {
                'site_name': 'CNN',
                'description': 'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.',
                'title': 'After storm, forecasters see smooth sailing for Thanksgiving',
                'url': 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
                'image': 'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg',
                'type': 'article'
            },
            'section': 'travel',
            'author': 'Dana Ford and Tom Watkins, CNN',
            'robots': 'index,follow',
            'vr': {
                'canonical': 'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'
            },
            'source': 'CNN',
            'fb': {'page_id': 18793419640, 'app_id': 80401312489},
            'keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm',
            'article': {'publisher': 'https://www.facebook.com/cnninternational'},
            'lastmod': '2013-11-28T02:03:23Z',
            'twitter': {
                'site': {'identifier': '@CNNI', 'id': 2097571},
                'card': 'summary',
                'creator': {'identifier': '@cnntravel', 'id': 174377718}
            },
            'viewport': 'width=1024',
            'news_keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm'
        })
        assert meta == META_DATA
        # If the value for a meta key is another dict, that dict ought to
        # be filled with keys and values.
        dict_values = [v for v in meta.values() if isinstance(v, dict)]
        assert all(len(d) > 0 for d in dict_values)
        # There are exactly 5 top-level dict-valued keys (like "og").
        assert len(dict_values) == 5
        # There are exactly 12 top-level string-valued keys (like
        # "pubdate"). types.StringTypes is Python 2 only; str covers it
        # on Python 3.
        string_values = [v for v in meta.values() if isinstance(v, str)]
        assert len(string_values) == 12

    @print_test
    @responses.activate
    def test_pre_download_nlp(self):
        """Test running NLP algos before even downloading the article
        """
        mock_response_with(self.article.url, 'cnn_article')

        def failfunc():
            self.article.nlp()
        self.assertRaises(ArticleException, failfunc)

    @print_test
    def test_pre_parse_nlp(self):
        """Test running NLP algos before parsing the article
        """
        article = Article(self.article.url)
        article.download()

        def failfunc():
            article.nlp()
        self.assertRaises(ArticleException, failfunc)

    @print_test
    @responses.activate
    def test_nlp_body(self):
        SUMMARY = """Wish the forecasters were wrong all the time :)"Though the worst of the storm has passed, winds could still pose a problem.\r\nForecasters see mostly smooth sailing into Thanksgiving.\r\nThe forecast has left up in the air the fate of the balloons in Macy's Thanksgiving Day Parade.\r\nThe storm caused some complications and inconveniences, but no major delays or breakdowns.\r\n"That's good news for people like Latasha Abney, who joined the more than 43 million Americans expected by AAA to travel over the Thanksgiving holiday weekend."""
        KEYWORDS = [
            u'great', u'good', u'flight', u'sailing', u'delays', u'smooth',
            u'thanksgiving', u'snow', u'weather', u'york', u'storm',
            u'winds', u'balloons', u'forecasters'
        ]
        mock_response_with(self.article.url, 'cnn_article')
        self.article.build()
        self.article.nlp()
        # print(self.article.summary)
        # print(self.article.keywords)
        assert self.article.summary == SUMMARY
        assert self.article.keywords == KEYWORDS
# Enter the URL to crawl
# url = 'http://v.media.daum.net/v/20170604205121164'
# url = "https://m.blog.naver.com/heerok93/221076782232"
# url = "https://newspaper.readthedocs.io/en/latest/user_guide/quickstart.html"
url = "http://sports.news.naver.com/wfootball/schedule/index.nhn"

# ===========================================
# The article is in Korean, so set language='ko'
a = Article(url, language='ko')
a.download()
a.parse()

# Get the article title
print(a.title)

# Get the article text (150 characters)
# print(a.text)

with open('newspp.html', 'w') as f:
    # text = fulltext(a.html)
    f.write(a.html)

a.build()

with open('newspp_1.html', 'w') as f:
    # text = fulltext(a.html)
    f.write(a.html)

print(a.publish_date)
print(a.images)

# ===========================================
# nb = newspaper.build(url)
#
# for article in nb.category_urls():
#     print(article)
# ===========================================
def newspaper_parser(self, newspaper, links, topic, sleep_time=2):
    print("[System]: newspaper_parser activated")
    results = []
    count = 0
    # links = ['https://www.ynetnews.com/article/H1zKfsc9L']
    for l in links:
        article = Article(url=l)
        try:
            article.build()
        except Exception as e:
            print("Error:", e)
            time.sleep(10)
            continue
        # Guard against articles with no detectable publish date, which
        # would otherwise crash the strftime() call below.
        if article.publish_date is None:
            print("[System]: no publish date, skipping")
            continue
        date = article.publish_date.strftime("%d/%m/%Y")
        if self.dateStart:
            if self.check_dates(date):
                print("[System]: date is ok")
            else:
                print("[System]: date is out of range")
                continue
        if newspaper == "n12":
            authors = self.findN12Authors(l)
        else:
            authors = article.authors
        data = {
            'title': article.title,
            'genre': topic,
            'date_published': date,
            'news_outlet': newspaper,
            'authors': authors,
            'feature_img': article.top_image,
            'link': article.canonical_link,
            'keywords': article.keywords,
            'summary': article.summary,
            'text': article.text
            # 'movies': (article.movies).tolist(),
            # 'html': article.html
        }
        print("title:", data['title'])
        if count < 1:  # print the first article in full
            print("----------------------------- Article data -----------------------------")
            print("date_published:", data['date_published'])
            print("genre:", data['genre'])
            print("authors:", data['authors'])
            print("link:", data['link'])
            print("keywords:", data['keywords'])
            print("summary:", data['summary'])
            print("-------------------------------------------------------------------------")
            print("text:", data['text'])
            print("-------------------------------------------------------------------------")
        if data['text']:
            self.add_article(newspaper, data)
        count += 1
        print(count)
        time.sleep(sleep_time)
    return results
class ArticleTestCase(unittest.TestCase):
    def runTest(self):
        print('testing article unit')
        self.test_url()
        self.test_download_html()
        self.test_pre_download_parse()
        self.test_parse_html()
        self.test_meta_type_extraction()
        self.test_meta_extraction()
        self.test_pre_parse_nlp()
        self.test_nlp_body()

    def setUp(self):
        """Called before the first test case of this unit begins"""
        self.article = Article(
            url='http://www.cnn.com/2013/11/27/travel/weather-'
                'thanksgiving/index.html?iref=allsearch')

    def tearDown(self):
        """Called after all test cases of this unit finish"""
        pass

    @print_test
    def test_url(self):
        assert self.article.url == (
            u'http://www.cnn.com/2013/11/27/travel/weather-'
            'thanksgiving/index.html')

    @print_test
    @responses.activate
    def test_download_html(self):
        mock_response_with(self.article.url, 'cnn_article')
        self.article.download()
        assert len(self.article.html) == 75244

    @print_test
    def test_pre_download_parse(self):
        """You should not parse an article before downloading it!
        """
        article = Article(self.article.url)

        def failfunc():
            article.parse()
        self.assertRaises(ArticleException, failfunc)

    @print_test
    @responses.activate
    def test_parse_html(self):
        TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
                   '01-weather-1128-story-top.jpg')
        DOMAIN = 'www.cnn.com'
        SCHEME = 'http'
        AUTHORS = ['Dana Ford', 'Tom Watkins']
        TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
        LEN_IMGS = 46
        META_LANG = 'en'
        mock_response_with(self.article.url, 'cnn_article')
        self.article.build()
        with open(os.path.join(TEST_DIR, 'data/cnn.txt'), 'r') as f:
            assert self.article.text == f.read()
        assert self.article.top_img == TOP_IMG
        assert self.article.authors == AUTHORS
        assert self.article.title == TITLE
        assert len(self.article.imgs) == LEN_IMGS
        assert self.article.meta_lang == META_LANG

    @print_test
    @responses.activate
    def test_meta_type_extraction(self):
        mock_response_with(self.article.url, 'cnn_article')
        self.article.build()
        meta_type = self.article.extractor.get_meta_type(
            self.article.clean_doc)
        assert 'article' == meta_type

    @print_test
    @responses.activate
    def test_meta_extraction(self):
        mock_response_with(self.article.url, 'cnn_article')
        self.article.build()
        meta = self.article.extractor.get_meta_data(self.article.clean_doc)
        META_DATA = defaultdict(dict, {
            'medium': 'news',
            'googlebot': 'noarchive',
            'pubdate': '2013-11-27T08:36:32Z',
            'title': 'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com',
            'og': {
                'site_name': 'CNN',
                'description': 'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.',
                'title': 'After storm, forecasters see smooth sailing for Thanksgiving',
                'url': 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
                'image': 'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg',
                'type': 'article'
            },
            'section': 'travel',
            'author': 'Dana Ford and Tom Watkins, CNN',
            'robots': 'index,follow',
            'vr': {
                'canonical': 'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'
            },
            'source': 'CNN',
            'fb': {'page_id': 18793419640, 'app_id': 80401312489},
            'keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm',
            'article': {'publisher': 'https://www.facebook.com/cnninternational'},
            'lastmod': '2013-11-28T02:03:23Z',
            'twitter': {
                'site': {'identifier': '@CNNI', 'id': 2097571},
                'card': 'summary',
                'creator': {'identifier': '@cnntravel', 'id': 174377718}
            },
            'viewport': 'width=1024',
            'news_keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm'
        })
        assert meta == META_DATA
        # If the value for a meta key is another dict, that dict ought to
        # be filled with keys and values.
        dict_values = [v for v in meta.values() if isinstance(v, dict)]
        assert all(len(d) > 0 for d in dict_values)
        # There are exactly 5 top-level dict-valued keys (like "og").
        assert len(dict_values) == 5
        # There are exactly 12 top-level string-valued keys (like
        # "pubdate"). types.StringTypes is Python 2 only; str covers it
        # on Python 3.
        string_values = [v for v in meta.values() if isinstance(v, str)]
        assert len(string_values) == 12

    @print_test
    @responses.activate
    def test_pre_download_nlp(self):
        """Test running NLP algos before even downloading the article"""
        mock_response_with(self.article.url, 'cnn_article')

        def failfunc():
            self.article.nlp()
        self.assertRaises(ArticleException, failfunc)

    @print_test
    def test_pre_parse_nlp(self):
        """Test running NLP algos before parsing the article"""
        article = Article(self.article.url)
        article.download()

        def failfunc():
            article.nlp()
        self.assertRaises(ArticleException, failfunc)

    @print_test
    @responses.activate
    def test_nlp_body(self):
        SUMMARY = """Wish the forecasters were wrong all the time :)"Though the worst of the storm has passed, winds could still pose a problem.\r\nForecasters see mostly smooth sailing into Thanksgiving.\r\nThe forecast has left up in the air the fate of the balloons in Macy's Thanksgiving Day Parade.\r\nThe storm caused some complications and inconveniences, but no major delays or breakdowns.\r\n"That's good news for people like Latasha Abney, who joined the more than 43 million Americans expected by AAA to travel over the Thanksgiving holiday weekend."""
        KEYWORDS = [
            u'great', u'good', u'flight', u'sailing', u'delays', u'smooth',
            u'thanksgiving', u'snow', u'weather', u'york', u'storm',
            u'winds', u'balloons', u'forecasters']
        mock_response_with(self.article.url, 'cnn_article')
        self.article.build()
        self.article.nlp()
        # print(self.article.summary)
        # print(self.article.keywords)
        assert self.article.summary == SUMMARY
        assert self.article.keywords == KEYWORDS