def parse_reuters(self):
    try:
        self.newsdata = self.session.get(
            self.url).json()["wireitems"][0]["templates"][0]["story"]
        self.article = newspaper.fulltext(self.newsdata["body"],
                                          language=self.language)
    except Exception as e:
        print(e)
        return []
    try:
        self.caption = self.newsdata["images"][0]["caption"]
        self.picture = self.newsdata["images"][0]["url"] + "&w=200.0"
        self.resize = False
    except:
        pass
    try:
        location = self.newsdata["dateline"]
        if " (Reuters)" in location:
            self.location = location.split(" (Reuters)")[0].split("/")[0]
        elif "[" in self.article and "]" in self.article:
            self.location = self.article.split("[")[1].split("]")[0].split(
                "日 ロイター")[0][:-1]
        if self.location == "":
            self.location = None
    except:
        pass
def search(self, topic, max=1, c=100):
    """
    Takes the input topic and collects the next 100 news articles based on
    the max parameter. Collects article text and author.
    """
    # Searches for headlines and gets article data
    json = self.api.get_everything(q=topic, language='en',
                                   sort_by='relevancy', page=max, page_size=c)
    articles = json['articles']
    news = list()
    for article in articles:
        # Takes the URL of the article and downloads the full content
        a = Article(article['url'])
        a.download()
        # If the download failed, continue to the next article
        try:
            # Takes the whole article's text and adds it to the output
            text = fulltext(a.html)
            current = dict()
            current['text'] = text
            current['author'] = article['author']
            news.append(current)
        except Exception as e:
            pass
    return news, max + 1
def getarticle(readfile):
    '''get the article and save it in a different file'''
    try:
        fileopen = open(readfile)
    except IOError:
        print("file " + readfile + " not in the location specified")
        return
    i = 1
    for line in fileopen:
        try:
            # Use a random User-Agent so requests are less likely to be blocked
            ua = generate_user_agent()
            headers = {'User-Agent': ua}
            print("reading article :")
            print(line)
            html = requests.get(line, headers=headers).text
            tex = fulltext(html)
            # j is assumed to be defined elsewhere (not shown in this snippet)
            writefile = "201604" + str(j) + "_" + str(i) + ".txt"
            with io.open(writefile, encoding='utf-8', mode='w+') as ns:
                strng = ' '.join(tex.split())
                ns.write(strng)
            i = i + 1
        except Exception:
            pass
def alt_extract_info(tab, driver, url):
    # Reuse the Selenium driver's cookies for a plain requests session
    cookies = driver.get_cookies()
    s = requests.Session()
    for cookie in cookies:
        s.cookies.set(cookie['name'], cookie['value'])
    article = s.get(url)
    text = fulltext(article.text)
    return text
def clean_article(content):
    """Converts html text into article text"""
    result = ''
    try:
        result = fulltext(content)
    finally:
        # Catch-all to ensure all broken html is discarded
        return result
def parse_reuters(self):
    try:
        self.newsdata = self.session.get(
            self.url).json()["wireitems"][0]["templates"][0]["story"]
    except Exception as e:
        print(e)
        return []
    try:
        self.article = newspaper.fulltext(self.newsdata["body"],
                                          language=self.language)
        self.caption = self.newsdata["images"][0]["caption"]
        self.picture = self.newsdata["images"][0]["url"] + "&w=200.0"
        self.resize = False
    except Exception as e:
        print(e)
    try:
        location = self.newsdata["dateline"]
        if location != "(Reuters)":
            self.location = location.split(" (Reuters)")[0].split("/")[0]
        elif "\uff3b" in self.article and "\u3000" in self.article:
            # Japanese datelines: the location sits between a fullwidth
            # bracket (\uff3b) and an ideographic space (\u3000)
            self.location = self.article.split("\uff3b")[1].split(
                "\u3000")[0]
        print(self.location)
    except Exception as e:
        print(e)
def normalize_text(html):
    try:
        url_re = re.compile(r"https{0,1}://[^\s]+")
        url2_re = re.compile(r"[a-z0-9\.]+\.[a-z0-9\.]+/[^\s]*")
        space_re = re.compile(r"[\s]{2,}")
        text = newspaper.fulltext(html)
        # Drop non-ascii characters and line breaks
        text = text.encode("ascii", errors="ignore").decode()
        text = text.replace("\r", " ").replace("\n", " ")
        # Tokenize and detokenize to normalize spacing around punctuation
        # (sacremoses-style API assumed: detokenize() returns a string)
        t, d = MosesTokenizer(), MosesDetokenizer()
        tokens = t.tokenize(text)
        text = d.detokenize(tokens)
        # Removing URLs
        text = url_re.sub(" ", text)
        text = url2_re.sub(" ", text)
        # Removing multiple spacing characters
        text = space_re.sub(" ", text)
        text = preProcess(text)
        # Stripping leading and trailing spaces
        text = text.strip()
        return text
    except Exception as e:
        return ""
def run_newspaper(htmlstring):
    '''try with the newspaper module'''
    try:
        text = fulltext(htmlstring)  # sanitize(fulltext(htmlstring))
    except AttributeError:
        return ''
    return text
def fetch_bookmarks(urls):
    obj = Scrape_Filter()
    bookmark_data = dict()
    # start_scrapy() is assumed to populate the module-level `fetched`
    # mapping of url -> html
    start_scrapy(urls)
    for url in fetched:
        req = fetched[url]
        soup = BeautifulSoup(req, 'html5lib')
        obj.check_lang(soup)
        try:
            text = fulltext(req)
        except:
            # Fall back to a full Article download if fulltext() fails
            article = Article(url)
            article.download()
            article.parse()
            text = article.text
        title = obj.get_title(soup)
        desc_keywords = obj.get_keywords_and_description(soup)
        content = obj.filter_text(text)
        bookmark_data[url] = dict()
        bookmark_data[url]["title"] = title
        bookmark_data[url]["desc"] = desc_keywords
        bookmark_data[url]["content"] = content
    return bookmark_data
def test_parse_html(self):
    self.setup_stage('parse')
    AUTHORS = [
        'Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey', 'Tom Watkins'
    ]
    TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
    LEN_IMGS = 46
    META_LANG = 'en'
    META_SITE_NAME = 'CNN'
    self.article.parse()
    self.article.nlp()
    text = mock_resource_with('cnn', 'txt')
    self.assertEqual(text, self.article.text)
    self.assertEqual(text, fulltext(self.article.html))
    # NOTE: top_img extraction requires an internet connection
    # unlike the rest of this test file
    TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
               '01-weather-1128-story-top.jpg')
    self.assertEqual(TOP_IMG, self.article.top_img)
    self.assertCountEqual(AUTHORS, self.article.authors)
    self.assertEqual(TITLE, self.article.title)
    self.assertEqual(LEN_IMGS, len(self.article.imgs))
    self.assertEqual(META_LANG, self.article.meta_lang)
    self.assertEqual(META_SITE_NAME, self.article.meta_site_name)
    self.assertEqual('2013-11-27 00:00:00', str(self.article.publish_date))
def test_parse_html(self):
    self.setup_stage('parse')
    AUTHORS = ['Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey',
               'Tom Watkins']
    TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
    LEN_IMGS = 46
    META_LANG = 'en'
    self.article.parse()
    self.article.nlp()
    text = mock_resource_with('cnn', 'txt')
    self.assertEqual(text, self.article.text)
    self.assertEqual(text, fulltext(self.article.html))
    # NOTE: top_img extraction requires an internet connection
    # unlike the rest of this test file
    TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
               '01-weather-1128-story-top.jpg')
    self.assertEqual(TOP_IMG, self.article.top_img)
    self.assertCountEqual(AUTHORS, self.article.authors)
    self.assertEqual(TITLE, self.article.title)
    self.assertEqual(LEN_IMGS, len(self.article.imgs))
    self.assertEqual(META_LANG, self.article.meta_lang)
    self.assertEqual('2013-11-27 00:00:00', str(self.article.publish_date))
def extract(html, language='ru'):
    try:
        text = fulltext(html=html, language=language)
    except:
        text = ''
    return text
def summarize(url):
    from newspaper import fulltext
    import requests
    # Summarizer is assumed to come from the bert-extractive-summarizer
    # package (from summarizer import Summarizer); the import was missing
    from summarizer import Summarizer

    text = fulltext(requests.get(url).text)
    model = Summarizer()
    result = model(text, ratio=0.1)
    full = ''.join(result)
    return full
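# A minimal usage sketch for summarize() above; the URL is a hypothetical
# placeholder, not taken from the original source.
print(summarize("https://example.com/some-news-article"))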
def test_method(url):
    article = get_article(url)
    req = requests.get(url).text
    # Feed the raw HTML through a custom HTMLParser subclass
    parser = MyHTMLParser()
    tst = parser.feed(req)
    # Extract the main article body with newspaper
    text = fulltext(req)
def attempt2():
    header = {
        "Accept-Encoding": "gzip",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    # url is expected to be defined at module level
    html = requests.get(url, headers=header).text
    text = fulltext(html)
    print(text)
def remove_image(link):
    try:
        time.sleep(5)
        article_soup = get_html(link)
        article_soup.find('figure').decompose()
        return fulltext(str(article_soup.currentTag))
    except Exception as e:
        pass
def get_text(url):
    from newspaper import fulltext
    import requests
    url = str(url)
    text = fulltext(requests.get(url).text)
    return text
def test_japanese_fulltext_extract2(self):
    url = 'http://www.afpbb.com/articles/-/3178894'
    article = Article(url=url, language='ja')
    html = mock_resource_with('japanese_article2', 'html')
    article.download(html)
    article.parse()
    text = mock_resource_with('japanese2', 'txt')
    self.assertEqual(text, article.text)
    self.assertEqual(text, fulltext(article.html, 'ja'))
async def get_url(url, returned_format=None):
    if returned_format == 'html':
        print('[!] HTML support is being refactored. '
              'Currently data is being returned plaintext')
    r = requests.get(url)
    b = newspaper.fulltext(r.text)
    return str(b).replace('\n', '<br>') if b else None
def test_latvian_fulltext_extract(self):
    url = 'https://www.lsm.lv/raksts/zinas/arzemes/norvegija-pec-zemes-nogruvuma-pieci-bojagajusie.a387519/'
    article = Article(url=url, language='lv')
    html = mock_resource_with('latvian_article', 'html')
    article.download(html)
    article.parse()
    text = mock_resource_with('latvian', 'txt')
    self.assertEqual(text, article.text)
    self.assertEqual(text, fulltext(article.html, 'lv'))
def test_thai_fulltext_extract(self):
    url = 'https://prachatai.com/journal/2019/01/80642'
    article = Article(url=url, language='th')
    html = mock_resource_with('thai_article', 'html')
    article.download(html)
    article.parse()
    text = mock_resource_with('thai', 'txt')
    self.assertEqual(text, article.text)
    self.assertEqual(text, fulltext(article.html, 'th'))
def test_japanese_fulltext_extract(self):
    url = 'https://www.nikkei.com/article/DGXMZO31897660Y8A610C1000000/?n_cid=DSTPCS001'
    article = Article(url=url, language='ja')
    html = mock_resource_with('japanese_article', 'html')
    article.download(html)
    article.parse()
    text = mock_resource_with('japanese', 'txt')
    self.assertEqual(text, article.text)
    self.assertEqual(text, fulltext(article.html, 'ja'))
def test_chinese_fulltext_extract(self):
    url = 'http://news.sohu.com/20050601/n225789219.shtml'
    article = Article(url=url, language='zh')
    html = mock_resource_with('chinese_article', 'html')
    article.download(html)
    article.parse()
    text = mock_resource_with('chinese', 'txt')
    assert article.text == text
    assert fulltext(article.html, 'zh') == text
def test_chinese_fulltext_extract(self):
    url = 'http://news.sohu.com/20050601/n225789219.shtml'
    article = Article(url=url, language='zh')
    html = mock_resource_with('chinese_article', 'html')
    article.download(html)
    article.parse()
    text = mock_resource_with('chinese', 'txt')
    self.assertEqual(text, article.text)
    self.assertEqual(text, fulltext(article.html, 'zh'))
def get_html():
    news_url = request.args.get("url")
    try:
        if news_url and len(news_url):
            html = requests.get(news_url).text
            return render_template('news-summary.html',
                                   news_html=fulltext(html))
    except Exception as e:
        return render_template('news-summary.html', news_html=e)
def newspaper_extractor(self, html):
    try:
        content = fulltext(html)
        if content and content != "":
            return content
        else:
            return self.readability_extractor(html)
    except:
        return self.readability_extractor(html)
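# The readability_extractor fallback used above is not shown in the snippet.
# A minimal sketch of one possible implementation, assuming the
# readability-lxml and lxml packages; the name and structure here are
# assumptions, not the original code.
from lxml import html as lxml_html
from readability import Document


def readability_extractor(self, html):
    """Hypothetical fallback: pull the main content with readability-lxml."""
    try:
        # Document.summary() returns the cleaned article as an HTML fragment
        fragment = lxml_html.fromstring(Document(html).summary())
        # Strip the remaining tags and collapse whitespace
        return " ".join(fragment.text_content().split())
    except Exception:
        return ""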
async def get_url(self, url, returned_format=None):
    if returned_format == 'html':
        logging.info('[!] HTML support is being refactored. '
                     'Currently data is being returned plaintext')
    r = self.get_response_from_url(url)
    # Use the response text to get contents for this url
    b = newspaper.fulltext(r.text)
    return str(b).replace('\n', '<br>') if b else None
def analysis_news_content(self, html, obj, newspaper=True):
    if newspaper:
        text = fulltext(html).split('\n')
        txt = list(filter(lambda x: x.strip() != '', text))
        content = '<p>'.join(txt)
        return content
    else:
        content_list = obj.xpath('//div[@class="section-content"]//text()')[7:]
        content = '<p>'.join([i.replace("\n", '').strip()
                              for i in content_list]).replace("<p><p>", '<p>')
        return content
def test_spanish_fulltext_extract(self):
    url = 'http://ultimahora.es/mallorca/noticia/noticias/local/fiscal' \
          'ia-anticorrupcion-estudia-recurre-imputacion-infanta.html'
    article = Article(url=url, language='es')
    html = mock_resource_with('spanish_article', 'html')
    article.download(html)
    article.parse()
    text = mock_resource_with('spanish', 'txt')
    self.assertEqual(text, article.text)
    self.assertEqual(text, fulltext(article.html, 'es'))
def test_spanish_fulltext_extract(self):
    url = 'http://ultimahora.es/mallorca/noticia/noticias/local/fiscal' \
          'ia-anticorrupcion-estudia-recurre-imputacion-infanta.html'
    article = Article(url=url, language='es')
    html = mock_resource_with('spanish_article', 'html')
    article.download(html)
    article.parse()
    text = mock_resource_with('spanish', 'txt')
    assert article.text == text
    assert fulltext(article.html, 'es') == text
def test_arabic_fulltext_extract(self):
    url = 'http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/' \
          'index.html'
    article = Article(url=url)
    html = mock_resource_with('arabic_article', 'html')
    article.download(html)
    article.parse()
    assert article.meta_lang == 'ar'
    text = mock_resource_with('arabic', 'txt')
    assert article.text == text
    assert fulltext(article.html, 'ar') == text
def analysis_news_content(self, html, html_obj, newspaper=False):
    if newspaper:
        text = fulltext(html).split('\n')
        txt = list(filter(lambda x: x.strip() != '', text))
        content = '<p>'.join(txt)
    else:
        content_list = html_obj.xpath('//div[@id="content"]//p//text()')
        content = '<p>'.join([
            i.replace("\n", '').strip() for i in content_list
        ]).replace("<p><p>", '<p>')
    return content
def test_arabic_fulltext_extract(self):
    url = 'http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/' \
          'index.html'
    article = Article(url=url)
    html = mock_resource_with('arabic_article', 'html')
    article.download(html)
    article.parse()
    self.assertEqual('ar', article.meta_lang)
    text = mock_resource_with('arabic', 'txt')
    self.assertEqual(text, article.text)
    self.assertEqual(text, fulltext(article.html, 'ar'))
def googleparser(topic, depth):
    '''
    topic - the news search query
    depth - the number of search result pages to query
    Returns two lists: the news texts and the problematic links.
    '''
    # 1. build the list of links
    gnews_links = []
    gnews = []
    googlenews.search(topic)
    start_time = time.time()
    print('--- Building the list of links... ---')
    for i in range(1, depth):
        googlenews.clear()
        googlenews.getpage(i)
        for j in range(0, len(googlenews.gettext())):
            gnews.append(googlenews.gettext()[j])
            gnews_links.append(googlenews.getlinks()[j])
    print("--- Building the list took %s seconds ---" % (time.time() - start_time))
    print('--- Done. Collected %s links ---' % len(gnews_links))

    # 2. download the news and build the list of texts
    body = []
    count = 0
    error_link = []  # list of broken links
    # measure the elapsed time
    start_time = time.time()
    print('--- Downloading the news... ---')
    for url in gnews_links:
        try:
            html = requests.get(url).text
            text = fulltext(html)
            body.append(text)
        except:
            # some links turn out to be problematic; collect them here
            error_link.append(gnews_links[count])
            pass
        count += 1
    print("--- Done. Downloading took %s seconds ---" % (time.time() - start_time))
    return body, error_link
def test_japanese_fulltext_extract(self):
    try:
        url = 'http://www.cnn.co.jp/tech/35087106.html'
        article = Article(url=url, language='ja')
        html = mock_resource_with('japanese_article', 'html')
        article.download(html)
        article.parse()
        text = mock_resource_with('japanese', 'txt')
        self.assertEqual(text, article.text)
        self.assertEqual(text, fulltext(article.html, 'ja'))
    except Exception as e:
        print('ERR', str(e))
def test_parse_html(self):
    AUTHORS = ['Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey',
               'Tom Watkins']
    TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
    LEN_IMGS = 46
    META_LANG = 'en'
    self.article.parse()
    self.article.nlp()
    text = mock_resource_with('cnn', 'txt')
    assert self.article.text == text
    assert fulltext(self.article.html) == text
    # NOTE: top_img extraction requires an internet connection
    # unlike the rest of this test file
    TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
               '01-weather-1128-story-top.jpg')
    assert self.article.top_img == TOP_IMG
    assert sorted(self.article.authors) == AUTHORS
    assert self.article.title == TITLE
    assert len(self.article.imgs) == LEN_IMGS
    assert self.article.meta_lang == META_LANG
    assert str(self.article.publish_date) == '2013-11-27 00:00:00'
## THINGS WE NEED
# source
# url
# title
# date
# summary
# keywords
import requests
import newspaper
from newspaper import Article, fulltext

cnn_paper = newspaper.build(u'http://cnn.com')
for article in cnn_paper.articles:
    print(article.url)
for category in cnn_paper.category_urls():
    print(category)

cnn_article = cnn_paper.articles[0]
cnn_article.download()
cnn_article.parse()
cnn_article.nlp()
cnn_article.summary

html = requests.get(...).text
text = fulltext(html)
text = None
if not os.path.exists(target_html_file):
    article = Article(url)
    article.download()
    print(f"url: {article.url}")
    print(f"source_url: {article.source_url}")
    print(f"message: {article.download_exception_msg}")
    if article.download_exception_msg is None:
        html = article.html
        write_text_to_file(target_html_file, html)
        article.parse()
        print(f"authors: {article.authors}")
        print(f"publish_date: {article.publish_date}")
        print(f"text: {article.text}")
        article.nlp()
        print(f"key words: {article.keywords}")
        print(f"summary: {article.summary}")
        text = article.text
else:
    article = Article(url)
    html_text = read_text_from_file(target_html_file)
    text = fulltext(html_text, 'en')

if text is not None:
    # print(f"text: {text}")
    # removing stop words and punctuation
    all_article_words = nltk.word_tokenize(text)
    article_tokens = [w for w in all_article_words if w not in stop_words]
    print(f"loaded: {len(article_tokens)} actual from {len(all_article_words)} total words")
    # TODO stemming
def get_newspaper_text(html):
    return fulltext(html)
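# A minimal usage sketch for get_newspaper_text(); the URL is a hypothetical
# placeholder, and requests is assumed to be available for fetching the page.
import requests

html = requests.get("https://example.com/article").text
print(get_newspaper_text(html))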
consumer = KafkaConsumer(
    # ... (topic and remaining arguments truncated in the source)
    bootstrap_servers=['172.16.129.43:9092'])
producer = KafkaProducer(bootstrap_servers=['172.16.129.43:9092'])

# test
print("start newspaper parser!")
for message in consumer:
    if message is not None:
        print("xxx")
        try:
            jsonValue = json.loads(message.value)
            html = jsonValue["html"]
            contentWithOutTag = fulltext(html, language="zh")
            for useParser in ["lxml"]:
                # feed the tag-free body text back into the HTML parser
                parseHtml = extractHtml(html, contentWithOutTag, useParser)
                parseTitle = parseHtml.title()
                parsePublishDate = parseHtml.publishDate()
                parseContent = parseHtml.mainContent()
                if len(parseContent[0]) == 0:
                    parseContent[0] = contentWithOutTag
                if parsePublishDate is None or len(parsePublishDate) == 0 or parsePublishDate == " ":
                    # if no publish date can be parsed, fall back to the crawl time
                    parsePublishDate = time.asctime(time.localtime(time.time()))
                print("\n-----------------------------------------------------------------------------\n")
                print("url:\t", jsonValue["url"])
                print("title:\t", parseTitle)