def parse_article(url, lang, featured=0, db=connect_db()):
    cur = db.execute("select * from articles where url=?", (url,))
    entries = [dict(id=row[0], url=row[1], title=row[2], image=row[3], text=row[4],
                    authors=row[5], date=row[6], featured=row[7], language=row[8])
               for row in cur.fetchall()]
    if len(entries) >= 1:
        return entries[0]
    article = Article(url)
    article.download()
    try:
        article.parse()
    except:
        return None
    title = article.title
    image = article.top_image
    text = article.text
    authors = ",".join(article.authors)
    date = int(time.mktime(article.publish_date.timetuple())) if type(article.publish_date) is datetime.datetime else 0
    db.execute("insert into articles (url, title, image, text, authors, date, featured, language) values (?, ?, ?, ?, ?, ?, ?, ?)",
               (url, title, image, text, authors, date, featured and len(text) >= 50, lang))
    db.commit()
    idquery = db.execute("select (id) from articles where url=?", (url,))
    id = [row[0] for row in idquery.fetchall()][0]
    return {"id": id, "url": url, "title": title, "image": image, "text": text,
            "authors": authors, "date": date, "language": lang}
def get_article():
    tree_urls = ET.parse("DB_urls.xml")
    root_urls = tree_urls.getroot()

    # The problem with English and Chinese can be solved with
    for field_urls in root_urls.findall("row"):
        url_urls = field_urls.find("field").text
        # url_urls = 'http://news.sina.com.cn/c/2014-04-21/204729980947.shtml'
        # url_urls = 'http://china.caixin.com/2013-12-30/100623243.html'
        try:
            response = urllib2.urlopen(url_urls)
            status = response.code
            # print "detected webpage code:", status
            if status == 404:
                continue
            else:
                a_zh = Article(url_urls, language='zh')
                a_zh.download()
                a_zh.parse()
                content_urls = a_zh.text
                if content_urls == '':
                    a_en = Article(url_urls, language='en')
                    a_en.download()
                    a_en.parse()
                    content_urls = content_urls + a_en.text
                if content_urls != '':
                    compare_article(url_urls, content_urls)
        except:
            pass
def test_pre_parse_nlp(self):
    """Test running NLP algos before parsing the article
    """
    new_article = Article(self.article.url)
    resp = mock_response_with(new_article.url, 'cnn_article')
    new_article.download(resp)
    self.assertRaises(ArticleException, new_article.nlp)
def get_details():
    url = request.args.get('url', '')
    if not url:
        abort(400)
    if is_image(url):
        result = {
            "url": url,
            "top_image": url,
            "text": "",
        }
        return jsonify(result)
    article = Article(url)
    article.download()
    try:
        article.parse()
    except (IOError, UnicodeDecodeError):
        return '', 422
    try:
        top_image = article.top_image.rsplit('?', 1)[0]
    except AttributeError:
        top_image = ''
    result = {
        "url": url,
        "top_image": top_image,
        "text": article.text,
    }
    return jsonify(result)
def get_text(url):
    article = Article(url)
    download = article.download()
    parser = article.parse()
    authors = article.authors
    publish_date = article.publish_date  # TODO: Slice publish date
    body_text = article.text
    body_text = body_text.replace('"', '\"')
    body_text = body_text.replace('"', '')
    # nlp = article.nlp()
    keywords = article.keywords
    summary = article.summary
    title = article.title
    tags = article.tags
    # print body_text
    title = strip_non_ascii(title)
    summary = strip_non_ascii(summary)
    body_text = strip_non_ascii(body_text)
    keywords = ' '.join(keywords)
    keywords = strip_non_ascii(keywords)
    # print (title, summary, authors, publish_date, body_text, keywords)
    return (title, summary, authors, publish_date, body_text, keywords, tags)
def test_arabic_fulltext_extract(self):
    url = "http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/index.html"
    article = Article(url=url, language="ar")
    article.download()
    article.parse()
    with codecs.open(os.path.join(TEXT_FN, "arabic_text_1.txt"), "r", "utf8") as f:
        assert article.text == f.read()
def test_spanish_fulltext_extract(self):
    url = "http://ultimahora.es/mallorca/noticia/noticias/local/fiscalia-anticorrupcion-estudia-recurre-imputacion-infanta.html"
    article = Article(url=url, language="es")
    article.download()
    article.parse()
    with codecs.open(os.path.join(TEXT_FN, "spanish_text_1.txt"), "r", "utf8") as f:
        assert article.text == f.read()
def run(self):
    logging.debug("run() - [WAIT]")
    from newspaper import Article

    '''
    Library documentation: http://newspaper.readthedocs.org/en/latest/user_guide/quickstart.htm
    '''

    NOTES_LIST = [
        '118',
        '117',
        # '116',
        # '115',
    ]
    for note_id in NOTES_LIST:
        note = Article(url="http://site.tiagoprnl.in/core/visitor_home/nota/%s/" % note_id)
        note.download()

        print '*' * 100
        # print 'H T M L'
        # print note.html
        # print '*' * 100
        # print 'T E X T'
        note.parse()
        print note.text

    logging.debug("run() - [DONE]")
def test_chinese_fulltext_extract(self):
    url = "http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml"
    article = Article(url=url, language="zh")
    article.download()
    article.parse()
    with codecs.open(os.path.join(TEXT_FN, "chinese_text_1.txt"), "r", "utf8") as f:
        assert article.text == f.read()
def test_pre_parse_nlp(self):
    """Test running NLP algos before parsing the article
    """
    new_article = Article(self.article.url)
    html = mock_resource_with('cnn_article', 'html')
    new_article.download(html)
    self.assertRaises(ArticleException, new_article.nlp)
def main():
    try:
        headlines = requests.get(headline_url)
        headlines = json.loads(headlines.text)
        for headline in headlines['Headlines']:
            print("Processing Article %s" % headline['Url'])
            article = Article(headline['Url'])
            article.download()
            article.parse()
            response = requests.post(calais_url, files={'file': article.text}, headers=headers, timeout=80)
            rdf = json.loads(response.text)
            for x in rdf:
                if '_type' in rdf[x] and 'name' in rdf[x]:
                    print("Output for %s %s" % (rdf[x]['_type'], rdf[x]['name']))
                    for instance in rdf[x]['instances']:
                        text = instance['prefix'] + instance['suffix']
                        blob = TextBlob(text)
                        for sentence in blob.sentences:
                            print(sentence)
                            print(sentence.sentiment.polarity)
            print('--------------------')
            # print(rdf)
    except Exception as e:
        print('Error in connect ', e)
def check_url(args):
    """
    :param (basestr, basestr) url, res_filename:
    :return: (pubdate_failed, fulltext_failed)
    """
    url, res_filename = args
    pubdate_failed, fulltext_failed = False, False
    html = mock_resource_with(res_filename, 'html')
    try:
        a = Article(url)
        a.download(html)
        a.parse()
        if a.publish_date is None:
            pubdate_failed = True
    except Exception:
        print('<< URL: %s parse ERROR >>' % url)
        traceback.print_exc()
        pubdate_failed, fulltext_failed = True, True
    else:
        correct_text = mock_resource_with(res_filename, 'txt')
        if not (a.text == correct_text):
            # print('Diff: ', simplediff.diff(correct_text, a.text))
            # `correct_text` holds the reason of failure if failure
            print('%s -- %s -- %s' % ('Fulltext failed', res_filename, correct_text.strip()))
            fulltext_failed = True
            # TODO: assert statements are commented out for full-text
            # extraction tests because we are constantly tweaking the
            # algorithm and improving
            # assert a.text == correct_text
    return pubdate_failed, fulltext_failed
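# --- Illustrative usage sketch (not part of the original suite) ---
# A minimal, hypothetical example of driving check_url() over a list of
# (url, resource_filename) pairs and tallying the two failure kinds;
# the URLs and fixture names below are placeholders, not real resources.
test_cases = [
    ('http://example.com/some-story.html', 'example1'),
    ('http://example.org/another-story.html', 'example2'),
]
pubdate_failures, fulltext_failures = 0, 0
for case in test_cases:
    pubdate_failed, fulltext_failed = check_url(case)
    pubdate_failures += int(pubdate_failed)
    fulltext_failures += int(fulltext_failed)
print('pubdate failures: %d, fulltext failures: %d' % (pubdate_failures, fulltext_failures))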
def main(argv):
    if len(argv) > 1:
        htmlist = argv[1]
    else:
        htmlist = 'htmlist'

    # Our permanent config for html cleaning
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose = True

    cleaner = Article(url='', config=config)

    with open(htmlist, 'r') as f:
        htmfile = f.read().split('\n')

    raw = []

    for htm in htmfile:
        print(htm)
        if not htm.endswith("rss.html"):
            with open(htm, 'r') as f:
                h = f.read()

            cleaner.set_html(h)
            cleaner.parse()
            sentences = nlp.split_sentences(cleaner.text)
            # raw.append(sentences])
            with open('htm-out', 'a') as f:
                [f.write(r + '\n') for r in sentences]
def scrapeURLS(inFilPath):
    texts = []
    cache = loadCache()
    toDelURLs = []
    with open(inFilPath) as f:
        urls = f.readlines()
    for url in urls:
        if filter(urlFilters, url):
            toDelURLs.append(url)
        if url in cache:
            txt = cache[url]
        else:
            print "Scraping URL %s" % url
            article = Article(url)
            article.download()
            article.parse()
            txt = article.text.replace("\n", " ").replace("  ", " ").strip()
            if txt == "" or filter(txtFilter, txt):
                toDelURLs.append(url)
                continue
            cacheURL(url, txt)
        texts.append(txt)
    deleteURLs(inFilPath, toDelURLs)
    return texts
def test_chinese_fulltext_extract(self):
    url = 'http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml'
    article = Article(url=url, language='zh')
    article.build()
    # assert isinstance(article.stopwords_class, StopWordsChinese)
    with codecs.open(os.path.join(TEXT_FN, 'chinese_text_1.txt'), 'r', 'utf8') as f:
        assert article.text == f.read()
def wrap_newspaper(self, web_page):
    parser = NewspaperArticle(url=web_page.final_url)
    parser.html = web_page.html
    parser.is_downloaded = True
    parser.parse()
    return parser
def f(url):
    url_urls = url.text
    try:
        response = urllib2.urlopen(url_urls)
        status = response.code
        # print "detected webpage code:", status
        if status == 404:
            pass
        else:
            a_zh = Article(url_urls, language='zh')
            a_zh.download()
            a_zh.parse()
            # content_urls = a_zh.text
            # if(content_urls == ''):
            #     a_en = Article(url_urls, language='en')
            #     a_en.download()
            #     a_en.parse()
            #     content_urls = content_urls + a_en.text
            # if(content_urls != ''):
            #     pass
            #     # compare_article(url_urls, content_urls)
    except:
        pass
def test_download_file_success(self):
    url = "file://" + os.path.join(HTML_FN, "cnn_article.html")
    article = Article(url=url)
    article.download()
    self.assertEqual(article.download_state, ArticleDownloadState.SUCCESS)
    self.assertEqual(article.download_exception_msg, None)
    self.assertEqual(75406, len(article.html))
def test_download_file_failure(self):
    url = "file://" + os.path.join(HTML_FN, "does_not_exist.html")
    article = Article(url=url)
    article.download()
    self.assertEqual(0, len(article.html))
    self.assertEqual(article.download_state, ArticleDownloadState.FAILED_RESPONSE)
    self.assertEqual(article.download_exception_msg, "No such file or directory")
def parse_input(text, extractor='newspaper'):
    if isinstance(text, str) or isinstance(text, unicode):
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from html
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            textfile = open(text, 'rb')
            article = textfile.read()
            textfile.close()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.')
def extract(url=None, keep_html=True):
    """
    Attempts to extract article from URL
    """
    a = Article(url, keep_article_html=keep_html)
    try:
        a.download()
    except Exception, e:
        log.error('Error downloading %s: %s' % (url, str(e)))
def parse_news(self, response):
    item = ScrapyGooglenewsItem()
    # only log the warning info from request
    logging.getLogger("requests").setLevel(logging.WARNING)

    for href in response.xpath('//h2[@class="title"]/a/@href').extract():
        item['link'] = href
        # use newspaper-0.0.8 to scrape the webpage, then get clean text.
        article = Article(item['link'])
        article.download()
        article.parse()
        item['title'] = article.title
        item['text'] = article.text
        # item['authors'] = article.authors
        # item['date'] = article.publish_date

        if response.url.split('&')[-1] == 'topic=w':
            item['domain'] = 'World'
        if response.url.split('&')[-1] == 'topic=n':
            item['domain'] = 'U.S.'
        if response.url.split('&')[-1] == 'topic=b':
            item['domain'] = 'Business'
        if response.url.split('&')[-1] == 'topic=tc':
            item['domain'] = 'Technology'
        if response.url.split('&')[-1] == 'topic=e':
            item['domain'] = 'Entertainment'
        if response.url.split('&')[-1] == 'topic=s':
            item['domain'] = 'Sports'
        if response.url.split('&')[-1] == 'topic=snc':
            item['domain'] = 'Science'
        if response.url.split('&')[-1] == 'topic=m':
            item['domain'] = 'Health'

        yield item
def extract(self, item):
    """Creates an instance of Article without a Download and returns an ArticleCandidate with the results of
    parsing the HTML-Code.

    :param item: A NewscrawlerItem to parse.
    :return: ArticleCandidate containing the recovered article data.
    """
    article_candidate = ArticleCandidate()
    article_candidate.extractor = self._name()

    article = Article('')
    article.set_html(item['spider_response'].body)
    article.parse()
    article_candidate.title = article.title
    article_candidate.description = article.meta_description
    article_candidate.text = article.text
    article_candidate.topimage = article.top_image
    article_candidate.author = article.authors
    if article.publish_date is not None:
        try:
            article_candidate.publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
        except ValueError as exception:
            self.log.debug('%s: Newspaper failed to extract the date in the supported format, '
                           'Publishing date set to None' % item['url'])
    article_candidate.language = article.meta_lang

    return article_candidate
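# --- Illustrative call sketch (hypothetical, not part of the original extractor) ---
# extract() above only needs an item whose 'spider_response' exposes a .body attribute
# holding raw HTML, plus a 'url' key; a stand-in object is enough to exercise it.
# `_FakeResponse`, the sample item and `extractor` are illustrative names only.
class _FakeResponse:
    def __init__(self, body):
        self.body = body  # raw HTML, as expected by Article.set_html()

item = {
    'spider_response': _FakeResponse('<html><head><title>Demo</title></head>'
                                     '<body><p>Example paragraph.</p></body></html>'),
    'url': 'http://example.com/demo-article',
}
# `extractor` would be an instance of the extractor class defined above
candidate = extractor.extract(item)
print(candidate.title, candidate.text)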
def insert_url(url):
    conn = sqlite3.connect('publico_news_sqllite3.db')
    cursor = conn.cursor()

    # get the article in plain text
    article = Article(url)
    article.download()
    article.parse()
    date = article.publish_date
    title = article.title
    text = article.text

    item = dict()
    item['datetime'] = date
    item['title'] = title
    item['text'] = text
    item['category'] = sys.argv[1].split('/')[6]
    item['link'] = sys.argv[1]
    item['origLink'] = sys.argv[1]

    print(item['category'])
    print(item['datetime'])

    if not duplicate(item, item['category'], cursor):
        status = insert_db(item, item['category'], cursor)
        if status == 1:
            print(sys.argv[1], "inserted")
        else:
            print("Error", status)
    else:
        print(url, "already in BD")

    conn.commit()
    conn.close()
def makeDocs():
    utc = pytz.utc
    es = Elasticsearch(BONSAI_URL, verify_certs=True)
    es.indices.delete(index='news', ignore=[400, 404])
    es.indices.create(index='news', ignore=400)
    print "Created"

    cnn_paper = newspaper.build(u'http://cnn.com', memoize_articles=False)
    a = defaultdict(int)
    cnn_articles = cnn_paper.articles
    print cnn_paper.size()

    for i in range(10):
        article = cnn_articles[i]
        url = article.url
        art = Article(url)
        art.download()
        art.parse()
        print art.publish_date
        print art.text
        print "Article" + str(i)
        print art.publish_date is not None
        print art.text is not None
        if (art.publish_date is not None) and (art.text is not None):
            try:
                doc = {
                    'domain': 'CNN',
                    'date': utc.localize(art.publish_date),
                    'text': art.text
                }
                res = es.index(index="news", doc_type='article', id=i, body=doc)
                print "Doc" + str(i)
            except:
                print "Doc not accepted"
def runTest(self):
    # The "correct" fulltext needs to be manually checked
    # we have 50 so far
    FULLTEXT_PREPARED = 50
    domain_counters = {}

    with open(URLS_FILE, 'r') as f:
        urls = [d.strip() for d in f.readlines() if d.strip()]

    for url in urls[:FULLTEXT_PREPARED]:
        domain = get_base_domain(url)
        if domain in domain_counters:
            domain_counters[domain] += 1
        else:
            domain_counters[domain] = 1

        res_filename = domain + str(domain_counters[domain])
        html = mock_resource_with(res_filename, 'html')
        try:
            a = Article(url)
            a.download(html)
            a.parse()
        except Exception:
            print('<< URL: %s parse ERROR >>' % url)
            traceback.print_exc()
            continue

        correct_text = mock_resource_with(res_filename, 'txt')
        condensed_url = url[:30] + ' ...'
        print('%s -- fulltext status: %s' % (condensed_url, a.text == correct_text))
def test2(self):
    articles = [
        'http://www.radionz.co.nz/news/national/281869/seven-arrests-over-police-manhunt',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11491573',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580358',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580350',
        'http://www.stuff.co.nz/national/crime/75978990/whanganui-woman-accused-of-leaving-child-in-car-overnight.html',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11574608',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11577923',
        'http://www.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11591401',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11566180'
    ]
    articles = [
        'http://www.express.co.uk/news/uk/657926/New-Zealand-John-Key-slams-David-Cameron-Britain-forgetting-history-European-Union-EU',
        'http://www.bbc.co.uk/news/uk-wales-35954982',
        'http://www.telegraph.co.uk/news/2016/04/04/david-cameron-will-be-an-excellent-former-prime-minister/',
        'http://www.pressandjournal.co.uk/fp/news/aberdeenshire/880519/david-camerons-father-named-panamanian-company-aberdeenshire-home/',
        'http://www.theguardian.com/politics/2016/apr/01/senior-tories-brexit-vote-leave-attacks-david-cameron-letter-nhs-staff',
        'http://www.dailymail.co.uk/news/article-3519908/Nuclear-drones-threat-British-cities-Cameron-Obama-hold-war-game-session-respond-attack-kill-thousands-people.html',
        'http://www.telegraph.co.uk/news/2016/03/31/if-david-cameron-cant-stop-the-tory-fighting-hell-clear-jeremy-c/',
        'http://www.manchestereveningnews.co.uk/news/greater-manchester-news/gmp-boost-number-armed-officers-11125178',
        'http://www.theguardian.com/commentisfree/2016/apr/03/cameron-headphones-what-is-cool-what-is-not'
    ]

    with open("./Output2.txt", "w") as text_file:
        for url in articles:
            print(url)
            a = Article(url)
            a.download()
            a.parse()
            text_file.write(a.text.encode('utf-8'))
            text_file.write('\n')
def get_article(url):
    a = Article(url)
    a.download()
    a.parse()

    article = dict()
    article['title'] = a.title
    article['publish_date'] = a.publish_date
    article['authors'] = a.authors
    article['lead_image'] = a.top_image
    article['movies'] = a.movies
    article['text'] = a.text
    article['keywords'] = get_keywords(a.text)

    # This is more likely to fail.
    # try:
    #     article.nlp()
    #     article['summary'] = 'This summary is generated: \n ' + a.summary
    # except Exception:
    #     print Exception
    #     article['summary'] = a.summary

    return article
def get_image():
    url = request.args.get('url', '')
    if not url:
        abort(400)
    if is_image(url):
        return redirect(url)
    article = Article(url)
    article.download()
    try:
        article.parse()
    except (IOError, UnicodeDecodeError):
        return '', 422
    try:
        top_image = article.top_image.rsplit('?', 1)[0]
    except AttributeError:
        top_image = ''
    if not top_image == '':
        return redirect(top_image)
    else:
        return '', 422
def get_article_by_url(url):
    article = Article(url, fetch_images=False)
    article.download()
    if url == "empty":
        return "nolist"
    article.parse()
    return article.text
# soup = BeautifulSoup(html, "html5lib")
soup = BeautifulSoup(html, "lxml")
print(line)
if soup.title:
    print(soup.title.string)
regexp = re.compile("地址|电话")  # match "address" or "phone number"
for b in soup.find_all(text=regexp):
    print(b)
for a in soup.find_all('a'):
    key = a.string
    if isinstance(key, (str, bytes)):
        if re.search(pattern1, key):
            print('**************')
            print(key)
            if 'http' in a['href']:
                try:
                    a = Article(a['href'], language='zh')
                    a.download()
                    a.parse()
                except newspaper.article.ArticleException:
                    print('failed with 404 Client Error: Not Found for url')
                print(a.text)
            else:
                url = urllib.parse.urljoin(line.rstrip(), a['href'])
                a = Article(url, language='zh')
                a.download()
                a.parse()
                print(a.text)
            print('**************')
def getTitle(url):
    article = Article(url)
    article.download()
    article.html
    article.parse()
    return article.title
from newspaper import Article

a = Article('http://www.cnn.com/2014/01/12/world/asia/north-korea-charles-smith/index.html',
            keep_article_html=True)
a.download()
a.parse()
print(a.article_html)
from newspaper import Article

url = "https://www.marketwatch.com/story/heres-a-better-buy-and-hold-strategy-using-the-dow-jones-industrial-average-2019-02-26"
a = Article(url, language='en')  # English
a.download()
a.parse()
print(a.text)
from newspaper import Article
import random
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import warnings

warnings.filterwarnings('ignore')

nltk.download('punkt', quiet=True)

article = Article('https://www.mayoclinic.org/diseases-conditions/chronic-kidney-disease/symptoms-causes/syc-20354521')
article.download()
article.parse()
article.nlp()
corpus = article.text
# print(corpus)

text = corpus
sentence_list = nltk.sent_tokenize(text)
# print(sentence_list)


def index_sort(list_var):
    length = len(list_var)
    list_index = list(range(0, length))
# Import the libraries
from newspaper import Article
import nltk
from gtts import gTTS
import os

# Get the article
article = Article('https://www.poetryfoundation.org/poems/46945/baa-baa-black-sheep')
article.download()  # Download the article
article.parse()  # Parse the article
nltk.download('punkt')  # Download the 'punkt' package
article.nlp()  # Apply Natural Language Processing (NLP)

# Get the articles text
mytext = article.text

# Print the text
print(mytext)

# Language in which you want to convert
# language = 'pt-br'  # Portuguese (Brazil)
language = 'en'  # English

# Passing the text and language to the engine,
# here we have marked slow=False. Which tells
# the module that the converted audio should
# have a high speed
myobj = gTTS(text=mytext, lang=language, slow=False)

# Saving the converted audio in a mp3 file named
def triggers(request):
    if request.method == 'POST':
        print(request.POST)
        data = dict(request.POST)
        # Driver Code
        key = 'show_details'
        one = checkKey(data, key)
        key = 'check_triggers'
        two = checkKey(data, key)
        key = 'show_wordcloud'
        three = checkKey(data, key)
        key = 'hate_speech'
        four = checkKey(data, key)
        print(one, two, three)

        # URL Link case
        if (one == True):
            url = data['Link'][0]
            print(url)
            article = Article(url)
            article.download()
            article.parse()
            authors = article.authors
            publishdate = article.publish_date
            # article.text
            article.nlp()
            keywords = article.keywords
            articlesummary = article.summary
            return render(request, 'consciousApp/triggers.html', {
                'authors': authors,
                'publishdate': publishdate,
                'keywords': keywords,
                'articlesummary': articlesummary
            })
        # Show triggers
        elif (two == True):
            text = request.POST['input_text'].lower()
            triggers = [
                "9 11", "9-11", "9/11", "ableism", "abusive", "ageism", "alcoholism", "animal abuse",
                "animal death", "animal violence", "bestiality", "gore", "corpse", "bully", "cannibal",
                "car accident", "child abuse", "childbirth", "classism", "death", "decapitation", "abuse",
                "drug", "heroin", "cocaine", "eating disorder", "anorexia", "binge eating", "bulimia",
                "fatphobia", "forced captivity", "holocaust", "hitler", "homophobia", "hostage", "incest",
                "kidnap", "murder", "nazi", "overdose", "pedophilia", "prostitution", "PTSD", "racism",
                "racist", "rape", "raping", "scarification", "self-harm", "self harm", "cutting", "sexism",
                "slavery", "slurs", "suicide", "suicidal", "swearing", "terminal illness", "terrorism",
                "torture", "transphobia", "violence", "warfare"
            ]
            tw = []
            text_file = open('./consciousApp/static/consciousApp/input/triggercheckdata.txt', 'w+')
            text_file.write(str(text))
            text_file.close()
            for trigger in triggers:
                if text.find(trigger) > -1:
                    tw.append(trigger)
            if tw == []:
                tw.append('No Triggers Found')
            return render(request, 'consciousApp/triggers.html', {
                'text': text,
                'triggers': tw,
                'data': data
            })
        # Show_cloud
        elif (three == True):
            text = request.POST['input_text'].lower()
            tokens = word_tokenize(text)
            textdata = nltk.Text(tokens)
            stopwords = set(STOPWORDS)
            wordcloud = WordCloud(stopwords=stopwords, max_font_size=50, max_words=100,
                                  background_color="white").generate(text)
            wordcloud.to_file("./consciousApp/static/consciousApp/output/word-cloud.png")
            data = "./../../static/consciousApp/output/word-cloud.png"
            return render(request, 'consciousApp/triggers.html', {'data': data})
        elif (four == True):
            sonar = Sonar()
            text = request.POST['input_text'].lower()
            url = data['Link'][0]
            data = sonar.ping(text=text)["classes"]
            hate_speech = data[0]
            hate_speech_confidence = hate_speech["confidence"] * 100
            offensive_language = data[1]
            offensive_language_confidence = offensive_language["confidence"] * 100
            neither = data[2]
            neither_confidence = neither["confidence"] * 100
            print(type(data))
            print(offensive_language_confidence * 100, hate_speech_confidence * 100, neither_confidence * 100)
            return render(request, 'consciousApp/triggers.html', {
                'hate_speech_confidence': hate_speech_confidence,
                'offensive_language_confidence': offensive_language_confidence,
                'neither_confidence': neither_confidence
            })
        else:
            return render(request, 'consciousApp/triggers.html')
from newspaper import Article

# A new article from TOI
url = "http://world.people.com.cn/n1/2019/0308/c1002-30964972.html"

# For different language newspaper refer above table
toi_article = Article(url, language='zh')  # zh for Chinese

# To download the article
toi_article.download()

# To parse the article
toi_article.parse()

# To perform natural language processing ie..nlp (needed for the summary below)
toi_article.nlp()

# To extract title
print("Article's Title:")
print(toi_article.title)
print("*" * 80)

# To extract text
print("Article's Text:")
print(toi_article.text)
print("*" * 80)

# To extract summary
print("Article's Summary:")
print(toi_article.summary)
print("*" * 80)
def set_text(self):
    if not self.text and self.url:
        a = Article(self.url)
        a.download()
        a.parse()
        self.text = a.text
from newspaper import Article

url = 'https://www.gazzetta.it/Calcio/Serie-A/Juventus/06-08-2019/dybala-altri-tesoretto-la-juve-andra-via-come-mandzukic-matuidi-3401461898595.shtml'
article = Article(url, language='en')
article.download()
article.parse()

print(article.title, "\n\n")
print(article.text)

# article.nlp()
# print(article.keywords)

with open("OUT.txt", "w") as text_file:
    text_file.write(article.text)
def crawling(self, category_name):
    # Multi Process PID
    print(category_name + " PID: " + str(os.getpid()))

    writer = Writer(category_name=category_name, date=self.date)

    # Article-list URL format
    if (category_name == "연합뉴스속보"):
        url = "http://news.naver.com/main/list.nhn?mode=LPOD&mid=sec&sid1=001&sid2=140&oid=001&isYeonhapFlash=Y" \
              + "&date="
    else:
        url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str(
            self.categories.get(category_name)) + "&date="

    # Collect articles from start_year/start_month through end_year/end_month.
    day_urls = self.make_news_page_url(url, self.date['start_year'], self.date['end_year'],
                                       self.date['start_month'], self.date['end_month'])
    print(category_name + " Urls are generated")
    print("The crawler starts")

    for URL in day_urls:
        print(URL)
        regex = re.compile("date=(\d+)")
        news_date = regex.findall(URL)[0]

        request = self.get_url_data(URL)
        document = BeautifulSoup(request.content, 'html.parser')

        # html - newsflash_body - type06_headline, type06
        # Fetch the articles listed on each page
        if (category_name == "연합뉴스속보"):
            post_temp = document.select('.newsflash_body .type02 li ')
        else:
            post_temp = document.select('.newsflash_body .type06_headline li dl')
            post_temp.extend(document.select('.newsflash_body .type06 li dl'))

        # Store the URL of every article found on the page
        post = []
        headlines = []
        companys = []

        for line in post_temp:
            post.append(line.a.get('href'))  # append every article URL on this page to the post list
            try:
                companys.append(line.find('span', class_="writing").text)
            except:
                companys.append("err")
            try:
                h = line.find_all('a')
                if len(h) > 1:
                    headlines.append(h[1].text)
                elif len(h) == 1:
                    headlines.append(h[0].text)
                else:
                    headlines.append("err")
            except:
                headlines.append("err")
        del post_temp
        print(len(post))

        for i in range(len(post)):
            # Article URL; short crawl delay between requests
            print(i)
            sleep(0.01)
            content_url = post[i]

            # Fetch the article HTML
            try:
                article = Article(content_url, language='ko')
                article.download()
                article.parse()

                text_sentence = article.text.strip()
                text_company = companys[i]
                text_headline = headlines[i].strip()
                ######################################################################
                if self.keyword == 'initvalue':
                    wcsv = writer.get_writer_csv()
                    wcsv.writerow([news_date, category_name, text_company, text_headline,
                                   text_sentence, content_url])
                else:
                    headline_to_words = text_headline.split()
                    if headline_to_words.index(self.keyword) >= 0:
                        wcsv = writer.get_writer_csv()
                        wcsv.writerow([news_date, category_name, text_company, text_headline,
                                       text_sentence, content_url])
                ######################################################################
            except Exception as err:
                print(err)
    writer.close()
    return
from gtts import gTTS  # text to speech conversion
import os  # Interacting with operating system
from io import BytesIO
import ssl
import nltk
from newspaper import Article

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()

# Get the article
article = Article('https://hackernoon.com/how-to-launch-your-own-blockchain-choosing-the-right-consensus-part-ii-y07y32tv')
article.download()  # Download the article
article.parse()  # Parse the article
nltk.download('punkt')  # Download 'punkt' package
article.nlp()  # Apply nlp (Natural Language Processing)

# Get the article text
my_text = article.text

# Print the text
print(my_text)

# Choose language for tts
language = 'en'  # English
language2 = 'fr'
def getData():
    url = request.args.get('url')

    # From Newspaper Framework getting required data
    content = Article(url)
    content.download()
    content.parse()
    title = content.title
    rawText = content.text

    # Unformatted Data to show to user
    textDisplay = rawText.split("\n\n")
    textDisplay = ''.join(textDisplay)

    # Converting numbered text to digits
    t2d = text2digits.Text2Digits()
    numText = t2d.convert(rawText)
    text = numText.split("\n\n")
    text = ''.join(text)

    # Implemented API data limit restriction
    if len(text) < 5000:
        text = text
    else:
        text = text[:5000]

    jsonData = {"text": text}
    configDataResource = os.path.join(SITE_ROOT, "data", "configdata.json")
    configData = json.load(open(configDataResource))

    # NER API call request
    headers = {
        'x-api-key': configData["X_API_KEY"],
        'Content-type': 'application/json'
    }
    ner_response = requests.post(configData["NAMED_ENTITY_RECOGNITION_ENDPOINT"],
                                 headers=headers,
                                 data=json.dumps(jsonData))
    # print(ner_response.text)

    # Deserializing the response
    places = lambda: None
    places.__dict__ = json.loads(ner_response.text)
    print(places.LOC)

    json_url = os.path.join(SITE_ROOT, "data", "sg-citi.json")
    data = json.load(open(json_url))

    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    LOC = []
    CASE = []
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
        if ent.label_ == "CARDINAL":
            CASE.append(ent.text)
        if ent.label_ == "GPE":
            LOC.append(ent.text)

    count = []
    for i in CASE:
        if i.isdigit():
            if i not in count:
                count.append(i)
    print("COUNT: ", count)
    if not len(count):
        count = list(i for i in range(80, 500, 7))

    returnJson = {
        "text": textDisplay,
        "location": [],
        "category": ner_response.text
    }
    for i in places.LOC:
        for citi in data:
            if i in citi["name"] and citi["name"] not in returnJson["location"]:
                returnJson["location"].append({
                    "name": citi["name"],
                    "lat": "no1",
                    "lon": "no2",
                    "count": count[random.randrange(0, len(count))]
                })
                break
    print(returnJson)
    return jsonify(returnJson)
def HindustanTimesScrapper():
    SRC = KNOWN_NEWS_SOURCES["Hindustan Times"]
    data1 = get_chronological_headlines(SRC["pages"].format(1))
    data2 = get_trending_headlines(SRC["home"])
    text_lst = []
    url_lst = []
    date_lst = []
    title_lst = []
    try:
        for data in data1:
            if data["content"] == "NA":
                try:
                    article = Article(data["link"])
                    article.download()
                    article.parse()
                    article.nlp()
                    summary = article.text
                    text_lst.append(summary)
                except:
                    text_lst.append(data["content"])
            else:
                text_lst.append(data["content"])
            url_lst.append(data["link"])
            date = data["published_at"]
            if (date == None):
                date = datetime.now()
            date_lst.append(date)
            try:
                title_lst.append(data["title"])
            except:
                title_lst.append(data["content"].replace("\n\n", " ").replace("\n", " ").split(".")[0])
        for data in data2:
            if data["content"] == "NA":
                try:
                    article = Article(data["link"])
                    article.download()
                    article.parse()
                    article.nlp()
                    summary = article.text
                    text_lst.append(summary)
                except:
                    text_lst.append(data["content"])
            else:
                text_lst.append(data["content"])
            url_lst.append(data["link"])
            date = data["published_at"]
            if (date == None):
                date = datetime.now()
            date_lst.append(date)
            try:
                title_lst.append(data["title"])
            except:
                title_lst.append(data["content"].replace("\n\n", " ").replace("\n", " ").split(".")[0])
        df_raw = pd.DataFrame(list(zip(text_lst, url_lst, date_lst, title_lst)),
                              columns=["text", "url", "date", "headline"])
        df_crime = get_crime(df_raw)
        data = get_data("./database/data.json")
        df = get_location(df_crime, data)
        df = preprocessing2(df, data)
        return df.reset_index(drop=True)
    except:
        return pd.DataFrame(columns=["index", "text", "url", "crime", "location", "region", "city", "date", "headline"])
def parse_content(self, response):
    # This function performs the detailed parsing of a news article
    ID = 'songtengteng'

    website_name = '商务部贸易救济调查局'

    # Site section
    website_block = response.xpath("//div[@class='position']/a[2]/text()").extract_first()

    news_url = response.meta['url']

    # Author
    news_author_list = response.xpath('//script')
    if len(news_author_list) != 0:
        news_author = news_author_list.re('v.{2}\ss.{4}e\s=\s\"[\u4e00-\u9fa5]+\"')[0][13:].replace('"', '')
    else:
        news_author = '商务部贸易救济调查局'

    # Publication time, normalized format: YYYY MM DD HH:Mi:SS  v.{2}\stm\s=\s\".*\"
    publish_time = response.meta['publish_time']
    year = publish_time[0:4]
    month = publish_time[5:7]
    day = publish_time[8:10]
    juti_time = publish_time[-8:]
    publish_time = year + month + day + ' ' + juti_time

    # Tags carried by the article itself
    news_tags = response.xpath('//script').re('v.{2}\sc.+e\s=\s\"[\u4e00-\u9fa5]+\"')[0][14:].replace('"', '')

    # Article title
    news_title = response.xpath('//h3/text()').extract_first()

    # Article body
    a = Article(response.url, language='zh')  # Chinese
    a.download()
    a.parse()
    news_content = a.text

    # Collect the article's images and generate their names
    image_urls = []
    image_names = []
    image_urls1 = response.xpath(
        '//p[@class="detailPic"]/img/@src|//div[@class="article_con"]/center/img/@src|//p[@style="text-align: center"]/img/@src'
    ).extract()
    if image_urls1 != []:
        image_urls = image_urls1
        for i in range(len(image_urls)):
            if i < 10 and i >= 0:
                image_name = news_title + '_000' + str(i)
                image_names.append(image_name)
            elif i < 100 and i >= 10:
                image_name = news_title + '_00' + str(i)
                image_names.append(image_name)
            elif i < 1000 and i >= 100:
                image_name = news_title + '_0' + str(i)
                image_names.append(image_name)
            else:
                image_name = news_title + str(i)
                image_names.append(image_name)

    yield self.getItem(
        id=ID,
        news_url=news_url,
        website_name=website_name,
        website_block=website_block,
        news_title=news_title,
        publish_time=publish_time,
        news_author=news_author,
        news_tags=news_tags,
        news_content=news_content,
        image_urls=image_urls,
        image_names=image_names,
    )
from newspaper import Article
import random
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import warnings
import yaml
import pyaudio
import speech_recognition as sr

warnings.filterwarnings('ignore')

nltk.download('punkt', quiet=True)

### website from where we want to extract the data
article1 = Article('https://en.wikipedia.org/wiki/Coronavirus')
article1.download()
article1.parse()
article1.nlp()

article2 = Article('https://www.euro.who.int/en/health-topics/noncommunicable-diseases/mental-health/data-and-resources/mental-health-and-covid-19')
article2.download()
article2.parse()
article2.nlp()

article3 = Article('https://www.healthline.com/health-news/what-covid-19-is-doing-to-our-mental-health')
article3.download()
article3.parse()
article3.nlp()

article4 = Article('https://www.webmd.com/lung/coronavirus')
from newspaper import Article

article = Article(
    'https://www.firstpost.com/health/narendra-modis-speech-on-coronavirus-pm-announces-total-lockdown-for-three-weeks-but-essential-services-to-remain-open-key-takeways-8185551.html',
    language='en')
article.download()
article.parse()
article.nlp()
print(article.summary)
import spacy
from newspaper import Article

nlp = spacy.load("en_core_web_sm")

url_1 = 'https://www.wsj.com/articles/u-s-officials-walk-out-of-meeting-at-presidential-palace-in-kabul-11553628051'
url_2 = 'https://www.wsj.com/articles/iran-moves-to-cement-its-influence-in-syria-11553632926'

article_1 = Article(url_1)
article_2 = Article(url_2)

article_1.download()
article_2.download()
article_1.parse()
article_2.parse()

article_stream = [article_1.text, article_2.text]

for doc in nlp.pipe(article_stream, batch_size=50):
    print(doc.vocab)
    # for entity in doc.ents:
    #     print(entity.text, entity.start_char, entity.end_char, entity.label_)
row_type = row['Type']
url = row['URL']
if row_type != 'None':
    # print(url)
    Aritle_URLs.append(url)

title_create(filename)

Article_Date = []
Article_Title = []

count = 1
for url in Aritle_URLs:
    print("Number : ", count)
    article = Article(url)
    article.download()
    if article.download_state == 0:
        print("Retrying in 25 seconds!!")
        time.sleep(25)
        article.download()
    if article.download_state != 1:
        print("GOOD")
        article.parse()
        dated = str(article.publish_date)
        if dated != 'None':
            the_date = (dated[:10])
            the_date = datetime.datetime.strptime(the_date, '%Y-%m-%d').strftime('%m/%d/%Y')
            fixed_date = month + the_date[2:10]
            print(fixed_date)
def get_article_similarity(url1, url2, log_articles=False):
    try:
        # Download and parse first article
        article1 = Article(url1, browser_user_agent=choice(static.HEADERS))
        article1.download()
        article1.parse()
        article1_text = article1.text

        # Download and parse second article
        article2 = Article(url2, browser_user_agent=choice(static.HEADERS))
        article2.download()
        article2.parse()
        article2_text = article2.text

        if log_articles:
            log.debug(f"Article 1: {article1_text}\n\nArticle 2: {article2_text}")

        # Compare the two articles and return the ratio (0-1)
        return SequenceMatcher(None, article1_text, article2_text).ratio()
    except (ArticleException, Exception):
        log.error(traceback.format_exc())
        log.warning("Couldn't compare articles")
        return None
def getArticle(url):
    article = Article(url)
    article.download()
    article.parse()
    return article
a = 0 print("ANZAHL: " + str(a)) for feed in feeds: d = feedparser.parse(feed[0]) for entrie in d.entries: #check link url = entrie.link #check date #check if in db #parse #title and try: article = Article(url, language='de', keep_article_html=True) article.download() article.parse() filename = feed[3] + ''.join( random.choices(string.ascii_uppercase + string.digits, k=8)) file = open("html/" + filename + ".html", "w") article.article_html = "<meta property='baseurl' content='" + feed[ 4] + "'>" + article.article_html article.article_html = "<script src='https://ajax.googleapis.com/ajax/libs/jquery/3.2.1/jquery.min.js'></script>" + article.article_html article.article_html = "<link href='https://fonts.googleapis.com/css?family=Slabo+27px' rel='stylesheet'>" + article.article_html article.article_html = "<img src='" + article.top_image + "' width='100%' >" + article.article_html article.article_html = "<h1 class='realTitle'>" + entrie.title + "</h1>" + article.article_html
url = "https://www.newindianexpress.com" page = requests.get(url) soup = BeautifulSoup(page.text, 'html.parser') articles = soup.findAll('a', class_="article_click") news = [] for row in articles: news.append(row['href']) #link = articles[row].find('a')['href'] #news.append(link) dataset = [] for i in news: article = Article(i, language="en") article.download() article.parse() article.nlp() data = {} data['Title'] = article.title data['Text'] = article.text data['Summary'] = article.summary data['Keywords'] = article.keywords dataset.append(data) #print(data) df = pd.DataFrame(dataset) # Importing the dataset uci_dataset = pd.read_csv('OnlineNewsPopularity.csv',
def align_row_text():
    source_text = request.form['source_text']
    target_text = request.form['target_text']

    # check if source and target are urls
    url_rex = r"(?i)\b(?:(?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))*\))+(?:\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    if re.fullmatch(url_rex, source_text.strip().lower()):
        src_article = Article(source_text.strip())
        src_article.download()
        src_article.parse()
        source_text = src_article.title + "\n" + src_article.text
    if re.fullmatch(url_rex, target_text.strip().lower()):
        tar_article = Article(target_text.strip())
        tar_article.download()
        tar_article.parse()
        target_text = tar_article.title + "\n" + tar_article.text

    # segment source and target
    src_lang_code = lang_detect.detect(source_text)
    tar_lang_code = lang_detect.detect(target_text)
    if src_lang_code == 'zh-cn':
        srx_src_code = 'Generic'
    else:
        srx_src_code = src_lang_code
    if tar_lang_code == 'zh-cn':
        srx_tar_code = 'Generic'
    else:
        srx_tar_code = tar_lang_code
    srx_rules = srx_segmenter.parse(srx_file_path)
    seg_results = srx_segmenter.SrxSegmenter(srx_rules[srx_src_code], source_text)
    source_list = seg_results.extract()[0]
    seg_results = srx_segmenter.SrxSegmenter(srx_rules[srx_tar_code], target_text)
    target_list = seg_results.extract()[0]

    # translate target
    target_mt_list = mt_helpers.google_translate_chunk_by_chunk(target_list, tar_lang_code, src_lang_code)

    # align
    # initiate the alignment class
    algorithm = request.form.get('algorithm', 'fuzzy')
    align_options = {
        "location_weight": float(request.form.get('input_location_weight', 0.2)),
        "length_weight": float(request.form.get('input_length_weight', 0.1)),
        "meta_weight": float(request.form.get('input_length_weight', 0.1)),
        "semantic_weight": float(request.form.get('input_semantic_weight', 0.6)),
        "search_range": float(request.form.get('input_paragraph_size', 5)),
        "minimum_semantic_score": float(request.form.get('input_minimum_semantic_score', 0.5)),
        "minimum_partial_sem_match": 0.1,
        "minimum_length_score": float(request.form.get('input_minimum_length_score', 0.6))
    }
    if algorithm == 'fuzzy':
        semantic_class = fuzzy_comp.FuzzyComp
    else:
        semantic_class = tfidf_scikit.TfidfComp
    alg = TranslationAligner()

    alg.align(semantic_class, source_list, target_list, [], target_mt_list, options=align_options)

    # save json file to a random file name under static files and return it with the results
    temp_file_name = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
    temp_json_file_name = temp_file_name + ".json"
    alg.export_json_dict(os.path.join(export_path, temp_json_file_name))
    del alg
    return {"json_file_name": temp_json_file_name}
mycursor = mydb.cursor()

sql = "SELECT * FROM Wordpress order by dominio asc"
mycursor.execute(sql)
sql = mycursor.fetchall()

for portal in sql:
    try:
        if portal[7] is not None:
            d = fp.parse(portal[7])
            for entry in d.entries:
                # Check if publish date is provided, if no the article is skipped.
                # This is done to keep consistency in the data and to keep the script from crashing.
                if hasattr(entry, 'published'):
                    try:
                        content = Article(entry.link)
                        content.download()
                        content.parse()
                    except Exception as e:
                        # If the download for some reason fails (ex. 404) the script will continue downloading
                        # the next article.
                        print(e)
                        print("continuing...")
                        continue
                    try:
                        twitter = content.meta_data["twitter"]
                    except Exception as e:
                        print("twitter")
                    try:
                        og = content.meta_data["og"]
                    except Exception as e:
def parse_artical(self, response):
    # Detailed parsing of an individual article
    ID = 'songtengteng'

    # Article URL
    news_url = response.meta['url']

    # Article title
    news_title = response.xpath('//h1/text()').extract_first()

    # Author
    a = response.xpath('//div[@class="info-source"]/span/a/text()').extract_first()
    if a == None:
        news_author = ''
    else:
        news_author = a

    # Publish time
    publish_time = response.xpath('//div[@class="info-source"]/span[2]/text()').extract_first()
    year = publish_time[0:4]
    month = publish_time[5:7]
    day = publish_time[8:10]
    juti_time = publish_time[-5:]
    publish_time = year + month + day + ' ' + juti_time + ':' + '00'

    # Body text
    '''A text-density algorithm could be considered here to parse the article body faster'''
    a = Article(response.meta['url'], language='zh')  # Chinese
    a.download()
    a.parse()
    news_content = a.text

    # Tags
    news_tags = ''

    # Images
    image_urls1 = response.xpath('//p[@class="pi"]/img/@src').extract()
    image_urls = []
    image_names = []
    if image_urls1 != []:
        for i in range(len(image_urls1)):
            image_url = image_urls1[i]
            image_urls.append(image_url)
            if i >= 0 and i < 10:
                image_title = news_title + '000' + str(i)
            elif i >= 10 and i < 100:
                image_title = news_title + '00' + str(i)
            elif i >= 100 and i < 1000:
                image_title = news_title + '0' + str(i)
            else:
                image_title = news_title + str(i)
            image_names.append(image_title)

    yield self.getItem(id=ID,
                       news_url=news_url,
                       website_name='搜狐焦点',
                       website_block='访谈',
                       news_title=news_title,
                       publish_time=publish_time,
                       news_author=news_author,
                       news_tags=news_tags,
                       news_content=news_content,
                       image_urls=image_urls,
                       image_names=image_names)
def get_article(url):
    article = Article(url, language='en')
    article.download()
    article.parse()
    return article
links.append(i.get('href'))

data = {}
count = 0
for i in links:
    urls = 'https://english.mathrubhumi.com/' + i
    page = requests.get(urls).text
    soup = BeautifulSoup(page)
    headline = soup.find("div", {"class": "common_text_en date_outer"})
    if headline:
        date = headline.get_text().strip()
        date_time = datetime.strptime(date[:-4], '%b %d, %Y, %I:%M %p')
        parag = soup.find("div", {"class": "articleBody common_text"})
        para = parag.find_all('p')
        place = para[0].get_text().split(':')[0]
        if date_time > recent:
            article = Article(urls, 'en')
            article.download()
            article.parse()
            article.nlp()
            summary = article.summary
            data[count] = [date_time, place, summary, article.keywords]
            count += 1

links = []
url = 'https://www.onmanorama.com/districts/'
districts = [
    'alappuzha', 'ernakulam', 'idukki', 'kannur', 'kasaragod', 'kollam',
    'kottayam', 'kozhikode', 'malappuram', 'palakkad', 'pathanamthitta',
    'thiruvananthapuram', 'thrissur', 'wayanad'
]
for d in districts:
def scrape_analyze(url):
    article = Article(url)
    article.download()
    article.parse()
    return article.text
newsPaper = {
    "rss": value['rss'],
    "link": value['link'],
    "articles": []
}
for entry in d.entries:
    if hasattr(entry, 'published'):
        if count > LIMIT:
            break
        article = {}
        article['link'] = entry.link
        date = entry.published_parsed
        article['published'] = datetime.fromtimestamp(mktime(date)).isoformat()
        try:
            content = Article(entry.link)
            content.download()
            content.parse()
        except Exception as e:
            print(e)
            print("continuing...")
            continue
        article['title'] = content.title
        article['text'] = content.text
        newsPaper['articles'].append(article)
        print(count, "articles downloaded from", company, ", url: ", entry.link)
        count = count + 1
else:
    # This is the fallback method if a RSS-feed link is not provided.
    # It uses the python newspaper library to extract articles