def parse(self, response):
    url = response.url
    full_url = response.urljoin(url)
    news = newspaper(full_url)
    dict_to_return = news.get_dict

    # Collect the article body paragraph by paragraph.
    all_paras = []
    for i in response.xpath("//div[@class='fs-17 pt-2 noto-regular']/p"):
        para = i.xpath(".//text()").get()
        all_paras.append(para)

    # Collect the topic tags listed under the article.
    tags_list = []
    for i in response.xpath(
            "//div[@class='pb-3 text-center fs-12 uk-text-69 noto-regular listed_topics']/a"):
        tag = i.xpath(".//text()").get()
        tags_list.append(tag)

    full_text = ' \n'.join(all_paras)
    dict_to_return['text_by_para'] = full_text
    dict_to_return['tags_list'] = tags_list
    yield dict_to_return
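# A minimal sketch of the Scrapy spider that would host the parse() callback
# above. The spider name and seed URL below are illustrative assumptions, not
# from the original project.
import scrapy
from newsfetch.news import newspaper

class ArticleSpider(scrapy.Spider):
    name = 'articles'                               # assumed spider name
    start_urls = ['https://www.example.com/news/']  # assumed seed URL

    def parse(self, response):
        # Delegate all metadata extraction to newsfetch, as in the
        # callback above.
        yield newspaper(response.url).get_dict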
def write_articles(topic):
    # link_file_name = 'data/links/news_' + topic + '.csv'
    link_file_name = 'data/links/moneycontrol_' + topic + '.csv'
    if not os.path.exists(link_file_name) or os.stat(link_file_name).st_size == 0:
        print('skipping, link file does not exist')
        return

    with open(link_file_name, 'r') as link_file:
        csv_reader = csv.reader(link_file)
        link_count = 0
        for row in csv_reader:
            link = row[0]
            link_count += 1
            print(link)
            news = newspaper(link)
            news_dict = news.__dict__
            # print(news_dict['get_dict'])
            if news_dict['get_dict']['headline'] == '':
                return

            # article_file_name = 'data/articles/articles_' + topic + '.csv'
            article_file_name = 'data/articles/moneycontrol_' + topic + '.csv'
            # if os.path.exists(article_file_name) and os.stat(article_file_name).st_size != 0:
            #     print('skipping, article file already exists')
            #     continue
            with open(article_file_name, 'a+') as article_file:
                csv_writer = csv.writer(article_file)
                # Write the CSV header once, before the first article row.
                if link_count == 1:
                    csv_writer.writerow(news_dict['get_dict'].keys())
                csv_writer.writerow(news_dict['get_dict'].values())
        print('total_links', link_count)
def run(input, client, output_path, index_name):
    INPUT = str(input)
    CLIENT = lib.create_connection(client)
    OUTPUT_PATH = output_path
    INDEX_NAME = index_name
    MODE = "url"

    if not lib.check_index(client=CLIENT, index=INDEX_NAME):
        lib.logger.debug(f"{INDEX_NAME} not found.")
        return Exception(f"{INDEX_NAME} not found.")

    website = newspaper(INPUT)
    fulltext = website.article

    # 'punkt' is required by the sentence tokenizer; ignore download errors
    # (e.g. offline runs where the model is already cached).
    try:
        nltk_download('punkt')
    except Exception:
        pass

    sentences = tokenize.sent_tokenize(fulltext.strip())
    scores_sentences = lib.get_scores(CLIENT, INDEX_NAME, sentences)
    format_scores_sentences = lib.format_scores(sentences, scores_sentences)
    result = lib.save_result(fulltext, INDEX_NAME, INPUT,
                             format_scores_sentences, OUTPUT_PATH, MODE)
    return result
def update_sentiment(n_clicks, input_value):
    news = newspaper(input_value)
    publish_date = news.date_publish
    headline = news.headline
    body = news.article
    # VADER compound polarity is in [-1, 1]; round to two decimals.
    sentiment = round(sid.polarity_scores(body)['compound'], 2)
    summary = news.summary
    keywords = ', '.join(news.keywords)
    authors = ', '.join(news.authors)
    return sentiment, publish_date, headline, body, summary, keywords, authors
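# A hedged sketch of the Dash wiring this callback signature implies: one
# button Input supplying n_clicks, the URL field as State, and seven Output
# targets matching the seven returned values. Every component id here is an
# assumption for illustration.
import dash
from dash.dependencies import Input, Output, State

app = dash.Dash(__name__)

app.callback(
    [Output('sentiment', 'children'),
     Output('publish-date', 'children'),
     Output('headline', 'children'),
     Output('body', 'children'),
     Output('summary', 'children'),
     Output('keywords', 'children'),
     Output('authors', 'children')],
    [Input('submit-button', 'n_clicks')],
    [State('url-input', 'value')],
)(update_sentiment)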
def prediction():
    news = []
    text = newspaper(data["data"])
    news.append(text.article)
    # Tokenize and pad to the sequence length used at training time.
    news = lstm['tokenizer'].texts_to_sequences(news)
    news = tf.keras.preprocessing.sequence.pad_sequences(news, padding='post',
                                                         maxlen=256)
    pred = lstm['model'].predict(news)
    # Assuming a sigmoid output in [0, 1]: the original compared against 0,
    # which a sigmoid never goes below, so everything printed as 'true'.
    print('fake' if pred[0][0] < 0.5 else 'true')
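# prediction() relies on module-level globals: an `lstm` bundle holding the
# fitted tokenizer and trained model, and a `data` dict carrying the request
# payload. A plausible loading sketch (both file paths are assumptions):
import pickle
import tensorflow as tf

with open('tokenizer.pickle', 'rb') as fh:  # assumed path
    tokenizer = pickle.load(fh)
lstm = {
    'tokenizer': tokenizer,
    'model': tf.keras.models.load_model('lstm_model.h5'),  # assumed path
}
data = {'data': 'https://example.com/some-article'}  # stub request payload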
print(dir)
print(f"ATTEMPTING : {url} ({external_link})")
r = requests.get(url, timeout=1000, allow_redirects=True)
if r.status_code != 200:
    print(f"WEB ARCHIVE ERROR: {r.status_code}")
    f2.write(dir)
    f2.write("\n")
    continue
# r.url is the final URL after redirects.
out['request_response_url'] = r.url
post['archived_link'] = out['request_response_url']
post['original_link'] = external_link

# print("NEWSPAPER")
n = newspaper(external_link)
# pprint(n.get_dict)
out['summary_title'] = n.headline
out['summary_description'] = n.description
out['summary_summary'] = n.summary
out['summary_article'] = n.article
out['summary_dict'] = n.get_dict
# print("CPZ")

f.write(json.dumps(out))
f.write("\n")
post['title'] = out['summary_title']
    # print(driver.save_screenshot("ERROR.png"))
    pass

# Wayback Machine: reuse the newest snapshot, or save one if none exists.
print("WAYBACK MACHINE")
wayback = waybackpy.Url(res['external_link'])
try:
    res['archived_url'] = wayback.newest().archive_url
except Exception:
    print("SAVING ON WAYBACK")
    wayback.save()
    res['archived_url'] = wayback.newest().archive_url

# Newsfetch metadata
print("NEWSFETCH")
n = newspaper(res['external_link']).get_dict
res['headline'] = n['headline']
res['summary'] = n['summary']
res['article'] = n['article']
res['description'] = n['description']
res['publication'] = n['publication']
res['date'] = n['date_publish']
res['url'] = n['url']
res['original_url'] = n['url']

f.write(json.dumps(res))
f.write("\n")
f.flush()
out.append(res)
from PIL import Image
# from answers import POSITIVE, NEGATIVE
import pytesseract
import json
import sys
import base64
import pathlib
import string

from newsfetch.news import newspaper

PATH = "C:/Users/Joao/Documents/Projetos/TCC/fake-news-detector/genuino/src/"
pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

# Portuguese yes/no answers matched against user input in the
# (commented-out) confirmation prompt below.
# POSITIVE = ["sim", "s", "certo", "correto", "isso", "isso mesmo", "ok"]
# NEGATIVE = ["não", "n", "errado", "incorreto", "não é isso", "está errado", "ok"]

news = newspaper('http://www.agendadopoder.com/lstarticle.aspx?id=3623')
print(news.article)

# Decode the base64 image payload and write it to disk for OCR.
with open(PATH + "tokens/base64.json") as file:
    data = json.load(file)
data['img'] = data['img'].replace("data:image/jpeg;base64,", "")
img = base64.b64decode(data['img'])

filename = PATH + "assets/img/received/img.jpg"
with open(filename, 'wb') as f:
    f.write(img)

print(pytesseract.image_to_string(Image.open(filename), lang='por'))

# print("Is the text correct?")
# res = input()
# if res in POSITIVE:
#     print("Analyzing the news article, please wait...")
def grab_data():
    asli = []
    for j in websites:
        response = requests.get(j)
        soup = BeautifulSoup(response.text, 'html.parser')
        url = soup.find_all('a')
        # Keep only the href of each anchor tag.
        for i in range(len(url)):
            try:
                url[i] = url[i]['href']
            except:
                try:
                    url.remove(url[i])
                except:
                    pass
        # De-duplicate while preserving order.
        var = []
        for i in url:
            if i not in var:
                var.append(i)
        url = var
        # Load the set of URLs already crawled; create the file on the first
        # run (in that case already_parsed stays undefined and the NameError
        # below is swallowed by the bare except).
        try:
            f = open('urlparsed.txt', 'r')
            already_parsed = f.read().split('\n')
            f.close()
        except:
            f = open('urlparsed.txt', 'w')
            for i in url:
                f.write(str(i))
                f.write('\n')
            f.close()
        # Drop already-crawled URLs and persist the updated list.
        try:
            for i in already_parsed:
                try:
                    url.remove(i)
                except:
                    pass
            for i in url:
                already_parsed.append(i)
            f = open('urlparsed.txt', 'w')
            for i in already_parsed:
                f.write(str(i))
                f.write('\n')
            f.close()
        except:
            pass
        # Keep only substantial articles: a long body plus a usable
        # description or summary.
        for i in url:
            try:
                if 'https' not in i and 'http' not in i:
                    i = j + i  # Resolve relative links against the site root.
                response = requests.get(i, timeout=10)
                details = newspaper(i)
                count = len(details.article)
                publish_date = details.date_publish
                cr_date = details.date_download
                description = details.description
                summary = details.summary
                category = details.category
                if count > 1500 and (len(description) > 10 or len(summary) > 10):
                    asli.append(i)
            except:
                pass

    headline = []
    timestamp = []
    AUTHORS = []
    SUMMARY = []
    date_crawled = []
    news_source = []
    full = []
    img_url = []
    keywords = []
    url_news = []
    types = []
    for i in asli:
        try:
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            driver = webdriver.Chrome('/usr/bin/chromedriver',
                                      chrome_options=chrome_options)
            driver.get(i)
            details = newspaper(i)
            if 'bbc' in i:
                news_source.append('bbc')
            elif 'techcrunch' in i:
                news_source.append('techcrunch')
            elif 'theguardian' in i:
                news_source.append('theguardian')
            elif 'voanews' in i:
                news_source.append('voanews')
            elif 'abc.net' in i:
                news_source.append('abc')
            headline.append(details.headline)
            timestamp.append(details.date_publish)
            url_news.append(i)
            types.append('newspaper')
            AUTHORS.append(', '.join(details.authors))
            keywords.append(', '.join(details.keywords))
            if len(details.summary) > 10:
                SUMMARY.append(details.summary)
            else:
                SUMMARY.append(details.description)
            date_crawled.append(details.date_download)
            full.append(details.article)
            # Grab the first .jpg image on the page to use as a thumbnail.
            try:
                elements = driver.find_elements_by_tag_name('img')
                for element in elements:
                    if '.jpg' in element.get_attribute('src'):
                        im = element.get_attribute('src')
                        break
                if len(im) > 3:
                    img_url.append(im)
                else:
                    img_url.append(None)
            except:
                img_url.append(None)
            driver.close()
        except:
            try:
                driver.close()
            except:
                pass

    final = pd.DataFrame({'Title': headline, 'Author': AUTHORS,
                          'Summary': SUMMARY, 'full_text': full,
                          'date_published': timestamp,
                          'date_crawled': date_crawled,
                          'news_source': news_source, 'img': img_url,
                          'keywords': keywords, 'url_news': url_news,
                          'Types': types})
    for i in final.index:
        try:
            t = pd.DataFrame()
            t = t.append(final.loc[i])
            t.reset_index(drop=True, inplace=True)
            try:
                count = search(t.loc[0]['Title'], t.loc[0]['news_source'])
                # Skip stories already seen 25+ times; the None check must
                # come first (the original compared None with < and raised).
                if count is None or count < 25:
                    test = t.loc[0].to_json()
                    send_data(test, t.loc[0]['news_source'])
            except:
                test = t.loc[0].to_json()
                send_data(test, t.loc[0]['news_source'])
        except Exception as e:
            pass
def get_title(url):
    return newspaper(url).headline
def summarize(url, queue):
    news = newspaper(url)
    queue.put(summarizer.summarize(news.article, words=100))
def get_keywords(url, queue):
    news = newspaper(url)
    queue.put(news.keywords)
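# summarize() and get_keywords() both report results through a queue, which
# suggests they are meant to run off the main thread. An illustrative (not
# original) way to drive them concurrently with the standard library:
import threading
import queue

summary_q, keywords_q = queue.Queue(), queue.Queue()
url = 'https://example.com/some-article'  # assumed URL
threads = [
    threading.Thread(target=summarize, args=(url, summary_q)),
    threading.Thread(target=get_keywords, args=(url, keywords_q)),
]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(summary_q.get())
print(keywords_q.get())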
## This Python code lets you fetch the News Article Details using the Article Link URL

from newsfetch.news import newspaper
import json

news = newspaper(
    'https://edition.cnn.com/travel/article/disney-world-trip-planning-2020/index.html'
)

# GET RESULT IN JSON
data = {
    'Article': [{
        'headline': news.headline,
        'author': news.authors,
        'publish_date': news.date_publish,
        'modify_date': news.date_modify,
        'download_date': news.date_download,
        'image_url': news.image_url,
        'filename': news.filename,
        'description': news.description,
        'publication': news.publication,
        'category': news.category,
        'source_domain': news.source_domain,
        'article': news.article,
        'summary': news.summary,
        'keyword': news.keywords,
        'title_page': news.title_page,
        'title_rss': news.title_rss,
        'url': news.url
    }]
}
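# The json import above suggests the final step is serialization; a minimal
# way to print the result (default=str covers datetime fields such as
# publish_date):
print(json.dumps(data, indent=2, default=str))

# Equivalently, newsfetch exposes all of these fields as a single dictionary
# via news.get_dict, as several of the snippets above demonstrate.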
def get_sentences(worp, n=15):
    wordorphrase = worp
    mode = 1
    sentence_list = []
    link_read = []
    repeats = 0
    # Cycle through Google News, Bing, and Yahoo (modes 1-3) until n
    # sentences containing the word/phrase are collected. The bare
    # `raise Exception` calls act as a multi-level break out of the
    # nested search loops.
    while len(sentence_list) < n:
        if mode == 1:
            try:
                google_news = GNews(max_results=75)
                # google_news = GNews(max_results=2)
                temp = google_news.get_news(wordorphrase)
                for i in range(len(temp)):
                    # print(temp[i]['url'])
                    link = temp[i]['url']
                    print("No of sentences collected:", len(sentence_list), " " * 10, end='\r')
                    if repeats == 15 or len(sentence_list) >= n:
                        repeats = 0
                        raise Exception
                    if link in link_read:
                        repeats = repeats + 1
                        continue
                    # Skip links that are unreachable within 3 seconds.
                    try:
                        requests.get(link, timeout=3)
                    except:
                        continue
                    link_read.append(link)
                    news = newspaper(link)
                    article_data = news.article
                    if wordorphrase in article_data:
                        sent = sentences(wordorphrase, article_data)
                        sentence_list.append(str(len(sentence_list) + 1) + ") " + sent + "<br>")
            except:
                mode = 2
                continue
            mode = 2
        elif mode == 2:
            try:
                url = bing_news.get_search_url(wordorphrase)
                next_page_url = url
                while next_page_url is not None:
                    resp = requests.get(next_page_url)
                    html = resp.text
                    results, next_page_url = bing_news.extract_search_results(html, url)
                    # print(len(results))
                    for result in results:
                        # print(result['url'])
                        print("No of sentences collected:", len(sentence_list), " " * 10, end='\r')
                        if repeats == 15 or len(sentence_list) >= n:
                            repeats = 0
                            raise Exception
                        if result['url'] in link_read:
                            repeats = repeats + 1
                            continue
                        # Probe the result's own URL (the original probed the
                        # stale `link` variable left over from mode 1).
                        try:
                            requests.get(result['url'], timeout=3)
                        except:
                            continue
                        link_read.append(result['url'])
                        news = newspaper(result['url'])
                        article_data = news.article
                        if wordorphrase in article_data:
                            try:
                                sent = sentences(wordorphrase, article_data)
                                sentence_list.append(str(len(sentence_list) + 1) + ") " + sent + "<br>")
                            except:
                                continue
                    time.sleep(10)
            except:
                mode = 3
                continue
            mode = 3
        elif mode == 3:
            try:
                url = yahoo_news.get_search_url(wordorphrase)
                next_page_url = url
                while next_page_url is not None:
                    resp = requests.get(next_page_url)
                    html = resp.text
                    results, next_page_url = yahoo_news.extract_search_results(html, url)
                    # print(len(results))
                    for result in results:
                        # print(result['url'])
                        print("No of sentences collected:", len(sentence_list), " " * 10, end='\r')
                        if repeats == 10 or len(sentence_list) >= n:
                            repeats = 0
                            raise Exception
                        if result['url'] in link_read:
                            repeats = repeats + 1
                            continue
                        try:
                            requests.get(result['url'], timeout=3)
                        except:
                            continue
                        link_read.append(result['url'])
                        news = newspaper(result['url'])
                        article_data = news.article
                        if wordorphrase in article_data:
                            try:
                                sent = sentences(wordorphrase, article_data)
                                sentence_list.append(str(len(sentence_list) + 1) + ") " + sent + "<br>")
                            except:
                                continue
                    time.sleep(10)
            except:
                break
            break
        # print("Curr Mode: ", mode, " " * 15)
        time.sleep(10)
    print(sentence_list)
    return " ".join(sentence_list)