def get_news():
    urls = get_urls()
    news = News.query.with_entities(News.source_url).all()
    used_urls = []
    for n in news:
        used_urls.append(n[0])
    for url in urls:
        if url not in used_urls:
            used_urls.append(url)
            article = Article(url, language='pt', keep_article_html=True)
            article.download()
            article.parse()
            article.nlp()
            news_article = News(url)
            news_article.slug = slugify(article.title)
            news_article.title = article.title
            news_article.text = article.text
            news_article.top_image = article.top_image
            news_article.summary = article.summary
            news_article.article_html = article.article_html
            news_article.created_at = datetime.datetime.now()
            exists_this_news = News.query.filter_by(source_url=url).first()
            if not exists_this_news:
                print(url)
                db.session.add(news_article)
                db.session.commit()
def extract():
    url = sys.argv[1:].pop()
    a = Article(url, keep_article_html=True)
    a.download()
    a.parse()
    a.nlp()
    parsed_uri = urlparse(a.source_url)
    domain = '{uri.netloc}'.format(uri=parsed_uri)
    try:
        publish_date = a.publish_date.strftime('%Y-%m-%d %H:%M')
    except AttributeError:
        publish_date = ""
    try:
        authors = ", ".join(a.authors)
    except AttributeError:
        authors = ""
    result = {}
    result['html'] = a.html
    result['body'] = a.text
    result['title'] = a.title
    result['top_image'] = a.top_image
    result['author'] = authors
    result['html_body'] = a.article_html
    result['favicon'] = a.meta_favicon
    result['description'] = a.summary
    result['publish_date'] = publish_date
    result['keywords'] = a.keywords
    result['sitename'] = re.sub(r"^www\.", "", domain)  # escape the dot so only a literal "www." prefix is stripped
    return json.dumps(result).encode('utf-8')
def summarise_one(url, title=True, keywords=True, summary=False,
                  top_img_src=False):
    ''' Get url and return summary '''
    # configuration for Newspaper to minimize processing time
    # (built first and passed to Article so it actually takes effect)
    configure = Config()
    configure.fetch_images = False
    configure.MAX_SUMMARY = 300
    configure.MAX_SUMMARY_SENT = 3
    article = Article(url, config=configure)
    try:
        article.download()
        article.parse()
    except:
        print(url)
    title = article.title
    if keywords or summary:
        try:
            article.nlp()
            if keywords:
                keywords = article.keywords
            if summary:
                summary = article.summary
        except:
            print('Newspaper error with nlp() call')
    if top_img_src:
        top_img_src = article.top_image
    return title, keywords, summary, top_img_src
def is_valid_article(link):
    print("Checking valid:\n" + link)
    if "cnn.com" not in link:
        return False
    if "html" not in link:
        return False
    article = Article(link)
    article.download()
    article.parse()
    article.nlp()
    keywords = article.keywords
    matched = False
    for key in keywords:
        if key in nc_set:
            matched = True
    for key in keywords:
        if key in contorversial_set:
            matched = False
    if matched and len(article.authors) > 0 and article.publish_date < datetime.datetime(2007, 12, 30, 0, 0):
        # keywords is a list, so join it before concatenating into the output line
        main_file.write(article.title + "\t\t" + " ".join(article.keywords) + "\t\t" + link + "\t\t" + article.text + "\n")
        visited_articles.write(link + "\n")
        return True
    return False
def show_article():
    url_to_clean = request.args.get('url_to_clean')
    if not url_to_clean:
        return redirect(url_for('index'))
    article = Article(url_to_clean)
    article.download()
    article.parse()
    try:
        html_string = ElementTree.tostring(article.clean_top_node)
    except:
        html_string = "Error converting html to string."
    try:
        article.nlp()
    except:
        log.error("Couldn't process with NLP")
    a = {
        'html': html_string,
        'authors': str(', '.join(article.authors)),
        'title': article.title,
        'text': article.text,
        'top_image': article.top_image,
        'videos': str(', '.join(article.movies)),
        'keywords': str(', '.join(article.keywords)),
        'summary': article.summary
    }
    return render_template('article/index.html', article=a, url=url_to_clean)
def post_new(request):
    if request.method == "POST":
        form = PostForm(request.POST)
        if form.is_valid():
            post = form.save(commit=False)
            post.author = request.user
            post.published_date = timezone.now()
            post.save()
            return redirect('blog.views.post_detail', pk=post.pk)
    elif request.method == 'GET':
        url = request.GET.get('url', '')
        if len(url) > 5:
            article = Article(url, language='en')
            article.download()
            article.parse()
            article.nlp()
            image = article.top_image
            summary = article.summary.replace('\n', ' ').replace(u'\u2019', "'")
            title = article.title.replace(u'\u2019', "'")
            source = url.split('//')[1].split('/')[0].replace('www.', '')
            status = 'UD'
            form = PostForm({'title': title, 'summary': summary, 'image': image,
                             'link': url, 'source': source, 'status': status})
        else:
            form = PostForm()
    return render(request, 'blog/post_edit.html', {'form': form})
def get_nlp_data(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    return json.dumps(article.keywords)
def get_document_json(url):
    """
    Parameters
    -------------
    url: str
        url of the document to be parsed.

    Returns
    -------------
    dict: document data.
    """
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    if article.publish_date is None or isinstance(article.publish_date, str):
        date = None
    else:
        date = article.publish_date.strftime('%Y-%m-%d')
    if article.meta_lang is not None and article.meta_lang != '':
        stopwords = safe_get_stop_words(article.meta_lang)
        keywords = [i for i in article.keywords if i not in stopwords]
    else:
        keywords = article.keywords
    keywords = list(set([slugify(i) for i in keywords]))
    document = {
        'title': article.title,
        'authors': article.authors,
        'created_on': date,
        'language': article.meta_lang,
        'keywords': keywords,
        'url': url,
    }
    return document
def extract_summary(article_url):
    #article_url = raw_input('Please enter the url of the newsarticle \n')
    article_obj = Article(article_url)
    article_obj.download()
    article_obj.parse()
    article_obj.nlp()
    article_summary = article_obj.summary
    return article_summary
def extract_keywords(article_url):
    #article_url = raw_input('Please enter the url of the newsarticle \n')
    article_obj = Article(article_url)
    article_obj.download()
    article_obj.parse()
    article_obj.nlp()
    article_keywords = article_obj.keywords
    return article_keywords
def main():
    source = "The Guardian"
    #config = Config()
    #config.memoize_articles = False
    guardian = Source("http://www.theguardian.com/world", memoize_articles=False)
    guardian.build()
    #guardian = newspaper.build('http://theguardian.com/world', memoize_articles=False)
    #news_pool.set([guardian], threads_per_source=2)
    #news_pool.join()
    #print(guardian.size())
    for article in [x for x in guardian.articles if re.match(".*/world/.*", x.url) is not None]:
        url = article.url
        a = Article(url, language='en')
        a.download()
        for i in range(10):
            if a.is_downloaded:
                break
            else:
                a.download()
        try:
            a.parse()
            a.nlp()
        except Exception:
            print("Error: Not parsed/downloaded correctly.")
            continue
        html = a.html
        summary = a.summary
        keywords = a.keywords
        title = a.title
        text = a.text
        # Reformat the publish date from YYYY-MM-DD to MM/DD/YYYY
        date = str(a.publish_date).split()[0].split("-")
        date[0], date[1], date[2] = date[1], date[2], date[0]
        date = "/".join(date)
        # Pull the publication time out of the dateline span in the page HTML
        time_str = re.search(r'<span class="content__dateline-time">(.*)</span>', html) \
                     .group(1).replace(".", ":").split()[0]
        date_time = date + " " + time_str
        #print(title)
        #print(date_time)
        date_obj = datetime.datetime.strptime(date_time, '%m/%d/%Y %H:%M')
        #print(date_obj.strftime('%Y/%m/%d %I:%M %p'))
        #TODO: Add stuff to the DB
        try:
            article = {
                'headline': title,
                'url': url,
                'text': text,
                'date': date_obj
            }
            newspaper_article('The Guardian', article, keywords=keywords)
        except Exception as ex:
            print('Article could not be created due to following error')
            print(ex)
def get_article_info(memento_url, dt, uri_id, base_dir):
    print(memento_url)
    article = Article(memento_url)
    html = get_uri_offline_data(dt, uri_id, "html", base_dir)
    article.download(html)
    article.parse()
    text = get_uri_offline_data(dt, uri_id, "txt", base_dir)
    if text is not None:
        article.text = text
    article.nlp()
    return article
def parseURL(url):
    a = Article(url)
    try:
        a.download()
        a.parse()
        a.nlp()
        authors = a.authors
        keywords = a.keywords
        del a
        return (authors, keywords)
    except:
        return (None, None)
def fetch_article(url):
    print('fetch ' + url)
    a = Article(url=url, keep_article_html=True)
    a.download()
    try:
        a.parse()
    except Exception:
        exc = traceback.format_exc()
        print("Parse error: " + exc)
    # newspaper gives us some news stuff
    text = a.article_html
    title = a.title
    image = a.top_image
    movies = a.movies
    authors = a.authors
    article_data = {
        "url": url,
        "title": title,
        "text": text
    }
    if authors:
        article_data["author"] = {"name": authors[0]}
    # media
    if movies:
        article_data["media"] = {"url": movies[0], "type": "video"}
    elif image:
        article_data["media"] = {"url": image, "type": "image"}
    try:
        a.nlp()
    except Exception:
        exc = traceback.format_exc()
        print("NLP error: " + exc)
    if a.summary:
        article_data["summary"] = a.summary
    return article_data
def quick_analyse(url):
    fields = ['authors', 'publish_date', 'top_image', 'movies', 'keywords', 'summary']
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    print(len(article.html))
    for f in fields:
        print(f + ': ' + str(getattr(article, f)))
    return article
def read_newspaper():
    url = request.args.get('url', '')
    if url:
        a = Article(url, image_dimension_ration=3, keep_article_html=True)
        a.download()
        a.parse()
        a.nlp()
        json_string = json.dumps(
            dict(top_image=a.top_image,
                 text=a.article_html,
                 title=a.title,
                 summary=a.summary,
                 images=list(a.images),  # images is a set; convert so json.dumps can serialize it
                 movies=a.movies),
            ensure_ascii=False,
            indent=None if request.is_xhr else 2)
        return Response(json_string, mimetype='application/json')
    return Response()
def fetch_article(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    print(article.keywords)
    with app.app_context():
        entity = ArticleEntity.create(
            publish_date=article.publish_date,
            title=article.title,
            text=article.text,
        )
def whoHasTimeToRead(url):
    is_article = valid_url(url, verbose=True)
    config = Config()
    config.MAX_KEYWORDS = 10
    if is_article:
        sumitup = {}
        b = Article(url=url, config=config)
        b.download()
        b.parse()
        b.nlp()
        sumNews = summary(b.title, b.text, b.keywords)
        sumTitle = b.title
        movies = b.movies[0] if len(b.movies) > 0 else "None"
        return sumNews, sumTitle, movies
    return "Nope"
def getrelevance(url, keywords):
    a = Article(url)
    a.download()
    a.parse()
    a.nlp()
    print(a.title)
    print(a.summary)
    l1 = a.keywords
    no = getsimilar(l1, keywords)
    print(keywords)
    print(l1)
    print("Similar words: ")
    print(no)
    print(len(keywords))
    print("Match: ")
    print(float(no) / len(keywords))
def train(urls, keywords):
    for url in urls:
        a = Article(url)
        a.download()
        a.parse()
        a.nlp()
        print(a.title)
        print(a.summary)
        print(a.keywords)
        l1 = a.keywords
        l1_1 = []
        for word in l1:
            l1_1.append(unicodedata.normalize('NFKD', word).encode('ascii', 'ignore'))
        diff = difference(l1_1, keywords)
        keywords.extend(diff)
    return keywords
def __init__(self, url):
    c = Config()
    c.keep_article_html = True
    article = Article(url=url, config=c)
    article.download()
    article.parse()
    try:
        article.nlp()
        summary = article.summary
        if summary == "":
            self.summary = "Summary not available!"
        else:
            self.summary = summary
    except Exception:
        self.summary = "Summary not available!"
def _parse_article(self, key, url):
    a = Article('')
    html = Google().cache(url)
    a.set_html(html)
    a.parse()
    a.nlp()
    article = {
        "summary": a.summary,
        "publish_date": a.publish_date,
        "images": a.images,
        "top_image": a.top_image,
        "title": a.title,
        "authors": a.authors,
        "keywords": a.keywords,
        "text": a.text
    }
    # update
    #conn = r.connect(db="clearspark")
    conn = r.connect(**rethink_conn.conn())
def home(url):
    data = {}
    data['url'] = url
    # Validate url
    if urlparse.urlparse(url).scheme not in ('http', 'https'):
        data['error'] = 'Invalid URL'
        return json.dumps(data)
    a = Article(url)
    a.download()
    a.parse()
    data['title'] = a.title
    data['authors'] = a.authors
    data['text'] = a.text
    try:
        a.nlp()
    except UnicodeDecodeError:
        # Strip non-ascii characters
        a.title = to_ascii(a.title)
        a.text = to_ascii(a.text)
        a.nlp()
    # NLP
    data['summary'] = a.summary
    data['keywords'] = a.keywords
    data['tags'] = list(a.tags)
    # Media
    data['top_image'] = a.top_image
    data['images'] = list(a.images)  # images is a set; convert so json.dumps can serialize it
    data['movies'] = a.movies
    # Meta
    data['source_url'] = a.source_url
    data['published_date'] = str(a.publish_date)  # Article exposes publish_date, not published_date
    data['meta_img'] = a.meta_img
    data['meta_keywords'] = a.meta_keywords
    data['meta_lang'] = a.meta_lang
    return json.dumps(data)
def scrap(index):
    base_url = "https://www.google.co.in/search?q=chennai%20accidents&tbm=nws&start=" + str(index)
    web_page = requests.get(base_url)
    parsed_content = PyQuery(web_page.text)
    all_crimes = parsed_content("a")
    for crime in all_crimes:
        crime_url = crime.attrib["href"]
        if "/url?q=" in crime_url:
            try:
                article = Article((crime_url.split(start))[1].split(end)[0])
                article.download()
                article.parse()
                article.nlp()
                keywords = article.keywords
                area_name = findLocation(keywords)
                final1.append(area_name)
            except Exception:
                pass
def parse_article(url):
    article = Article(url)
    article.download()
    try:
        article.parse()
        article.nlp()
    except ArticleException:
        # TODO: log the error
        return None
    else:
        return {
            'url': article.url,
            'title': article.title,
            'keywords': article.keywords,
            'summary': article.summary,
            'images': article.images,
            'movies': article.movies
        }
def fetch_data(bbc):
    bbc.build()
    for article in [x for x in bbc.articles]:
        url = article.url
        a = Article(url, language='en')
        a.download()
        for i in range(10):
            if a.is_downloaded:
                break
            else:
                a.download()
        try:
            a.parse()
            a.nlp()
        except Exception:
            print("Error: Not parsed/downloaded correctly.")
            continue
        html = a.html
        summary = a.summary
        keywords = a.keywords
        title = a.title
        print(title)
        text = a.text
        #date = str(a.publish_date).split()[0].split("-")
        #date[0], date[1], date[2] = date[1], date[2], date[0]
        #date = "/".join(date)
        #time = re.search(r'<span class="date date--v2 relative-time">(.*)<\/span>', html).group(1).replace(".", ":").split()[0]
        #bbc does not have a time div in html
        date_time = datetime.now()
        try:
            article = {
                'headline': title,
                'url': url,
                'text': text,
                'date': date_time
            }
            newspaper_article('BBC', article, keywords=keywords)
        except Exception as ex:
            print('Article could not be created due to following error')
            print(ex)
def Test(topic, url):
    print("Inside hello method")
    print(topic)
    # url = 'http://www.theguardian.com/technology/live/2015/mar/09/apple-watch-macbook-launch-event-smartwatch-spring-forward'
    # url = 'http:~~www.theguardian.com~technology~live~2015~mar~09~apple-watch-macbook-launch-event-smartwatch-spring-forward'
    url = url.replace('~', '/')
    article = Article(url)
    article.download()
    article.parse()
    text = article.text
    article.nlp()
    art_keywords = article.keywords
    art_summary = article.summary
    # print(text)
    print(art_summary)
    # print(art_keywords)
    get_tweets(topic, art_keywords)
    # return ''.join(art_keywords)
    return json.dumps(art_keywords)
def get_article_array(url_array):
    arr = []
    for url in url_array:
        try:
            response = session.get(url, timeout=10)
            print("RESPONSE CODE: " + str(response.status_code))
            if response.ok:
                article = Article(url)
                article.download()
                article.parse()
                article.nlp()
                text = article.text
                arr.append(strip_unicode(text))
            else:
                print("error")
        except (requests.HTTPError, requests.ConnectionError) as e:
            # a tuple is needed to catch either exception; "A or B" only catches the first
            print(e)
    return arr
def post(self, request, *args, **kwargs):
    url = request.POST.get("url")
    context = {}
    a = Article(url, language='en')
    a.download()
    a.parse()
    context["title"] = a.title
    context["text"] = a.text
    context["authors"] = ", ".join(a.authors)
    context["top_image"] = a.top_image
    a.fetch_images()
    context["images"] = a.images
    context["publish_date"] = a.publish_date
    context["movies"] = a.movies
    a.nlp()
    context["keywords"] = ", ".join(a.keywords)
    context["summary"] = a.summary
    context["url"] = url
    context["method"] = "post"
    return render(request, self.template_name, context)
def set_slyp(self):
    a = Article(self.raw_url)
    a.download()
    a.parse()
    a.nlp()
    self.url = a.url.split('?')[0] if 'youtube' not in a.site_name else a.url
    self.raw_url = a.url
    self.slyp_type = 'video' if a.is_video() else 'article'
    self.title = a.title
    self.author = a.author
    self.date = a.publish_date
    self.text = a.text
    self.summary = a.summary
    self.description = a.description
    self.top_image = a.top_image
    self.site_name = a.site_name
    self.has_video = a.has_video()
    self.video_url = a.video_url
    self.keywords = a.keywords
def grab_news_from_RSS(inputpath, outpath, opn='write'):
    all_links = []
    jlines = []
    with open(inputpath, 'r') as filer:
        lines = filer.readlines()
    lines = [line.strip() for line in lines]
    # for line in lines:
    #     temp_list = get_links_from_rss_feed(line)
    #     if temp_list != []:
    #         all_links.append(temp_list)
    # print("Done Parsing XML_Files")
    with jsonlines.open(outpath, 'w') as op:
        for i_1, rss_link in enumerate(lines):
            print('processing RSS link ', i_1)
            list_al = get_links_from_rss_feed(rss_link)
            for i, link in enumerate(list_al):
                # print(link)
                try:
                    t1 = time.time()
                    article = Article(link)
                    article = timeout_setter(article, 100)
                    article.download()
                    article.parse()
                    article.nlp()
                    temp_dict = {}
                    temp_dict["authors"] = article.authors
                    if article.publish_date:
                        temp_dict["publish_date"] = article.publish_date.strftime("%m/%d/%Y, %H:%M:%S")
                    else:
                        temp_dict['publish_date'] = 'nil'
                    temp_dict["text"] = article.text
                    temp_dict["keywords"] = article.keywords
                    temp_dict["summary"] = article.summary
                    temp_dict['url'] = article.url
                    temp_dict['rss_link'] = rss_link
                    temp_dict['title'] = article.title
                    t2 = time.time()
                    if article.title != '':
                        if detect(article.title) != 'en':
                            continue  # or whatever thing you wish to do in this case
                    if article.text != '':
                        if detect(article.text) != 'en':
                            continue  # or whatever thing you wish to do in this case
                    if i % 50 == 0:
                        print('time taken', str(t2 - t1))
                    if opn == 'write':
                        op.write(temp_dict)
                    else:
                        jlines.append(temp_dict)
                except Exception as E:
                    print(E)
                    print(traceback.format_exc())
                    pass
    if opn != 'write':
        return jlines
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html5lib')
table = soup.findAll('a', attrs={'class': 'w_img'})
news = []
for row in table:
    if not row['href'].startswith('http'):
        news.append('https://timesofindia.indiatimes.com' + row['href'])

import nltk
nltk.download('punkt')

df = []
for i in news:
    article = Article(i, language="en")
    article.download()
    article.parse()
    article.nlp()
    data = {}
    data['Title'] = article.title
    data['Text'] = article.text
    data['Summary'] = article.summary
    data['Keywords'] = article.keywords
    df.append(data)

dataset = pd.DataFrame(df)
dataset.head()

FILEPATH = r"C:\Users\pavan\Downloads\crawl.csv"


def TrainTestSplit(X, Y, R=0, test_size=0.2):
    return train_test_split(X, Y, test_size=test_size, random_state=R)
from newspaper import Article

#A news article from this website
url = "http://www.thehindu.com/opinion/lead/entering-the-age-of-gst/article19189469.ece"

#For different language newspaper refer above table
news_article = Article(url, language="en")  # en for English

#To download the article
news_article.download()

#To parse the article
news_article.parse()

#To perform natural language processing, i.e. nlp
news_article.nlp()

#To extract title
print("Article's Title:")
print(news_article.title)
print("\n")

#To extract text
print("Article's Text:")
print(news_article.text)
print("\n")

#To extract summary
print("Article's Summary:")
print(news_article.summary)
print("\n")
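# A minimal sketch (not part of the original snippets) wrapping the same
# download/parse/nlp flow in error handling. It assumes the standard newspaper3k
# API (Article, ArticleException) and that nltk's 'punkt' tokenizer is available,
# since .nlp() depends on it; the helper name summarize_url is illustrative.
import nltk
from newspaper import Article
from newspaper.article import ArticleException

nltk.download('punkt')  # one-time download needed before calling .nlp()


def summarize_url(url, language="en"):
    """Return (title, keywords, summary), or None if the article cannot be fetched."""
    article = Article(url, language=language)
    try:
        article.download()
        article.parse()
        article.nlp()
    except ArticleException:
        return None
    return article.title, article.keywords, article.summary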
def newspaper_parser(self, sleep_time=0):
    print('running newspaper_parser()...')
    results = []
    count = 0
    profile = webdriver.FirefoxProfile()
    browser = webdriver.Firefox(profile)
    credential_names = list(self.credentials.keys())
    browser.get(self.login_url)
    cred1 = browser.find_element_by_id(credential_names[0])
    cred2 = browser.find_element_by_id(credential_names[1])
    cred1.send_keys(self.credentials[credential_names[0]])
    cred2.send_keys(self.credentials[credential_names[1]])
    browser.find_element_by_id(self.submit_id).click()
    time.sleep(15)
    cookies = browser.get_cookies()
    browser.close()
    s = requests.Session()
    for cookie in cookies:
        s.cookies.set(cookie['name'], cookie['value'])
    for l in self.links:
        page = s.get(l)
        soup = BeautifulSoup(page.content)
        article = Article(url=l)
        article.set_html(str(soup))
        try:
            article.parse()
            article.nlp()
        except:
            time.sleep(60)
            continue
        data = {
            'title': article.title,
            'date_published': article.publish_date,
            'news_outlet': self.newspaper,
            'authors': article.authors,
            'feature_img': article.top_image,
            'article_link': article.canonical_link,
            'keywords': article.keywords,
            'movies': article.movies,
            'summary': article.summary,
            'text': article.text,
            'html': article.html
        }
        print(data['title'])
        print(data['text'])
        print()
        results.append(data)
        time.sleep(sleep_time)
        count += 1
    print(count)
    return results
def select_scraper_zero_tasks(): # database = "/home/zihua/macury/MercuryChallenge/scraper/articles.db" database = "/home/zihua/macury/MercuryChallenge/jsonToSql/mercury.db" #connect to the database con = sqlite3.connect(database) # counter = 0 cur = con.cursor() #select event id and url from the sqlite table cur.execute( "select event_id, first_reported_link from cu_gsr_event group by first_reported_link having count(event_id)>0 order by count(first_reported_link) desc;" ) #fetch the sql query with all search results rows = cur.fetchall() print(len(rows)) # for each event id and url under the search query for event_id, url in rows: scraper = cfscrape.create_scraper(url) # print(event_id,', ', url) try: # scrape the url content gold = scraper.get(url).content print("URL scraped") except: # print the URL that was failed to visit print("Failed at ", url) continue #article function to parse the arabic webpage, especially in Arabic article = Article(url, memoize_articles=False, language='ar') #download html page by using scraper library try: article.download(input_html=gold) #This is another bug from the source code. Hope this solve the problem. #The reason behind this is some website has "deformed" format, and it takes a period of time to visit it. if article.download_state != 2: # ArticleDownloadState.SUCCESS is 2 time.sleep(1) article.parse() else: #parse the html page article.parse() #add sleep time here # time.sleep(1) #apply article nlp function article.nlp() #event id is given eventid = event_id #Troublemaker is Here! Article gives this value as a list, #but Sqlite Table does not take list as text value author = article.authors #I use join function to combine the list to be a string authors = " ".join(str(x) for x in author) #decide not to add current parsing time. #the publish date function gets publish date #depend upon the capability of this function to get publish date publish_date = article.publish_date # date = article.publish_date.date() #it gets title if possible title = article.title #it gets article main content content = article.text #Again, another troublemaker here! It works after combine list to be a string keyword = article.keywords if keyword is None: keywords = keyword else: keywords = ' '.join(str(e) for e in keyword) #Article summary, but assume it's same as text, which should be the main content summary = article.summary if title: print(title) # print(keywords) elif keywords: print("K Perceived") elif summary: print("S Perceived") elif content: print("C Perceived") else: print("Skipped, No Title or Any Other Needed Info") continue try: con2 = sqlite3.connect(database) # con = sqlite3.connect(database) with con2: cur2 = con2.cursor() #insert all attributes to the sqlite table cur2.execute( 'INSERT INTO article_info (Event_ID, Authors, Publish_Date, Content, Keywords, Summary, Title) VALUES (?,?,?,?,?,?,?)', (eventid, authors, publish_date, content, keywords, summary, title)) # call commit on the connection... con2.commit() except Error as e: print(event_id + " not successful. Error: " + database) pass except: print("This File May Not Been Downloaded!") pass
links = []
for i in articles:
    if i.find('a')['href'] == '#':
        continue
    else:
        links.append(i.find('a')['href'])

all_row_list = []
i = 1
for link in links[:25]:
    print("Scraping: " + link)
    news = Article(link)
    news.download()
    try:
        news.parse()
        news.nlp()
        kategori = link.split('/')[3]
        row_list = [
            i, 'detik_inet', news.publish_date, news.title,
            news.text.replace("\n", ""), kategori
        ]
        i += 1
        all_row_list.append(row_list)
    except Exception:
        pass

writeToCsv(all_row_list)
def home_page(): # Scrape and parse textual content from web resource. This method employs Article from Newspaper3k library to download and parse html from the web resource. It uses heuristics to scrape main body of visible text. # :param url: Uniform Resource Locator. # :return: Scraped content of web resource. user_input = st.text_input('Enter URL of an article or text') with open(get_data_path('fake_news_sites.json')) as json_file: fake_news_db_news = json.load(json_file) with open(get_data_path('categories.json')) as json_file: categories = json.load(json_file) with open(get_data_path('opensources/sources.json')) as json_file: open_source_json = json.load(json_file) try: # Get domain name from the url domain_name = get_domain(user_input) # Get formated domain formated_domain = format_url(domain_name) except Exception: st.warning("Enter an URL to suppress the warning !!") try: my_article = Article(user_input, language="en", keep_article_html=True) my_article.download() slept = 0 while my_article.download_state == ArticleDownloadState.NOT_STARTED: # Raise exception if article download state does not change after 10 seconds if slept > 9: raise ArticleException('Download never started') sleep(1) slept += 1 my_article.parse() except Exception as ec: print(ec) if st.button('Check authenticity'): st.header("VirusTotal - Malicious URL Scanner (virustotal.com)") st.markdown('''---''') with st.spinner(text="Fetching measures - Analysis in progress"): # task = asyncio.create_task(scan_url(user_input)) # json_data = await task json_data = scan_url(user_input=user_input) if json_data is not None: category_key = list(json_data.keys()) category_value = [json_data[i]['result'] for i in category_key] left, center, right = st.beta_columns((1, 2, 1)) with left: left.markdown('''**No.** ''', unsafe_allow_html=True) for i in range(1, 21): left.write(i) with center: center.markdown('''**Detected by**''', unsafe_allow_html=True) for i in category_key[:20]: center.write(i) with right: right.markdown('''**Result**''', unsafe_allow_html=True) for link in category_value[:20]: if link == 'clean': right.markdown( f'<span style="color:green">clean site</span>', unsafe_allow_html=True) else: right.markdown( f'<span style="color:red">{link}</span>', unsafe_allow_html=True) else: st.warning( "Couldn't able to get detect the site or Invalid URL provided !!" 
) st.header("News site authencity") st.markdown('''---''') left, right = st.beta_columns((1, 2)) res = get_opensource_news(domain_name, formated_domain, open_source_json) left.markdown('''**Source** : OpenSource http://www.opensources.co/''', unsafe_allow_html=True) right.markdown(f'**Checking Domain** : {domain_name}', unsafe_allow_html=True) if res is None: right.warning("URL is not found in OpenSource Database") else: right.markdown(f'**Category** : {res["type"]}', unsafe_allow_html=True) try: right.markdown(f'**Discription** : {categories[res["type"]]}', unsafe_allow_html=True) except: right.warning("Category Discription isn't available !!") if res["Source Notes (things to know?)"]: right.markdown( f'**Source Notes (things to know?)** : {res["Source Notes (things to know?)"]}', unsafe_allow_html=True) st.markdown('''---''') left1, right1 = st.beta_columns((1, 2)) res1 = get_fb_news_data(domain_name, formated_domain, fake_news_db_news) left1.markdown('''**Source** : FakeNews Site DB''', unsafe_allow_html=True) right1.markdown(f'**Checking Domain** : {domain_name}', unsafe_allow_html=True) if res1 is None: right1.warning("URL is not found in Fake news site database") else: try: right1.markdown(f'**Category** : {res1["siteCategory"]}', unsafe_allow_html=True) right1.markdown(f'**Site name** : {res1["siteTitle"]}', unsafe_allow_html=True) if type(res1["siteCategory"]) is list: right1.markdown( f'**Discription** : {categories[res1["siteCategory"][0]]}', unsafe_allow_html=True) else: right1.markdown( f'**Discription** : {categories[res1["siteCategory"]]}', unsafe_allow_html=True) if res1["siteNotes"]: right1.markdown( f'**Source Notes (things to know?)** : {res1["siteNotes"]}', unsafe_allow_html=True) except Exception: st.warning("Category is not available for this site !!") if res1["siteCategory"] == 'reliable': st.success( "This is a trusted news site, which means the claim and article published on this site is transparent, authentic, trustworthy, complete, and in the absence of biases, it also protects audiences and users from disinformation." ) else: st.error( "This news site is not reliable or not authentic, the information published by this site might not be true !!" 
) st.markdown('''### **Article Title**''') # st.header(Article Title) title = my_article.title if title: st.markdown(f'{title}') else: st.warning( "Coudn\'t able extract the title or Invalid URL Provided") st.markdown('''### **Article Authors **''') author = my_article.authors if len(author) != 0: # st.markdown(f'{author}') st.markdown( f'<span style="background-color:#00C4EB;border-radius:5px;box-shadow: 0 5px 0 rgb(0, 116, 191);color: #FFFFFF;padding: 0.5em 1em;position: relative;text-decoration: none;font-weight:bold;cursor: pointer;">{author[0]}</span>', unsafe_allow_html=True) else: st.warning( "Coudn\'t able extract the author name or Invalid URL Provided" ) st.markdown('''### **Publish Date**''') date = my_article.publish_date if date: st.info(f'{date} ') else: st.warning( "Coudn\'t able extract the publish date or Invalid URL Provided" ) st.markdown('''### **Image**''') image_url = my_article.top_image if image_url: st.image(image_url, caption="Article Top Image") st.markdown( f'''<p align="center"><b> Source URL : <b><a href="{ image_url }">{ image_url }</a></p>''', unsafe_allow_html=True) else: st.warning( "Coudn\'t able extract the Image or Invalid URL Provided or No image is present" ) st.markdown('''### **Article Text**''') article_text = my_article.text if article_text: with st.beta_expander( "🧙 Click here for more info about the article 🔮"): st.markdown(f'{article_text}', unsafe_allow_html=True) else: st.warning( "Coudn\'t able extract the publish article or Invalid URL Provided" ) st.markdown('''### **Movies / Videos**''') videos = my_article.movies if videos: st.video(videos[0]) else: st.warning( "Coudn\'t able extract the publish videos or No videos were published or Invalid URL Provided " ) try: my_article.nlp() except Exception as ec: st.error(ec) # except ArticleException: # st.error("Article Exception Occured !!") st.markdown('''### **Keywords (NLP)**''') nlp_keywords = my_article.keywords if nlp_keywords: st.info(nlp_keywords) else: st.warning( "Coudn\'t able to get the top keywords or Invalid URL Provided" ) st.markdown('''### **Summary (NLP)**''') nlp_summary = my_article.summary if nlp_summary: st.markdown(f'{nlp_summary}', unsafe_allow_html=True) else: st.warning( "Coudn\'t able to get the summary of the article or Invalid URL Provided" ) st.header("News article veracity") st.markdown('''---''') if article_text is not None: with st.spinner(text="Inference is in Progress ⏳ ..."): output_label = asyncio.run( model_service.predict_from_server(article_text)) # left,right = st.beta_columns((1,2)) st.markdown( '''**Analysis based on:** : Artificial intelligence''') st.markdown( '''**Notes:** WARNING: This result may be inaccurate! This domain wasn't categorised on any human maintained list thus analysis was performed by machine learning model.''' ) if output_label: st.markdown(f'Predicted label : {output_label}', unsafe_allow_html=True) st.success("Real news") else: st.markdown(f'Predicted label : {output_label}', unsafe_allow_html=True) st.error("Fake news") st.balloons() else: st.warning( "Article text is not found, hence news article veracity analysis is incomplete !!" )
article.download()

#To parse the article
article.parse()

#To extract title
print("Article's Title:")
print(article.title)
print("\n")

#To extract text
print("Article's Text:")
print(article.text)
print("\n")

article.nlp()

#To extract summary
print("Article's Summary:")
print(article.summary)
print("\n")

#To extract keywords
print("Article's Keywords:")
print(article.keywords)

#Chinese newspaper
from newspaper import Article

#News article from the sinchew (Chinese Newspaper)
url = 'https://www.sinchew.com.my/content/content_2058387.html'
import nltk
from newspaper import Article
from textblob import TextBlob
import sys
import os.path
from os import path

#Get the article
url = str(sys.argv[1])
mFilePath = str(sys.argv[2])
article = Article(url)

# Do some NLP
article.download()  #Downloads the link's HTML content
article.parse()  #Parse the article
nltk.download('punkt')  #1 time download of the sentence tokenizer
article.nlp()  # Keyword extraction wrapper
text = article.summary
print(text)

obj = TextBlob(text)
#returns the sentiment of text
#by returning a value between -1.0 and 1.0
sentiment = obj.sentiment.polarity

if path.exists(mFilePath):
    Html_file = open(mFilePath, "a", newline='')
    Html_file.write('\n')
    Html_file.write(str(sentiment))
    Html_file.close()
else:
date, time = time.split('T')
time = time[:-1]
time = f"{date} {time} +0000"
time = datetime.strptime(time, '%Y-%m-%d %H:%M:%S %z')
time = time.astimezone(pytz.timezone('Asia/Kolkata'))

# Create link
link = item.a['href']
link = ''.join(["https://news.google.com", link[1:]])

# Use Article from newspaper
try:
    art = Article(link, language="en")
    art.download()
    art.parse()
    art.nlp()
    # Create main data
    data.append(f"Article's Date :- {time.date()} {time.time()}")
    data.append(f"Article's Title :- {title.text}")
    data.append(f"Link for article :- {link}")
    data.append(f"Summary :- \n{art.summary}")
    # Write to main news csv
    main_writer.writerow(
        [f"{time.date()} {time.time()}", title.text, art.summary, link])
    main_data.append('\n\n'.join(data))
except:
    print(
        f"\nLink to \"{title.text}\" does not work or there is a connection error"
    )
penalties[data] = (abs(url_data[data][4] - ultimateMeanVibe)) * meanVibeWeight
# data piece will be anywhere from 0 to 18 penalty, where it normally does not exceed 9
date_time_written = datetime.datetime.strptime(dates[urls.index(data)], '%Y-%m-%d %H:%M:%S')
deltadatetime = date_time_now - date_time_written
penalties[data] += (deltadatetime.days / 14) * numDaysWeight
penalties[data] += max(-.5, -((url_data[data][2] / url_data[data][0]) * quoteWeight))
keywordsInTitle = 0
titleWords = titles[urls.index(data)].split()
a = Article(data)
a.download()
a.parse()
a.nlp()
for keyword in a.keywords:
    for word in titleWords:
        if keyword == word:
            keywordsInTitle += 1
penalties[data] += -(keywordsInTitle * titleWordWeight)

newDic = {k: v for k, v in sorted(penalties.items(), key=lambda item: item[1])}
newerDic = {}
newerDicVals = 0
for key in newDic:
    if newerDicVals > 49:
        break
    newerDic[key] = newDic[key]
    newerDicVals += 1
def scrape(query, month=""):
    last_day = 31
    time = ""
    if month[-2:] == '02':
        last_day = 28
    elif month[-2:] in ['04', '06', '09', '11']:
        last_day = 30
    if month != "":
        time = "after:" + month + "-01" + " AND before:" + month + "-" + str(last_day)
        # last_day > 10 so no date formatting issues
    news = "https://news.google.com"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    }
    page = requests.get("https://news.google.com/search?q=" + query + " " + time, headers=headers)
    soup = BeautifulSoup(page.content, features="lxml")
    found = soup.findAll("a")
    links = [x.get('href') for x in found]
    temp = []
    redirects = []
    articles = []
    count = 0
    for l in links:
        if type(l) == str:
            if 'article' in l:
                if l not in redirects:
                    redirects.append(l)
                    count += 1
    redirects = [x[1:] for x in redirects]
    for a in redirects:
        try:
            r = requests.get(news + a, timeout=10)
            articles.append(r.url)
        except Exception as e:
            print(e)
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent
    a = None
    data = []
    for url in articles:
        try:
            a = Article(url, config=config)
            a.download()
            a.parse()
            a.nlp()
            d = {
                "url": url,
                "title": a.title,
                "authors": a.authors,
                "date": a.publish_date.date(),
                "keywords": a.keywords,
                "summary": a.summary,
                #"text": a.text
            }
            data.append(d)
            #file.write(url + ";" + a.title + ";" + a.authors + ";" + a.publish_date.date() + ";" + a.summary.encode('unicode-escape') + ";" + a.text.encode('unicode-escape'))
            #print(a.title, a.authors, a.publish_date.date(), a.keywords)
            #print(a.summary)
            #print('*'*100)
        except Exception as e:
            print("Error with", url)
            print(a.title)
            print(e)
            print('*' * 100)
    return data
import csv

#A new article from TOI
url = "https://www.programiz.com/python-programming/working-csv-files"

#For different language newspaper refer above table
toi_article = Article(url, language="en")  # en for English

#To download the article
toi_article.download()

#To parse the article
toi_article.parse()

#To perform natural language processing, i.e. nlp
toi_article.nlp()

#To extract title
print("Article's Title:")
p = toi_article.title
print(p)
print("\n\n")

#To extract text
print("Article's Text:")
c = toi_article.text
print(c)
print("\n\n")

#To extract summary
print("Article's Summary:")
def thevergeArticles(i, queue, debug, minDelay, maxDelay, b): while True: try: url = queue.get() except: time.sleep(1) continue if url is None: break sys.exit(1) if debug: sys.stdout.write('Visting publication url :: ' + url + '\n' + '\n') logging.info('Visting publication url :: ' + url + '\n' + '\n') article = Article(url) article.download() article.parse() try: authors = article.authors except: authors = '' try: publish_date = str( article.publish_date.replace(second=0).isoformat().replace( ':00+00:00', '+00:00')) publish_date = article.publish_date except: publish_date = '' try: date = str( datetime(publish_date.year, publish_date.month, publish_date.day, tzinfo=TZ()).isoformat()).replace( ':00+00:00', '+00:00') except Exception, e: print str(e) date = '' print date try: text = article.text except: text = '' try: top_image = article.top_image except: top_image = '' try: movies = article.movies except: movies = '' article.nlp() try: keywords = article.keywords except: keywords = '' try: summary = article.summary except: summary = '' if summary == '': summary = trimArticle(text, 50) images = {} try: all_images = article.images if len(all_images) > 0: for i in range(len(all_images)): images['image_' + str(i)] = all_images[i] except: pass try: abstract = article.meta_description except: abstract = '' try: title = article.title except: title = '' # write the fellow summary to file file_name = 'theverge_' + title.replace(' ', '-') + '.json' file_name = ''.join(c for c in file_name if c in valid_chars) if os.name == 'nt': f = open('success//' + file_name, 'wb') else: f = open('success/' + file_name, 'wb') folder = 'success' logging.info('Opened ' + 'success//' + file_name + '.json' + ' for writing') data = { 'abstract': summary, 'external_id': 'theverge_' + title.replace(' ', '-'), 'date': date, 'url': url, 'title': title, 'words': text, 'meta': { 'theverge': { 'keywords': str(keywords), 'top_image': top_image, 'authors': authors, 'authors': authors, 'allImages': str(images) } } } f.write(json.dumps(data)) f.close() logging.info('File written ' + file_name) if os.name == 'nt': uploadDataS3(folder + '//' + file_name, b) else: uploadDataS3(folder + '/' + file_name, b) if debug: sys.stdout.write(file_name + ' has been written to S3 bucket' + '\n') logging.info(file_name + ' has been written to S3 bucket' + '\n') if debug: sys.stdout.write(file_name + ' written' + '\n') wait_time = random.randint(minDelay, maxDelay) sys.stdout.write('Sleeping for :: ' + str(wait_time) + '\n') logging.info('Sleeping for :: ' + str(wait_time) + '\n') sys.stdout.write('******************************************' + '\n') sys.stdout.write('******************************************' + '\n') time.sleep(wait_time)
def scrape_reddit(reddit, engine, limit_, yest): try: i = 0 for submission in reddit.subreddit('news').hot(limit=limit_): if (submission.created > yest): query_comments = '''SELECT EXISTS(SELECT * FROM MemeNews.every_comment WHERE post_id LIKE '{0}' LIMIT 1)'''.format( submission.id) query_articles = '''SELECT EXISTS(SELECT * FROM MemeNews.Daily_Articles WHERE id LIKE '{0}' LIMIT 1)'''.format( submission.id) if (engine.execute(query_articles).fetchone()[0]): continue submission.comment_sort = 'best' article = Article(submission.url) try: article.download() article.parse() article.nlp() article.fetch_images() except: continue articles_dict = { "title": re.sub(r'[^\x00-\x7F]', '', submission.title.replace('"', "'")), "score": submission.score, "id": submission.id, "url": submission.url, "comms_num": submission.num_comments, "created": submission.created, "body": re.sub(r'[^\x00-\x7F]', '', article.text.replace('"', "'")), "image": article.top_image, "keywords": ', '.join(article.keywords).replace('"', "'"), "summary": re.sub(r'[^\x00-\x7F]', '', article.summary.replace('"', "'")) } #add articles articles_data = pd.DataFrame(articles_dict, index=[i]) articles_data.to_sql('Daily_Articles', con=engine, if_exists='append', dtype={'None': VARCHAR(5)}) print("article added with url: ", submission.url) if (engine.execute(query_comments).fetchone()[0]): continue comment_dict = { "post_id": [], 'post_title': [], "id": [], "author": [], "body": [], "created": [], 'score': [], 'is_submitter': [], 'parent_id': [] } for top_level_comment in submission.comments.list()[:100]: try: comment_dict['is_submitter'].append( top_level_comment.is_submitter) comment_dict['post_id'].append(submission.id) comment_dict['id'].append(top_level_comment.id) comment_dict['author'].append(top_level_comment.author) comment_dict['body'].append( re.sub(r'[^\x00-\x7F]', '', top_level_comment.body)) comment_dict['score'].append(top_level_comment.score) comment_dict['created'].append( top_level_comment.created_utc) comment_dict['parent_id'].append( top_level_comment.parent_id) comment_dict['post_title'].append(submission.title) except: continue comment_data = pd.DataFrame(comment_dict) comment_data.to_sql('every_comment', con=engine, if_exists='append', dtype={'None': VARCHAR(5)}) print("comments added") i += 1 return 1 except err: print(err) return 0
def handle(self, url, website, *args, **options):
    config = Config()
    config.browser_user_agent = user_agent
    if website:
        links = pagelinks.objects.filter(
            Q(fetched=False) & Q(site__name=website[0])).values_list(
                'url', flat=True)
    elif url:
        links = pagelinks.objects.filter(Q(url=url[0])).values_list(
            'url', flat=True)
    else:
        links = pagelinks.objects.filter(fetched=False).values_list(
            'url', flat=True)
    l = len(links)
    j = 0
    print(str(l) + ' link(s)')
    printProgressBar(0, l, prefix='Progress: ', suffix='Complete', length=100)
    for link in links:
        print(str(link) + '\n')
        print(str(l - j) + ' remaining..')
        try:
            an_article = Article(url=link, config=config)
            an_article.download()
            an_article.parse()
            an_article.nlp()
            if len(an_article.text) > 999:
                pagelinks.objects.filter(url=link).update(
                    fetched=True,
                    body=an_article.text,
                    publish_date=an_article.publish_date,
                    top_image=an_article.top_image,
                    authors=an_article.authors,
                    videos=an_article.movies,
                    keywords=an_article.keywords,
                    summary=an_article.summary,
                    title=an_article.title,
                    is_article=True)
            else:
                pagelinks.objects.filter(url=link).update(
                    fetched=True,
                    body=an_article.text,
                    publish_date=an_article.publish_date,
                    top_image=an_article.top_image,
                    authors=an_article.authors,
                    videos=an_article.movies,
                    keywords=an_article.keywords,
                    summary=an_article.summary,
                    title=an_article.title)
            j += 1
            printProgressBar(j + 1, l, prefix='Progress: ', suffix='Complete', length=100)
        except Exception as e:
            print(e)
            continue
def build_db(): fn = os.path.join(os.path.dirname('__file__'), '../panel/panel.json') db = {} db['snippets'] = [] with open(fn, "r") as json_file: panel = json.loads(json_file.read()) with open("database.json", "w") as outfile: i = 0 for key in panel: i += 1 j = 0 print 'looking at {}, {}/{} keywords'.format(key, i, len(panel)) for pundit in panel[key]: j += 1 print '\tlooking at {}, {}/{} pundits'.format(pundit['name'].encode('ascii', 'ignore'), j, len(panel[key])) if pundit['links']['brookings']: response = requests.get(pundit['links']['brookings']) soup = bs4.BeautifulSoup(response.text, "html.parser") for link in soup.select('ul.media-list li div.content h3.title a'): snippet = {} link_href = str(link.attrs.get('href')) url = "http://www.brookings.edu" + link_href try: link_response = requests.get(url) soup = bs4.BeautifulSoup(link_response.text, "html.parser") try: full_url_link = soup.select('div.article-detail em a')[0] full_url = str(full_url_link.attrs.get('href')) except IndexError: full_url = url if 'pdf' not in full_url and validate_url(full_url): article = Article(url) try: article.download() article.parse() article.nlp() print '\t\t', full_url snippet["text"] = article.text snippet["summary"] = article.summary snippet["url"] = url snippet["full_url"] = full_url snippet["keywords"] = article.keywords snippet["pundit"] = {} snippet["pundit"]["name"] = pundit["name"] snippet["pundit"]["title"] = pundit["title"] db['snippets'].append(snippet) except ArticleException(): pass except requests.exceptions.ConnectionError: pass if pundit['links']['cfr']: response = requests.get(pundit['links']['cfr'] + "#publications") soup = bs4.BeautifulSoup(response.text, "html.parser") for link in soup.select('div#publications article.publication_spotlight h3 a'): snippet = {} link_href = str(link.attrs.get('href')) try: if "http" in link_href: url = link_href full_url = link_href else: url = "http://www.cfr.org" + link_href full_url_link = soup.find(text='View full text of article') if full_url_link: full_url = str(full_url_link.parent.attrs.get('href')) else: full_url = url link_response = requests.get(url) soup = bs4.BeautifulSoup(link_response.text, "html.parser") if 'pdf' not in full_url and validate_url(full_url): article = Article(full_url) else: article = Article(url) try: article.download() article.parse() article.nlp() print '\t\t', url snippet["text"] = article.text snippet["summary"] = article.summary snippet["url"] = url snippet["full_url"] = full_url snippet["keywords"] = article.keywords snippet["pundit"] = {} snippet["pundit"]["name"] = pundit["name"] snippet["pundit"]["title"] = pundit["title"] db['snippets'].append(snippet) except ArticleException: pass except requests.exceptions.ConnectionError: pass if pundit['links']['baker']: response = requests.get(pundit['links']['baker']) soup = bs4.BeautifulSoup(response.text, "html.parser") for link in soup.select('h3#library ul li a'): snippet = {} link_href = str(link.attrs.get('href')) try: url = "http://www.bakerinstitute.org" + link_href full_url_link = soup.select('div.research_content div.researchContent span a')[-1] if full_url_link: full_url = str(full_url_link.parent.attrs.get('href')) else: full_url = url link_response = requests.get(url) soup = bs4.BeautifulSoup(link_response.text, "html.parser") if 'pdf' not in full_url and validate_url(full_url): article = Article(full_url) else: article = Article(url) try: article.download() article.parse() article.nlp() print '\t\t', url snippet["text"] = article.text snippet["summary"] = 
article.summary snippet["url"] = url snippet["full_url"] = full_url snippet["keywords"] = article.keywords snippet["pundit"] = {} snippet["pundit"]["name"] = pundit["name"] snippet["pundit"]["title"] = pundit["title"] db['snippets'].append(snippet) except ArticleException: pass except requests.exceptions.ConnectionError: pass if pundit['links']['ecfr']: response = requests.get(pundit['links']['ecfr']) soup = bs4.BeautifulSoup(response.text, "html.parser") for link in soup.select('ul#all li div.post div.list-content a'): snippet = {} link_href = str(link.attrs.get('href')) try: url = "http://www.ecfr.eu" + link_href full_url = url link_response = requests.get(url) soup = bs4.BeautifulSoup(link_response.text, "html.parser") if 'pdf' not in full_url and validate_url(full_url): article = Article(full_url) else: article = Article(url) try: article.download() article.parse() article.nlp() print '\t\t', url snippet["text"] = article.text snippet["summary"] = article.summary snippet["url"] = url snippet["full_url"] = full_url snippet["keywords"] = article.keywords snippet["pundit"] = {} snippet["pundit"]["name"] = pundit["name"] snippet["pundit"]["title"] = pundit["title"] db['snippets'].append(snippet) except ArticleException: pass except requests.exceptions.ConnectionError: pass json.dump(db, outfile, indent=4)
class ArticleScraper(Article):
    """
    For a given article url, it downloads and parses some specific data
    and writes a JSON in the output_file
    """

    def __init__(self, url, timestamp, newspaper):
        """ Initialize ArticleScraper """
        self.article_obj = {}
        self.article_obj["url"] = url
        self.article_obj["newspaper"] = newspaper
        self.article_obj["timestamp"] = timestamp
        if self.article_obj:
            # initiate article
            self.article = Article(url, language="es")
            # parse article
            # self.parse_article()

    def parse_article(self):
        """ Download, Parse and NLP a given article """
        try:
            # download source code
            self.article.download()
            # parse code
            self.article.parse()
            # populate article obj with parsed data
            try:
                self.article_obj["title"] = self.article.title
                # self.article_obj["title"] = self.article.title.encode("utf-8").strip()
            except:
                self.article_obj["title"] = ""
            try:
                self.article_obj["authors"] = self.article.authors
            except:
                self.article_obj["authors"] = ""
            try:
                self.article_obj["publish_date"] = self.article.publish_date
                # self.article_obj["publish_date"] = self.article.publish_date.encode("utf-8").strip()
            except:
                self.article_obj["publish_date"] = ""
            try:
                self.article_obj["text"] = self.article.text
                # self.article_obj["text"] = self.article.text.encode("utf-8").strip()
            except:
                self.article_obj["text"] = ""
            try:
                self.article_obj["top_image"] = self.article.top_image
            except:
                self.article_obj["top_image"] = ""
            self.article.nlp()
            try:
                self.article_obj["summary"] = self.article.summary
            except:
                self.article_obj["summary"] = ""
            try:
                self.article_obj["keywords"] = self.article.keywords
            except:
                self.article_obj["keywords"] = []
            # print(self.article_obj)
            return self.article_obj
        except:
            pass
def process(update, context): if update.message: text = update.message.text else: return links = find(text) # handling for groups, when message has no links if not links: # and update.message.chat.type == "super_group": return link = links[0] # try: # link = links[0] # except: # update.message.reply_text("Oh! Send a valid link.") article = Article(link) article.download() article.parse() try: author = "✍ *Author:* " + article.authors + "\n" except: author = "" date = "📅 *Publication Date:* " try: date += str(article.publish_date.strftime('%Y-%m-%d')) except: if article.publish_date is None: date = "" else: date += str(article.publish_date) value = article.html tree = fromstring(value) title = str(tree.findtext('.//title')) lang = translator.detect(title).lang if lang != 'en': text = translate(link) if text == 'null': return update.message.reply_text(text) link = find(text)[0] article = Article(link) article.download() article.parse() text = article.text soup = bs(value, 'lxml') outline = "" for heading in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]): heading_text = heading.text.strip() if heading.name in ["h1", "h2"]: heading_text = f"*{heading_text}*" outline += int(heading.name[1:]) * ' ' + '- ' + heading_text + '\n' article.nlp() keywords = article.keywords tags = "" for keyword in keywords: tags += " #" + keyword summary = article.summary summary_points = "" for x in summary.splitlines(): summary_points += "↦️ " + x + "\n" summary = summary_points read = readtime.of_text(text) msg = f"""🔗 *Link:* {link}\n{author}{date}\n🚩 *Title: {title}*\n\n🗨 *Summary:*\n _{summary}_\n""" msg += f"""🎋 *Outline: * \n{outline}\n""" msg += f"""🤔 *Reading Time:* {read}\n""".replace("min", "mins") msg += f"""📑 *Tags:* {tags}\n """ query = urllib.parse.quote(msg.replace('*', '**').replace('_', '__')) share_url = 'tg://msg_url?url=' + query button_list = [ InlineKeyboardButton('Add to reading list', callback_data=1), InlineKeyboardButton("📬 Share", url=share_url) ] reply_markup = InlineKeyboardMarkup(build_menu(button_list, n_cols=2)) update.message.reply_text( msg, parse_mode=telegram.ParseMode.MARKDOWN, reply_markup=reply_markup) if update.message.chat_id != ADMIN: context.bot.send_message(chat_id="{}".format(ADMIN), text='{}'.format( update.message.from_user.first_name + " *sent:*\n" + msg), parse_mode=telegram.ParseMode.MARKDOWN)
def post(self): try: data = json.loads(self.request.body.decode('utf-8')) except Exception: data = self.get_argument('data') data = json.loads(data) action = data.get('action') if action == 'load_page': email = self.current_user.decode('utf-8') id = get_user(email) id = id[0][0] full_summarization = get_mysummary(id) full = [] for data in full_summarization: full.append({ 'title': data[0], 'link': data[1], 'photo': data[2], 'keywords': data[3], 'summary': data[4], 'date': data[5] }) full.reverse() number = get_rowsummary(id) self.write(json.dumps({'summary': full, 'number': number[0][0]})) elif action == 'summary': try: url = json.loads(self.request.body.decode('utf-8')) except Exception: url = self.get_argument('data') url = json.loads(url) try: url = url.get('url') url = re.sub(' ', '', url) email = self.current_user.decode('utf-8') article = Article(url, language='en') article.download() article.parse() title = article.title if detect(title) != 'en' or detect(article.text) != 'en': self.write( json.dumps( {'result': 'This language will be supported soon'})) else: try: image = article.top_image except Exception: image = '' article.nlp() try: keywords = article.keywords keywords = ','.join(keywords) except Exception: keywords = 'Sorry,no,keywords,found' try: summary = article.summary summary = '<p style = "margin: 10px 0px 10px 0px">' + re.sub( r'\.', r'.</p><p style = "margin: 10px 0px 10px 0px">', summary) summary = summary[:-40] except Exception: summary = 'Sorry, no summmary found' try: publish_date = article.publish_date publish_date = publish_date.date() except Exception: publish_date = 'XII b.c.' if url[-1] == '/': summarized = { 'title': title, 'link': url, 'photo': image, 'keywords': keywords, 'summary': str(summary), 'date': str(publish_date) } else: url = url + '/' summarized = { 'title': title, 'link': url, 'photo': image, 'keywords': keywords, 'summary': str(summary), 'date': str(publish_date) } id = get_user(email) id = id[0][0] result = main_summarization(summarized, id) summarized = { 'title': title, 'link': url, 'photo': image, 'keywords': keywords, 'summary': str(summary), 'date': str(publish_date) } if result == 'You have this result': jsn = {'result': result} self.write(json.dumps(jsn)) else: jsn = { 'summary': summarized, 'number': result, 'result': 'done' } self.write(json.dumps(jsn)) except Exception: jsn = {'result': 'This URL is unsummarizable'} self.write(json.dumps(jsn)) elif action == 'delete': try: url = json.loads(self.request.body.decode('utf-8')) except Exception: url = self.get_argument('data') url = json.loads(url) url = url.get('url') email = self.current_user.decode('utf-8') id = get_user(email) id = id[0][0] main_delete(id, url) self.write(json.dumps({'result': 'done'}))
def _retrieve_data(self):
    if self.config.dataproducer.pull_data:
        print("Pull data...")
        append_data = []
        base_url = ('https://cryptonews-api.com/api/v1?tickers=BTC&date={dates}'
                    '&items=50&token=' + self.config.dataproducer.apikey + '&page={page}')
        # The four date windows pulled from the API. The original third window ended on
        # "09312021"; September has only 30 days, so it is corrected here.
        date_ranges = ['01012021-03312021', '04012021-06302021',
                       '07012021-09302021', '10012021-11052021']
        for dates in date_ranges:
            with urllib.request.urlopen(base_url.format(dates=dates, page=1)) as resp:
                data = json.loads(resp.read().decode())
            page_count = int(data["total_pages"])
            print(page_count)
            # range(1, page_count + 1) so the final page is not skipped
            for i in range(1, page_count + 1):
                page_url = base_url.format(dates=dates, page=i)
                print(page_url)
                df = pd.read_json(page_url)
                append_data.append(df)
        df = pd.concat(append_data)
        df.to_pickle('./data/raw/corpus.pkl')
        df = pd.json_normalize(df['data'])
        df.rename(columns={'date': 'datetime'}, inplace=True)
        df['date'] = pd.to_datetime(df['datetime'],
                                    format='%a, %d %b %Y %H:%M:%S %z', utc=True)
        df['date'] = df['date'].dt.date.astype('datetime64')
        df.drop(['image_url', 'topics', 'tickers'], axis=1, inplace=True)
        df = df[df.type.isin(['Article'])]
        nltk.download('punkt')
        user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/50.0.2661.102 Safari/537.36')
        config = Config()
        config.browser_user_agent = user_agent
        records = []  # avoid shadowing the built-ins `list` and `dict`
        for ind in df.index:
            record = {}
            article = Article(df['news_url'][ind], config=config)
            article.download()
            try:
                article.parse()
                article.nlp()
                record['datetime'] = df['datetime'][ind]
                record['date'] = df['date'][ind]
                record['news_url'] = df['news_url'][ind]
                record['title'] = df['title'][ind]
                record['text'] = df['text'][ind]
                record['source_name'] = df['source_name'][ind]
                record['sentiment'] = df['sentiment'][ind]
                record['type'] = df['type'][ind]
                record['article_title'] = article.title
                record['article_text'] = article.text
                record['article_summary'] = article.summary
                records.append(record)
            except Exception:
                # skip articles that fail to download or parse
                pass
        full_df = pd.DataFrame(records)
        full_df.to_pickle("./data/raw/news_corpus_110521.pkl")
    else:
        print('Read saved data...')
        full_df = pd.read_pickle('./data/raw/news_corpus_110521.pkl')
    return full_df
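A small sketch of the paginated fetch on its own, using requests instead of repeating one block per quarter. It assumes the response layout used above (a 'data' list and a 'total_pages' field); the function name and the 'MY_TOKEN' placeholder are illustrative.

import pandas as pd
import requests

def fetch_news_pages(api_key, date_range, items=50, tickers='BTC'):
    """Yield one DataFrame per page of the cryptonews-api response."""
    base = 'https://cryptonews-api.com/api/v1'
    params = {'tickers': tickers, 'date': date_range, 'items': items,
              'token': api_key, 'page': 1}
    first = requests.get(base, params=params).json()
    total_pages = int(first['total_pages'])
    for page in range(1, total_pages + 1):
        params['page'] = page
        payload = requests.get(base, params=params).json()
        yield pd.json_normalize(payload['data'])

# usage sketch:
# frames = [df for rng in ['01012021-03312021', '04012021-06302021']
#           for df in fetch_news_pages('MY_TOKEN', rng)]
# corpus = pd.concat(frames)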
def parse_article(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    gen_article_dictionary(article)
# Fragment: assumes `article` is an Article that was constructed and downloaded
# earlier in the script.
print(article.html)
article.parse()
# The bare attribute accesses below only display values in an interactive session.
article.authors
article.publish_date
article.text
article.top_image
article.movies
article.nlp()  # nlp() returns None, so printing its return value is not useful
article.keywords
print(article.summary)

import newspaper

elpais = newspaper.build('http://www.elpais.com')
print("number of articles in elpais")
print(len(elpais.articles))
# (the commented examples below refer to a `cnn_paper` source built elsewhere)
# for article in cnn_paper.articles:
#     print(article.url)
# for category in cnn_paper.category_urls():
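A short sketch of the newspaper.build() crawl shown above with two commonly used options; memoize_articles=False and language='es' are assumptions here, not part of the original snippet.

import newspaper

# memoize_articles=False forces a fresh crawl instead of reusing newspaper's cache
elpais = newspaper.build('http://www.elpais.com',
                         memoize_articles=False, language='es')
for article in elpais.articles[:5]:
    print(article.url)          # candidate article URLs found on the source
for category_url in elpais.category_urls():
    print(category_url)         # section pages discovered on the source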
# In[4]:
nltk.download('punkt', quiet=True)    # Download the punkt package
nltk.download('wordnet', quiet=True)

# In[5]:
# Get the article URL
article = Article('https://www.medicalnewstoday.com/articles/256521')
article.download()   # Download the article
article.parse()      # Parse the article
article.nlp()        # Apply Natural Language Processing (NLP)
corpus = article.text

# In[6]:
print(corpus)

# In[7]:
text = corpus
sent_tokens = nltk.sent_tokenize(text)
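A tiny sketch of what the sentence-tokenization step produces; the sample string is made up for illustration.

import nltk

nltk.download('punkt', quiet=True)
sample = "Readers ask questions. The bot answers with the closest sentence."
sentences = nltk.sent_tokenize(sample)
print(sentences)  # each sentence becomes one candidate answer for the retrieval step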
def response(user_response):
    bot_response = ''
    questions = [
        "berätta om björnar?", "berätta om katter?", "berätta om hundar?"
    ]  # Questions the AI compares against to decide what the user is asking about.
    articles = [
        'https://sv.wikipedia.org/wiki/Bj%C3%B6rnar',
        'https://sv.wikipedia.org/wiki/Katt',
        'https://sv.wikipedia.org/wiki/Hund'
    ]  # Links the AI pulls information from to answer the user's questions.
    results = []
    index = 0
    for question in questions:
        # Measure how similar the user's question is to each of the known questions.
        question_similarity = SequenceMatcher(a=questions[index],
                                              b=user_response).ratio()
        results.append(question_similarity)  # Store each similarity score in a list.
        index += 1
    results_sorted = sorted(results, reverse=True)
    index = 0
    for result in results:
        # Walk through the scores and keep the best match.
        if result == results_sorted[0]:
            # Pick the link whose article best answers the user's question.
            article = Article(articles[index])
            article.download()
            article.parse()
            article.nlp()
            text = article.text
            # Convert the article text into a list of sentences.
            sent_tokens = nltk.sent_tokenize(text)
            remove_punct_dict = dict(
                (ord(punct), None) for punct in string.punctuation)
            break
        else:
            index += 1
    sent_tokens.append(user_response)
    TfidVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = TfidVec.fit_transform(sent_tokens)
    # Score how similar the user's input is to each sentence in the article.
    vals = cosine_similarity(tfidf[-1], tfidf)
    # Pick the sentence most similar to the user's input.
    idx = vals.argsort()[0][-2]
    flat = vals.flatten()
    flat.sort()
    score = flat[-2]
    if score < 0.1:
        bot_response = bot_response + "Jag förstår tyvärr inte."
    else:
        # If the score is above 0.1, the bot answers with the best-matching sentence.
        bot_response = bot_response + sent_tokens[idx]
    sent_tokens.remove(user_response)
    return bot_response
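The snippet above passes a LemNormalize tokenizer to TfidfVectorizer without showing it. A common definition of that helper, offered here as an assumption rather than the original code:

import string
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
lemmer = WordNetLemmatizer()

def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]

def LemNormalize(text):
    # lowercase, strip punctuation, tokenize, then lemmatize
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))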
def root_from(root_urls, target_dir, delay=0.2): if not os.path.exists(target_dir): os.makedirs(target_dir) # This function is based on https://github.com/heximhotep/fakenews_scraper # it starts with a list of root urls # for each root url it gets other urls on that page and forks in this way, # scraping everything on the road # use this for home news server pages, but not for direct news download # you will get a lot of articles, but not all necessarily connected with the original search visited_urls = set([]) saved_articles = set([]) article_lengths = dict([]) while (True): if (len(root_urls) == 0): break root_url = root_urls[0] root_urls = root_urls[1:] # print(root_urls) if (root_url in visited_urls): continue else: visited_urls.add(root_url) root_paper = newspaper.build(root_url) print(root_url, 'size:', root_paper.size()) print('category urls count:', len(root_paper.category_urls())) adjacent_urls = root_paper.category_urls() for adj_url in adjacent_urls: if (adj_url in visited_urls): continue root_urls.append(adj_url) # print(root_urls) index = 0 visited_streak = 0 for carticle in root_paper.articles: if (visited_streak > 26): break article = Article(carticle.url) try: article.download() article.parse() article_name = fileName( None if article.authors == [] else article.authors[0], article.title, article.publish_date) if (article_name in saved_articles and len(article.text) <= article_lengths[article_name]): print('skipping article') visited_streak += 1 continue visited_streak = 0 article.nlp() saved_articles.add(article_name) article_lengths[article_name] = (len(article.text)) payload = { "url": article.url, "title": article.title, "content": article.text } features = { "content": { "keywords": [{ "keyword": word } for word in article.keywords] } } articleJSON = { "features": features, "url": article.url, "date": article.publish_date, "title": article.title, "authors": article.authors, "body": article.text } with open(target_dir + "/" + article_name, 'w') as outfile: json.dump(articleJSON, outfile, indent=2, default=str) print("saved article") except ArticleException: continue except FileNotFoundError: continue except OSError: continue except UnicodeError: continue except Exception: continue index += 1 print(index) time.sleep(delay)
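root_from() above calls fileName(author, title, publish_date) without showing it. A plausible sketch of such a helper, building a filesystem-safe, reasonably unique name; this is an assumption, not the original implementation.

import re

def fileName(author, title, publish_date):
    # join whatever metadata is available, then slugify it
    parts = [p for p in (author, title,
                         publish_date.strftime('%Y%m%d') if publish_date else None) if p]
    slug = re.sub(r'[^A-Za-z0-9]+', '_', ' '.join(parts)).strip('_')
    return (slug or 'untitled')[:150] + '.json'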
class ArticleTestCase(unittest.TestCase):
    def setup_stage(self, stage_name):
        stages = OrderedDict([
            ('initial', lambda: None),
            ('download', lambda: self.article.download(
                mock_resource_with('cnn_article', 'html'))),
            ('parse', lambda: self.article.parse()),
            ('meta', lambda: None),  # Alias for nlp
            ('nlp', lambda: self.article.nlp())
        ])
        assert stage_name in stages
        for name, action in stages.items():
            if name == stage_name:
                break
            action()

    def setUp(self):
        """Called before the first test case of this unit begins
        """
        self.article = Article(
            url='http://www.cnn.com/2013/11/27/travel/weather-'
                'thanksgiving/index.html?iref=allsearch')

    @print_test
    def test_url(self):
        self.assertEqual(
            'http://www.cnn.com/2013/11/27/travel/weather-'
            'thanksgiving/index.html?iref=allsearch', self.article.url)

    @print_test
    def test_download_html(self):
        self.setup_stage('download')
        html = mock_resource_with('cnn_article', 'html')
        self.article.download(html)
        self.assertEqual(75406, len(self.article.html))

    @print_test
    def test_meta_refresh_redirect(self):
        # TODO: We actually hit example.com in this unit test ... which is bad
        # Figure out how to mock an actual redirect
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        html = mock_resource_with('google_meta_refresh', 'html')
        article.download(input_html=html)
        article.parse()
        self.assertEqual(article.title, 'Example Domain')

    @print_test
    def test_meta_refresh_no_url_redirect(self):
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        html = mock_resource_with('ap_meta_refresh', 'html')
        article.download(input_html=html)
        article.parse()
        self.assertEqual(article.title, 'News from The Associated Press')

    @print_test
    def test_pre_download_parse(self):
        """Calling `parse()` before `download()` should yield an error
        """
        article = Article(self.article.url)
        self.assertRaises(ArticleException, article.parse)

    @print_test
    def test_parse_html(self):
        self.setup_stage('parse')
        AUTHORS = ['Chien-Ming Wang', 'Dana A. Ford', 'James S.A. Corey',
                   'Tom Watkins']
        TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
        LEN_IMGS = 46
        META_LANG = 'en'
        self.article.parse()
        self.article.nlp()
        text = mock_resource_with('cnn', 'txt')
        self.assertEqual(text, self.article.text)
        self.assertEqual(text, fulltext(self.article.html))
        # NOTE: top_img extraction requires an internet connection
        # unlike the rest of this test file
        TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
                   '01-weather-1128-story-top.jpg')
        self.assertEqual(TOP_IMG, self.article.top_img)
        self.assertCountEqual(AUTHORS, self.article.authors)
        self.assertEqual(TITLE, self.article.title)
        self.assertEqual(LEN_IMGS, len(self.article.imgs))
        self.assertEqual(META_LANG, self.article.meta_lang)
        self.assertEqual('2013-11-27 00:00:00', str(self.article.publish_date))

    @print_test
    def test_meta_type_extraction(self):
        self.setup_stage('meta')
        meta_type = self.article.extractor.get_meta_type(
            self.article.clean_doc)
        self.assertEqual('article', meta_type)

    @print_test
    def test_meta_extraction(self):
        self.setup_stage('meta')
        meta = self.article.extractor.get_meta_data(self.article.clean_doc)
        META_DATA = defaultdict(dict, {
            'medium': 'news',
            'googlebot': 'noarchive',
            'pubdate': '2013-11-27T08:36:32Z',
            'title': 'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com',
            'og': {
                'site_name': 'CNN',
                'description': 'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.',
                'title': 'After storm, forecasters see smooth sailing for Thanksgiving',
                'url': 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
                'image': 'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg',
                'type': 'article'
            },
            'section': 'travel',
            'author': 'Dana A. Ford, James S.A. Corey, Chien-Ming Wang, and Tom Watkins, CNN',
            'robots': 'index,follow',
            'vr': {
                'canonical': 'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'
            },
            'source': 'CNN',
            'fb': {
                'page_id': 18793419640,
                'app_id': 80401312489
            },
            'keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm',
            'article': {
                'publisher': 'https://www.facebook.com/cnninternational'
            },
            'lastmod': '2013-11-28T02:03:23Z',
            'twitter': {
                'site': {
                    'identifier': '@CNNI',
                    'id': 2097571
                },
                'card': 'summary',
                'creator': {
                    'identifier': '@cnntravel',
                    'id': 174377718
                }
            },
            'viewport': 'width=1024',
            'news_keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm'
        })
        self.assertDictEqual(META_DATA, meta)
        # if the value for a meta key is another dict, that dict ought to be
        # filled with keys and values
        dict_values = [v for v in list(meta.values()) if isinstance(v, dict)]
        self.assertTrue(all([len(d) > 0 for d in dict_values]))
        # there are exactly 5 top-level "og:type" type keys
        is_dict = lambda v: isinstance(v, dict)
        self.assertEqual(5, len([i for i in meta.values() if is_dict(i)]))
        # there are exactly 12 top-level "pubdate" type keys
        is_string = lambda v: isinstance(v, str)
        self.assertEqual(12, len([i for i in meta.values() if is_string(i)]))

    @print_test
    def test_pre_download_nlp(self):
        """Test running NLP algos before even downloading the article
        """
        self.setup_stage('initial')
        new_article = Article(self.article.url)
        self.assertRaises(ArticleException, new_article.nlp)

    @print_test
    def test_pre_parse_nlp(self):
        """Test running NLP algos before parsing the article
        """
        self.setup_stage('parse')
        self.assertRaises(ArticleException, self.article.nlp)

    @print_test
    def test_nlp_body(self):
        self.setup_stage('nlp')
        self.article.nlp()
        KEYWORDS = [
            'balloons', 'delays', 'flight', 'forecasters', 'good', 'sailing',
            'smooth', 'storm', 'thanksgiving', 'travel', 'weather', 'winds',
            'york'
        ]
        SUMMARY = mock_resource_with('cnn_summary', 'txt')
        self.assertEqual(SUMMARY, self.article.summary)
        self.assertCountEqual(KEYWORDS, self.article.keywords)
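The tests above stay offline by feeding Article.download() a local HTML fixture. A minimal, self-contained variant of that idea; the inline HTML is a stand-in, not the cnn_article fixture, and the title assertion assumes newspaper picks up the <title> tag from such a small page.

import unittest
from newspaper import Article

class OfflineParseTest(unittest.TestCase):
    def test_parse_from_string(self):
        # no network access: hand the downloader a string instead of fetching the URL
        html = ("<html><head><title>Hello</title></head>"
                "<body><p>Some body text.</p></body></html>")
        article = Article(url='http://example.com/hello')
        article.download(input_html=html)
        article.parse()
        self.assertEqual('Hello', article.title)

if __name__ == '__main__':
    unittest.main()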
def recommend_article(article_feed_url, tag):
    """
    Takes a URL with articles as well as a tag and returns the recommended
    article title (for now) from this URL based on the tag. Currently selects
    the article with highest tag occurrences in its main text.
    :param article_feed_url: A URL
    :param tag: String
    :rtype: String
    """
    list_of_article_titles = []
    all_text = []
    key_words = []
    tag_frequency = []  # was used below without being initialised
    english_check = re.compile(r'[a-z]')
    if english_check.match(tag):  # english
        print("This is an English website")
        if valid_url(article_feed_url) and url_is_alive(article_feed_url):
            article_urls = get_article_links(article_feed_url)
            try:
                for article_url in article_urls:
                    if valid_url(article_url) == False or url_is_alive(
                            article_url) == False:
                        continue
                    # use the English model here (the original passed language='zh')
                    cur_article = Article(article_url, language='en')
                    cur_article.download()
                    cur_article.parse()
                    cur_article.nlp()  # keywords are needed for the classifier below
                    list_of_article_titles.append(cur_article.title)
                    tag_frequency.append(cur_article.text.lower().count(tag))
                    all_text.append(cur_article.text.lower())
                    key_words.append(cur_article.keywords)
                print("there are in total of {0} articles collected".format(
                    len(list_of_article_titles)))
            except:
                print("download limit exceeded... but the result so far is returned...")
                print("there are in total of ", len(list_of_article_titles),
                      ' articles collected')
        else:
            return 'Bad URL'
    else:  # chinese
        print("this is a Chinese website")
        soup = simple_get(article_feed_url)
        try:
            for article in soup.findAll('a', href=True):
                if article.text and article['href'] and len(
                        article.text.replace(' ', '')) >= 15:
                    cur_article = Article(article_feed_url + article['href'][1:],
                                          language='zh')
                    cur_article.download()
                    cur_article.parse()
                    cur_article.nlp()
                    list_of_article_titles.append(cur_article.title)
                    all_text.append(cur_article.text.lower())
                    key_words.append(cur_article.keywords)
            print("there are in total of ", len(list_of_article_titles),
                  ' articles collected')
            print("These are the titles of found articles: ",
                  list_of_article_titles)
        except:
            print("download limit exceeded... but the result so far is returned...")
            print("there are in total of ", len(list_of_article_titles),
                  ' articles collected')
    if not all_text:
        return None
    # create vector representation of our articles
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(all_text)
    # create binary target variable (whether or not the tag is in the keywords)
    y = []
    for keywords in key_words:
        check = False
        for keyword in keywords:
            if tag in keyword:
                check = True
                break
        if check:
            y.append(1)
        else:
            y.append(0)
    # build logistic regression model to find article with highest probability
    clf = LogisticRegression().fit(X, y)
    article_probs = clf.predict_proba(X)[:, 1]
    return list_of_article_titles[np.argmax(article_probs)]
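A toy-scale sketch of the ranking idea above: vectorise the article texts, label each one by whether the tag appears among its keywords, then rank by predicted probability. The texts and keywords are made up for illustration.

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

texts = ["bitcoin rallies as markets recover",
         "local team wins the championship",
         "new bitcoin fund launches this week"]
keywords = [["bitcoin", "markets"], ["sports"], ["bitcoin", "fund"]]
tag = "bitcoin"

X = CountVectorizer().fit_transform(texts)                      # bag-of-words features
y = [1 if any(tag in k for k in kws) else 0 for kws in keywords]  # tag-in-keywords label
clf = LogisticRegression().fit(X, y)
probs = clf.predict_proba(X)[:, 1]
print("best match:", texts[int(np.argmax(probs))])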
def post(self): data = json.loads(self.request.body.decode('utf-8')) # data = {} # tornado.httputil.parse_body_arguments(self.request.headers["Content-Type"], self.request.body, data) # # logging.getLogger().debug("args={}".format(data)) action = data.get('action') # action = action[0] # action = action.decode('utf-8') email = data.get('email') # email = email[0] # email = email.decode('utf-8') if action == 'load_page': id = get_user(email) id = id[0][0] full_summarization = get_mysummary(id) full = [] for data in full_summarization: send_summary = '' print(data[4]) for i in range(3): number = i + 1 send_summary = send_summary + '<p style = "margin: 10px 0px 10px 0px">' + data[ 4].split( '<p style = "margin: 10px 0px 10px 0px">')[number] print(send_summary) full.append({ 'title': data[0], 'link': data[1], 'photo': data[2], 'keywords': data[3], 'summary': send_summary, 'date': data[5] }) full.reverse() self.write(json.dumps({'summary': full})) elif action == 'summary': try: url = data.get('url') # url = url[0] # url = url.decode('utf-8') url = re.sub(' ', '', url) article = Article(url, language='en') article.download() article.parse() title = article.title if detect(title) != 'en' or detect(article.text) != 'en': self.write( json.dumps( {'result': 'This language will be supported soon'})) else: try: image = article.top_image except Exception: image = '' article.nlp() try: keywords = article.keywords keywords = ','.join(keywords) except Exception: keywords = 'Sorry,no,keywords,found' try: summary = article.summary summary = '<p style = "margin: 10px 0px 10px 0px">' + re.sub( r'\.', r'.</p><p style = "margin: 10px 0px 10px 0px">', summary) summary = summary[:-40] except Exception: summary = 'Sorry, no summmary found' try: publish_date = article.publish_date publish_date = publish_date.date() except Exception: publish_date = 'XII b.c.' if url[-1] == '/': summarized = { 'title': title, 'link': url, 'photo': image, 'keywords': keywords, 'summary': str(summary), 'date': str(publish_date) } else: url = url + '/' summarized = { 'title': title, 'link': url, 'photo': image, 'keywords': keywords, 'summary': str(summary), 'date': str(publish_date) } id = get_user(email) id = id[0][0] result = main_summarization(summarized, id) send_summary = '' for i in range(3): number = i + 1 send_summary = send_summary + '<p style = "margin: 10px 0px 10px 0px">' + summary.split( '<p style = "margin: 10px 0px 10px 0px">')[number] summarized = { 'title': title, 'link': url, 'photo': image, 'keywords': keywords, 'summary': send_summary, 'date': str(publish_date) } if result == 'You have this result': jsn = {'result': result} self.write(json.dumps(jsn)) else: self.write(json.dumps(summarized)) except Exception: jsn = {'result': 'This URL is unsummarizable'} self.write(json.dumps(jsn)) elif action == 'delete': url = data.get('url') id = get_user(email) id = id[0][0] main_delete(id, url) self.write(json.dumps({'result': 'done'}))
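This handler wraps each sentence of the summary in a styled <p> tag and then keeps only the first three paragraphs by splitting on that tag. An isolated sketch of that formatting step; the function name and the sample text are illustrative only.

import re

P_TAG = '<p style = "margin: 10px 0px 10px 0px">'

def format_summary(summary, paragraphs=3):
    # wrap every sentence in a styled paragraph, as the handler does
    html = P_TAG + re.sub(r'\.', '.</p>' + P_TAG, summary)
    parts = html.split(P_TAG)
    # parts[0] is the empty string before the first tag; keep the next `paragraphs` pieces
    return ''.join(P_TAG + part for part in parts[1:paragraphs + 1])

print(format_summary("First point. Second point. Third point. Fourth point."))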
def get_stocknews_byticker(self, tickersList, nitems=50, daysback=30, sortby='trending'): assert (sortby in ['trending', 'algo']) tickers = str(tickersList).replace('[', '').replace(']', '').replace( "'", '').replace(' ', '') urlInstructions = { 'ticker': tickers, 'nitems': nitems, 'fromdate_MMDDYYYY': (date.today() - datetime.timedelta(days=daysback)).strftime('%m%d%Y'), 'sortby': sortby, 'today': date.today(), } outfileName = 'Finance/temp/{ticker}-{nitems}-{fromdate_MMDDYYYY}-{sortby}-{today}.json'.format( **urlInstructions) text = self.bqu.read_string_from_gcp(self.bucketName, outfileName) if text is None: url = self.stocknews_url_template.format(**urlInstructions) print(url) response = requests.request("GET", url) text = response.text self.bqu.upload_string_to_gcp(response.text, self.bucketName, outfileName) data = json.loads(text) newsDict = data['data'] sentimentDict = { 'Count': 0, 'Negative': 0, 'Positive': 0, 'Neutral': 0, 'Weighted': 0 } sentimentWeight = {'Negative': -1, 'Positive': 1, 'Neutral': 0} count = 0 newsFeed = [] startTime = dt.utcnow() for newsItem in newsDict: count += 1 newItem = { key: newsItem[key] for key in [ 'title', 'news_url', 'text', 'sentiment', 'source_name', 'topics' ] } newItem['index'] = count itemDate = dt.strptime(newsItem['date'], '%a, %d %b %Y %H:%M:%S %z') delta = startTime.date() - itemDate.date() if delta.days <= 3 or count <= 3: newItem['date'] = str(itemDate.date()) if False: # suspend getting the summary article = Article(newItem['news_url']) # Do some NLP try: article.download() # Downloads the link’s HTML content article.parse() # Parse the article article.nlp() # Keyword extraction wrapper newItem['Summary'] = article.summary.replace( '\n', '\n') except Exception as e: print('Error occured:', e) newItem['Summary'] = "<...>" #print(newItem['Summary']) newsFeed.append(newItem) if delta.days <= 3: deltaWeight = 1 elif delta.days <= 7: deltaWeight = 0.5 elif delta.days <= 14: deltaWeight = 0.25 elif delta.days <= 30: deltaWeight = 0.125 else: deltaWeight = 0.05 sentiment = newsItem['sentiment'] sentimentDict[sentiment] += 1 sentimentDict['Count'] += 1 sentimentDict[ 'Weighted'] += sentimentWeight[sentiment] * deltaWeight retDict = { 'NumItems': len(newsFeed), 'Sentiment': sentimentDict, 'Newsfeed': newsFeed, } return retDict
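A small sketch of the time-decay weighting used above: recent items count fully, older ones progressively less, and the weighted sum yields a single sentiment score. The helper names and the example call are illustrative.

from datetime import datetime, timezone

def decay_weight(age_days):
    # same breakpoints as the handler above: 3, 7, 14, 30 days
    if age_days <= 3:
        return 1.0
    if age_days <= 7:
        return 0.5
    if age_days <= 14:
        return 0.25
    if age_days <= 30:
        return 0.125
    return 0.05

def weighted_sentiment(items, now=None):
    """items: iterable of (sentiment, date) pairs, sentiment in {'Positive', 'Negative', 'Neutral'}."""
    sign = {'Negative': -1, 'Positive': 1, 'Neutral': 0}
    now = now or datetime.now(timezone.utc)
    return sum(sign[s] * decay_weight((now.date() - d.date()).days) for s, d in items)

# weighted_sentiment([('Positive', datetime.now(timezone.utc))]) -> 1.0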