def text_from_urls(query):
    """Fetch the top article for each (from, to) date window and return a dict
    mapping (source.name, publishedAt, author) to the article's paragraph text."""
    newd = {}
    for from_dt, to_dt in zip(from_list, to_list):
        all_articles = newsapi.get_everything(q=query,
                                              language='en',
                                              sort_by='relevancy',
                                              from_param=from_dt,
                                              to=to_dt)
        d = json_normalize(all_articles['articles'])
        newdf = d[["url", "publishedAt", "source.name", "author"]].head(1)
        dic = newdf.set_index(["source.name", "publishedAt", "author"])["url"].to_dict()
        for k, v in dic.items():
            # Download the page, keep only the <p> tags, and strip any leftover markup.
            page = requests.get(v)
            soup = BeautifulSoup(page.content, "lxml")
            paragraphs = soup.find_all("p")
            newd[k] = re.sub(r'<.+?>', '', str(paragraphs))
    return newd
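# Usage sketch for text_from_urls. The function relies on module-level globals
# (`newsapi`, `from_list`, `to_list`); the values below are placeholders, not the
# originals, and the API key is a dummy.
from newsapi import NewsApiClient

newsapi = NewsApiClient(api_key='YOUR_NEWSAPI_KEY')
from_list = ['2021-01-01', '2021-01-08']   # hypothetical date windows
to_list = ['2021-01-07', '2021-01-14']

texts = text_from_urls('electric vehicles')
for (source_name, published_at, author), body in texts.items():
    print(source_name, published_at, author, body[:80])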
def func(query):
    """Collect every article matching `query` over all (from, to) date windows
    into a single DataFrame."""
    frames = []
    for from_dt, to_dt in zip(from_list, to_list):
        all_articles = newsapi.get_everything(q=query,
                                              language='en',
                                              sort_by='relevancy',
                                              from_param=from_dt,
                                              to=to_dt)
        frames.append(json_normalize(all_articles['articles']))
    # DataFrame.append was removed in pandas 2.0; build the result with concat instead.
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
def get_newsapi_articles():
    try:
        with open(NEWS_API_KEY_PATH, 'r') as key_file:
            key = key_file.read()
    except FileNotFoundError:
        print("Your personal News API key was not found. "
              "Please get one at 'https://newsapi.org/register' "
              "and add it to './articles/key_newsapi'.")
        return
    newsapi = NewsApiClient(api_key=key.strip())  # API key (do not release it)
    data = newsapi.get_everything(q=SEARCH_KEYWORD,
                                  language=SEARCH_LANGUAGE,
                                  page_size=OUTPUT_ARTICLES_NUMBER)
    with open(OUTPUT_FILENAME, 'w') as outfile:
        json.dump(data, outfile, indent=4)
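# Minimal configuration sketch for get_newsapi_articles. Only NEWS_API_KEY_PATH is
# named in the original message; the other constants are placeholder values standing
# in for whatever the module actually defines.
NEWS_API_KEY_PATH = './articles/key_newsapi'
SEARCH_KEYWORD = 'climate'            # hypothetical query
SEARCH_LANGUAGE = 'en'
OUTPUT_ARTICLES_NUMBER = 20
OUTPUT_FILENAME = 'articles.json'

get_newsapi_articles()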
article = Article(url)

# Let's calculate the score with an "automatic" method: key words in the whole article.
# Getting the article.
article.download()
article.parse()
article.nlp()

# Getting the key words; prefixing a word with '+' makes it mandatory in the NewsAPI query.
key_words = article.keywords
for i in range(5):
    key_words[i] = '+' + key_words[i]

# Finding the articles in the database that match the first three key words.
# Keywords are joined with spaces so each '+term' stays a separate mandatory term.
all_articles = newsapi.get_everything(q=" ".join(key_words[:3]))

# Getting the number of matching articles, i.e. the score.
score_1 = all_articles.get("totalResults")

# Let's calculate the score with a "manual" method: key words in the title.
titre = article.title

# Transforming the title into a txt file.
with open('titre.txt', 'w') as text_file:
    text_file.write(titre)

# Getting the key words: using sklearn's TfidfVectorizer.
vectorizer = TfidfVectorizer()
with open('titre.txt') as titre_file:
    X = vectorizer.fit_transform(titre_file)

# Transforming X into an array: X contains the weight of each word in the title.
X = X.toarray()
class news_text():
    newsapi = NewsApiClient(api_key='1ca90686b682467a97477cdef14ef436')
    everything = newsapi.get_everything(sources='financial-post', language='en')

    def assign_data(self):
        completearticles = []
        articles = []
        titles = []
        urls = []
        imgurls = []
        for dic in self.everything["articles"]:
            text = dic["content"] or ""  # content can be missing on some articles
            completearticles.append(dic["title"] + ". " + text)
            articles.append(text)
            titles.append(dic["title"])
            urls.append(dic["url"])
            imgurls.append(dic["urlToImage"])
        self.completearticles = completearticles
        self.articles = articles
        self.titles = titles
        self.urls = urls
        self.imgurls = imgurls

    def return_articles(self):
        return self.articles

    def return_titles(self):
        return self.titles

    def return_urls(self):
        return self.urls

    def return_imgurls(self):
        return self.imgurls

    # Preprocessing step before converting articles to vectors.
    def preprocess_text(self, text):
        # Drop the "… [+N chars]" truncation marker NewsAPI appends to content.
        text = text.split("… [+")[0]
        text = text.replace("\r", " ").replace("\n", " ")
        words = [w for w in text.split(" ") if w != ""]
        words = words[:len(words) - 1]
        text = " ".join(words).lower()

        tokenizer = RegexpTokenizer(r'\w+')  # tokenize words
        tokens = tokenizer.tokenize(text)
        punctuation = list(string.punctuation)
        stoplist = set(stopwords.words('english'))  # set gives hash-table lookups
        tokens = [WordNetLemmatizer().lemmatize(t) for t in tokens]  # lemmatize all tokens
        tokens = [w for w in tokens if not w.isdigit()]       # remove digits
        tokens = [w for w in tokens if len(w) > 2]             # remove words with 2 or fewer chars
        tokens = [w for w in tokens if w not in punctuation]   # remove punctuation
        tokens = [w for w in tokens if w not in stoplist]      # remove stopwords
        # stemmed = [sno.stem(words) for words in filtered_words]
        return " ".join(tokens)  # one cleaned string of the remaining words

    def return_processed_texts(self):
        articles = np.array(self.completearticles)
        return [self.preprocess_text(text) for text in articles]
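# Usage sketch for news_text. Note that the class makes a live NewsAPI call at
# class-definition time, so importing the module already needs network access and
# a valid key; the calls below only read and clean what was fetched.
nt = news_text()
nt.assign_data()
print(nt.return_titles()[:3])            # first three headlines
print(nt.return_processed_texts()[0])    # first article, cleaned for vectorisation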
# Connect to MongoDB and authorise the news client.
client = MongoClient()
db = client.dbasenews
ncollec = db.ncollec

newsapi = NewsApiClient(api_key='77a15f1c760b48799bb2186f0c5fd142')

# For each word in the text file, retrieve a single news article based on relevancy
# and store it both in a file and in the database.
with open('TextsExtracted.txt', 'r') as source, open('NewsExtracted.txt', 'w') as destination:
    for word in source:
        all_articles = newsapi.get_everything(q=word.strip(),
                                              language='en',
                                              from_param="2018-03-01",
                                              sort_by="relevancy",
                                              page_size=1)
        print(all_articles)
        json.dump(all_articles, destination)
        destination.write("\n")
        try:
            ncollec.insert_one(all_articles)  # Collection.insert() was removed in PyMongo 4
        except Exception:
            pass
# Works only for the first 1000 words; beyond that a paid NewsAPI plan is necessary.
with open("source.txt", "r") as file: for line in file: i = 0 for ch in line: letters[i] += ch i = i + 1 j = j + 1 file.close() text_file = open("source_sans_https", 'w') i = 0 while letters[i] != '': #utiliser replace à la place de la boucle if letters[i] == 'w' and letters[i + 1] == 'w' and letters[ i + 2] == 'w' and letters[i + 3] == '.': j = 4 while letters[i + j] != '': n = text_file.write(letters[i + j]) j = j + 1 i = i + 1 text_file.close() source = open("source_sans_https", 'r') url_a_rechercher = source.readline() print(url_a_rechercher) recherche = newsapi.get_everything(domains=url_a_rechercher) print(recherche) # key_words = article.keywords # print(key_words) # # all_articles = newsapi.get_everything(q=(key_words[0] and key_words[1] and key_words[2] and key_words[3] and key_words[4] and key_words[5] and key_words[6] and key_words[7] and key_words[8] and key_words[9] and key_words[10] and key_words[11] and key_words[12] and key_words[13])) # print(all_articles) # Total_number= all_articles.get("totalResults")
def fetch_news() -> dict:
    query = input("What do you want to hear about? ")
    return newsapi.get_everything(q=query)
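# Usage sketch for fetch_news. It assumes a module-level NewsApiClient named
# `newsapi`; the key below is a placeholder.
from newsapi import NewsApiClient

newsapi = NewsApiClient(api_key='YOUR_NEWSAPI_KEY')
results = fetch_news()  # prompts on stdin, then queries NewsAPI
print(results.get('totalResults'), 'articles found')
for article in results.get('articles', [])[:5]:
    print('-', article['title'])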
# Slow down the Plotly animation frames and tidy the layout before rendering.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000
fig.update_yaxes(automargin=True)
fig.update_layout(autosize=True)
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

st.write("Analyst recommendations")
st.plotly_chart(fig)

st.markdown("**Related news**")
for from_dt, to_dt in zip(from_list, to_list):
    all_articles = newsapi.get_everything(q=str(tickerSymbol),
                                          language='en',
                                          sort_by='relevancy',
                                          page_size=3,
                                          page=1,
                                          from_param=from_dt,
                                          to=to_dt)
    newdf = json_normalize(all_articles['articles'])
    st.write("***" + newdf['title'].values[0] + "***")
    st.write(newdf['content'].values[0] + "\n\n"
             + "You can find more about it here: " + newdf['url'].values[0] + "\n")
    # st.write("***" + "2] " + newdf['title'].values[1] + "***")
    # st.write(newdf['content'].values[1] + "\n\n" + "You can find more about it here: " + newdf['url'].values[1] + "\n")
    # st.write("***" + "3] " + newdf['title'].values[2] + "***")
    # st.write(str(newdf['content'].values[2]) + "\n\n" + "You can find more about it here: " + str(newdf['url'].values[2]) + "\n")