Example #1
def text_from_urls(query):
    """Return {(source, date, author): article paragraph text} for each date window."""
    newd = {}
    for (from_dt, to_dt) in zip(from_list, to_list):
        all_articles = newsapi.get_everything(q=query,
                                              language='en',
                                              sort_by='relevancy',
                                              from_param=from_dt,
                                              to=to_dt)
        d = json_normalize(all_articles['articles'])
        newdf = d[["url", "publishedAt", "source.name", "author"]]
        newdf = newdf.head(1)  # keep only the most relevant article of this window
        #print(newdf.head())
        # map (source, date, author) -> article url
        dic = newdf.set_index(["source.name", "publishedAt",
                               "author"])["url"].to_dict()
        #print(dic)
        for (k, v) in dic.items():
            #print(str(k[0])+str(k[1][5:10]))
            page = requests.get(v)
            html = page.content
            soup = BeautifulSoup(html, "lxml")
            d2 = soup.find_all("p")  # the <p> tags hold the article body
            # stringify the tag list and strip the remaining HTML markup
            newd[k] = re.sub(r'<.+?>', r'', str(d2))
    return newd
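The snippet above (like Example #2 and Example #9 below) relies on module-level objects that are not shown: a NewsApiClient instance called newsapi and two parallel lists of date strings, from_list and to_list, that define the search windows. A minimal sketch of that assumed setup, with a placeholder key and arbitrary one-day windows:

# Hypothetical setup assumed by the snippets (none of these values come from the original projects).
import re
from datetime import date, timedelta

import pandas as pd
import requests
from bs4 import BeautifulSoup
from newsapi import NewsApiClient
from pandas import json_normalize

newsapi = NewsApiClient(api_key="YOUR_NEWSAPI_KEY")  # placeholder key

# Seven one-day search windows ending today (arbitrary placeholder ranges).
today = date.today()
from_list = [(today - timedelta(days=n + 1)).isoformat() for n in range(7)]
to_list = [(today - timedelta(days=n)).isoformat() for n in range(7)]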
Example #2
def func(query):
    """Collect the matching articles from every date window into a single DataFrame."""
    newdf = pd.DataFrame()
    #query=match.groups()[0]
    for (from_dt, to_dt) in zip(from_list, to_list):
        all_articles = newsapi.get_everything(q=query,
                                              language='en',
                                              sort_by='relevancy',
                                              from_param=from_dt,
                                              to=to_dt)
        d = json_normalize(all_articles['articles'])
        newdf = pd.concat([newdf, d], ignore_index=True)  # DataFrame.append was removed in pandas 2.x

    return newdf
Example #3
def get_newsapi_articles():
    try:
        key = open(NEWS_API_KEY_PATH, 'r').read()
    except FileNotFoundError:
        print("Your personal News API key was not found. "
              "Please retrieve it from 'https://newsapi.org/register' "
              "and add it to './articles/key_newsapi'.")
        return  # without a key the client below cannot be created

    newsapi = NewsApiClient(api_key=key.strip())  # API key (do not publish it)

    data = newsapi.get_everything(q=SEARCH_KEYWORD, language=SEARCH_LANGUAGE, page_size=OUTPUT_ARTICLES_NUMBER)

    with open(OUTPUT_FILENAME, 'w') as outfile:
        json.dump(data, outfile, indent=4)
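get_newsapi_articles reads its configuration from module-level constants that are not part of this example. A hedged sketch of what they might look like; every value below is a placeholder, not the original project's configuration:

# Hypothetical values for the constants used above (assumptions, not from the original code).
NEWS_API_KEY_PATH = './articles/key_newsapi'  # path cited in the error message above
SEARCH_KEYWORD = 'bitcoin'                    # placeholder query
SEARCH_LANGUAGE = 'en'                        # placeholder language code
OUTPUT_ARTICLES_NUMBER = 20                   # page_size; the API caps it at 100
OUTPUT_FILENAME = 'articles.json'             # placeholder output path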
Example #4
article = Article(url)
# Let's calculate the score with an "automatic" method: keywords found in the whole article
# Getting the article
article.download()
article.parse()
article.nlp()
# Getting the key words
key_words = article.keywords
for i in range(5):
    # we absolutely want these words in the article: a leading '+' makes a term mandatory
    key_words[i] = '+' + key_words[i]
# Finding the articles in the database that match the first three keywords
all_articles = newsapi.get_everything(q=" ".join(key_words[:3]))  # e.g. "+word1 +word2 +word3"
# Getting the number of matching articles, i.e. the score
score_1 = all_articles.get("totalResults")
# Let's calculate the score with a "manual" method: keywords in the title.
titre = article.title
# Transforming the title into a txt file
text_file = open('titre.txt', "w")
text_file.write(titre)
text_file.close()
# Getting the key_words : using sklearn and TfidfVectorizer
titre = open('titre.txt')
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(titre)
titre.close()
# Transforming X into an array : X contains the weight of each word in the title
X = X.toarray()
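Once X holds the TF-IDF weight of every title word, the weights can be mapped back to the words through the vectorizer's vocabulary. A small continuation of the code above (get_feature_names_out needs scikit-learn 1.0+; older releases use get_feature_names):

# Pair each vocabulary word with its TF-IDF weight in the title and rank them.
words = vectorizer.get_feature_names_out()
weights = X[0]  # X has a single row because the title is a single document
ranked = sorted(zip(words, weights), key=lambda wv: wv[1], reverse=True)
print(ranked[:5])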
Example #5
class news_text():
    newsapi = NewsApiClient(api_key='1ca90686b682467a97477cdef14ef436')
    everything = newsapi.get_everything(sources='financial-post',
                                        language='en')

    def assign_data(self):
        completearticles = []
        articles = []
        titles = []
        urls = []
        imgurls = []
        dictionaries = self.everything["articles"]
        for dic in dictionaries:
            text = (dic["content"])
            completearticles.append(dic["title"] + ". " + text)
            articles.append(text)
            titles.append(dic["title"])
            urls.append(dic["url"])
            imgurls.append(dic["urlToImage"])
        self.completearticles = completearticles
        self.articles = articles
        self.titles = titles
        self.urls = urls
        self.imgurls = imgurls

    def return_articles(self):
        return (self.articles)

    def return_titles(self):
        return (self.titles)

    def return_urls(self):
        return (self.urls)

    def return_imgurls(self):
        return (self.imgurls)

    #preprocessing step before converting to vectors
    def preprocess_text(self, text):
        textlist = text.split("… [+")
        text2 = textlist[0]
        text2 = text2.replace("\r", " ")
        text2 = text2.replace("\n", " ")
        textlist = text2.split(" ")
        textlist = [text for text in textlist if text != ""]
        textlist = textlist[:-1]  # drop the last word, which the API truncation may have cut off
        text = " ".join(textlist)
        text = text.lower()
        tokenizer = RegexpTokenizer(r'\w+')  #tokenize words
        tokens = tokenizer.tokenize(text)
        punctuation = list(string.punctuation)
        stoplist = stopwords.words('english')
        stoplist = set(stoplist)  #like a list, but can use hash table
        tokens = [WordNetLemmatizer().lemmatize(token)
                  for token in tokens]  #lemmatize all tokens
        tokens = [w for w in tokens if not w.isdigit()]  #remove digits
        tokens = [w for w in tokens
                  if len(w) > 2]  #remove words having 2 or less chars
        tokens = [w for w in tokens
                  if not w in punctuation]  #remove punctuations
        tokens = [w for w in tokens if not w in stoplist]  #remove stopwords
        #     stemmed = [sno.stem(words) for words in filtered_words]
        return (" ".join(tokens)
                )  #remove large sentence with all purified words

    def return_processed_texts(self):
        articles = np.array(self.completearticles)
        processed_articles = [self.preprocess_text(text) for text in articles]
        return (processed_articles)
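A short usage sketch for the class above; it assumes the nltk/numpy/string imports the class relies on are in place and the nltk 'stopwords' and 'wordnet' corpora have been downloaded:

# Hypothetical usage (not part of the original snippet).
news = news_text()
news.assign_data()                      # pull titles, URLs and content from the API response
clean = news.return_processed_texts()   # lower-cased, lemmatized, stopword-free article texts
for title, text in zip(news.return_titles(), clean):
    print(title, "->", text[:80])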
Example #6
client = MongoClient()
db = client.dbasenews
ncollec = db.ncollec

# authorization for the news client

newsapi = NewsApiClient(api_key='77a15f1c760b48799bb2186f0c5fd142')

# opening necessary files

source = open('TextsExtracted.txt', 'r')
destination = open('NewsExtracted.txt', "w")

# for each word in the text file, we retrieve a single news article based on relevancy and store it in a file and in a database

for word in source:
    all_articles = newsapi.get_everything(q=word,
                                          language='en',
                                          from_param="2018-03-01",
                                          sort_by="relevancy",
                                          page_size=1)
    print(all_articles)
    json.dump(all_articles, destination)
    destination.write("\n")
    try:
        ncollec.insert_one(all_articles)  # Collection.insert was removed in pymongo 4
    except Exception:
        pass  # skip documents that cannot be inserted

# Works only for the first 1000 words; beyond that a paid plan is necessary.
with open("source.txt", "r") as file:
    for line in file:
        i = 0
        for ch in line:
            letters[i] += ch
            i = i + 1
        j = j + 1
file.close()
text_file = open("source_sans_https", 'w')
i = 0
while letters[i] != '':
    # use replace instead of the loop
    if (letters[i] == 'w' and letters[i + 1] == 'w'
            and letters[i + 2] == 'w' and letters[i + 3] == '.'):
        j = 4
        while letters[i + j] != '':
            n = text_file.write(letters[i + j])
            j = j + 1
    i = i + 1
text_file.close()
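As the comment inside the loop above suggests, the character-by-character scan can be replaced with string methods. A minimal sketch of that alternative, assuming source.txt holds one URL per line:

# Shorter equivalent using split(): keep only what follows "www." on each line.
with open("source.txt", "r") as src, open("source_sans_https", "w") as dst:
    for line in src:
        if "www." in line:
            dst.write(line.split("www.", 1)[1])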
source = open("source_sans_https", 'r')
url_a_rechercher = source.readline()
print(url_a_rechercher)
recherche = newsapi.get_everything(domains=url_a_rechercher)
print(recherche)
# key_words = article.keywords
# print(key_words)
#
# all_articles = newsapi.get_everything(q=(key_words[0] and key_words[1] and key_words[2] and key_words[3] and key_words[4] and key_words[5] and key_words[6] and key_words[7] and key_words[8] and key_words[9] and key_words[10] and key_words[11] and key_words[12] and key_words[13]))
# print(all_articles)
# Total_number= all_articles.get("totalResults")
Example #8
def fetch_news() -> dict:
    query = input("What do you want to hear about? ")
    return newsapi.get_everything(q=query)
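A brief usage sketch, assuming a NewsApiClient has been created as newsapi the way the earlier examples do; totalResults and articles are keys of the get_everything response:

# Hypothetical call: prompt for a topic and show how many articles matched.
response = fetch_news()
print(response.get("totalResults"), "articles found")
for article in response["articles"][:3]:
    print(article["title"], "-", article["url"])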
Example #9
        fig.layout.updatemenus[0].buttons[0].args[1]["frame"][
            "duration"] = 2000
        fig.update_yaxes(automargin=True)
        fig.update_layout(autosize=True)
        fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
        fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

        st.write("Analyst recommend this")
        st.plotly_chart(fig)

        st.markdown("**" + "Related news" + "**")
        for (from_dt, to_dt) in zip(from_list, to_list):
            all_articles = newsapi.get_everything(q=str(tickerSymbol),
                                                  language='en',
                                                  sort_by='relevancy',
                                                  page_size=3,
                                                  page=1,
                                                  from_param=from_dt,
                                                  to=to_dt)
            newdf = json_normalize(all_articles['articles'])
            #newdf=d[["url","source.name","title","content"]]
            st.write("***" + newdf['title'].values[0] + "***")
            st.write(newdf['content'].values[0] + "\n\n" +
                     "You can find more about it here: " +
                     newdf['url'].values[0] + "\n")
        # st.write("***"+"2] "+newdf['title'].values[1]+"***")

        #st.write(newdf['content'].values[1]+"\n\n"+"You can find more about it here: "+newdf['url'].values[1]+"\n")

        #st.write("***"+"3] "+newdf['title'].values[2]+"***")
        #st.write(str(newdf['content'].values[2])+"\n\n"+"You can find more about it here: "+str(newdf['url'].values[2])+"\n")