Example #1
def getsearchresult():
    searchup = defaultdict(list)
    newsapi = NewsApiClient(api_key="d11761b89fdb4599b1497bf951690000")
    keyword_ = request.args.get('keyword')
    print("keyword", keyword_)
    from_ = request.args.get('from')
    print("from", from_)
    to_ = request.args.get('todate')
    print("to", to_)
    source_ = request.args.get('source')
    print("source", source_)

    if source_ == "all":
        try:
            final_result = newsapi.get_everything(q=keyword_,
                                                  from_param=from_,
                                                  to=to_,
                                                  language="en",
                                                  page_size=30,
                                                  sources="",
                                                  sort_by="publishedAt")
        except NewsAPIException as error:
            return str(error)
    else:
        try:
            final_result = newsapi.get_everything(q=keyword_,
                                                  from_param=from_,
                                                  to=to_,
                                                  sources=source_,
                                                  language="en",
                                                  page_size=30,
                                                  sort_by="publishedAt")
        except NewsAPIException as error:
            return str(error)

    data = final_result["articles"]
    # Keep only articles where every field of interest is present and non-empty.
    required_fields = ('title', 'author', 'description', 'source', 'url',
                       'urlToImage', 'publishedAt')
    for article in data:
        if any(article[field] is None or article[field] == "" or article[field] == "null"
               for field in required_fields):
            continue
        searchup['articles'].append(article)

    return jsonify(searchup)
Example #2
def getCryptoNews(startDate, endDate):
    news = []
    try:
        newsapi = NewsApiClient(api_key='ef9f89cce9b24cfe9ed9b61f900cc1b1')
        # Run the same search for each crypto-related keyword and pool the articles.
        for query in ('bitcoin', 'btc', 'cryptocurrency', 'blockchain'):
            result = newsapi.get_everything(q=query,
                                            sources='crypto-coins-news,bloomberg,reuters,google-news',
                                            domains='cnn,bloomberg,reuters,google',
                                            from_param=startDate,
                                            to=endDate,
                                            language='en')
            news.extend(result['articles'])
    except Exception:
        print('Error in Reading BTC News!')

    return news
Example #3
def customsearchresults(request):
    user_query = request.GET['search']
    newsapi = NewsApiClient(api_key='e714e075a7534f85b7e0bdfd2330c611')
    all_articles = newsapi.get_everything(q=user_query,
                                          language='en',
                                          sort_by='relevancy')
    all_articles = all_articles['articles']
    json_content = []
    for news_object in all_articles:

        # r1 = requests.get(news_object['url'])
        # text = r1.content
        # soup = BeautifulSoup(text, 'html.parser')
        # paragraph_list = soup.find_all('p')
        # whole_content = ""
        # json_content = []
        # for item in range(len(paragraph_list)):
        #     whole_content = whole_content + " " + paragraph_list[item].get_text()

        custom_object = {
            "heroes": "To be decided",
            "victim": "To be decided",
            "villian": "To be decided",
            "source": news_object["source"]["name"],
            "author": news_object["author"],
            "title": news_object['title'],
            "shortdescription": news_object['description'],
            "urlToImage": news_object["urlToImage"],
            "url": news_object["url"],
        }
        json_content.append(custom_object)

    return Response(json_content)
Example #4
 def get(self,
         fields=["image", "title", "image", "link"],
         limit=20,
         **kargs):
     newsapi = NewsApiClient(api_key=self.__key)
     all_news = newsapi.get_everything(**kargs)
     return self.__transformDate(all_news.get("articles"), fields, limit)
Example #5
def gen_sentiment_df(stock="DJI"):
    newsapi = NewsApiClient(api_key='f8970a68f49e43a18c9b5aff8e2bcfe1')
    a = date(2020, 2, 29)
    b = date(2020, 3, 27)
    sentiments = {}
    query = "stocks & " + stock
    for dt in rrule(DAILY, dtstart=a, until=b):
        str_date = str(dt.strftime("%Y-%m-%d"))
        all_articles = newsapi.get_everything(q=query,
                                              from_param=str_date,
                                              to=str_date,
                                              language='en',
                                              sort_by='relevancy',
                                              page=1)
        headlines = ""
        for a in all_articles['articles']:
            if isinstance(a["title"], str):
                headlines += a["title"]
        i = analyze_sentiment(headlines)
        sentiments[dt] = {i.magnitude, i.score}
    sentiment_df = pd.DataFrame(list(sentiments.values()),
                                columns=["magnitude", "score"],
                                index=sentiments.keys())
    sentiment_df = sentiment_df.fillna(0)
    return sentiment_df
Example #6
    def get_news(self):
        """
        This function makes a request using authenticated api and instance argument to extract news articles.
        :return: dataframe with articles and other metadata relevant to each article
        """
        start = (datetime.today() - timedelta(days=7)).strftime('%Y-%m-%d')
        end = datetime.today().strftime('%Y-%m-%d')
        # Initialise NewsApiClient with an api key
        newsapi = NewsApiClient(api_key=nc.api_key)

        # Combine the individual queries with OR inside one parenthesised group.
        query = '(' + ' OR '.join(self.queries) + ')'

        # Query for articles using keyword
        all_articles = newsapi.get_everything(q=query,
                                              from_param=start,
                                              to=end,
                                              language='en',
                                              sort_by='relevancy',
                                              page_size=100)
        # Extract articles from returned json and store in articles variable
        articles = all_articles['articles']
        # Convert articles into dataframe
        articles_df = pd.DataFrame(articles)
        # Use only name part in the source columns
        articles_df['source'] = articles_df.source.map(lambda x: x['name'])
        # Select relevant columns for analysis
        articles_df = articles_df[[
            'source', 'title', 'url', 'publishedAt', 'content'
        ]]
        articles_df.columns = [
            'Source', 'Title', 'Url', 'Published', 'Content'
        ]

        return articles_df
Example #7
def newsapi(stock):
    # newsapi_symbol = input("Enter a symbol")
    newsapi = NewsApiClient(api_key='861ff0ffbaaa4eaa9571ce516cc5e088')

    all_articles = newsapi.get_everything(q=stock,
                                          language='en',
                                          sort_by='publishedAt',
                                          page_size=100)

    sources = newsapi.get_sources()

    title = []
    desc = []

    i = 1
    pos, neg, neu = 0, 0, 0

    for article in all_articles['articles']:
        a = str(article['content'])
        title.append(
            str(article['title']) + ' : \n' + str(article['description']))
        # desc.append(str(article['description']))
        b = article['source']
        c = article['publishedAt']
        # print(i, a)
        i += 1

        analysis = TextBlob(a)
        if analysis.sentiment.polarity > 0:
            # print('\nPositive:\n', a)
            # print('The source is:', b['name'])
            # print('It was published at:', c)
            pos += 1

        elif analysis.sentiment.polarity == 0:
            # print('\nNeutral:\n', a)
            # print('The source is:', b['name'])
            # print('It was published at:', c)
            neu += 1

        else:
            # print('\nNegative:\n', a)
            # print('The source is:', b['name'])
            # print('It was published at:', c)
            neg += 1

    # print(title)

    total = pos + neg + neu
    pos_news, neg_news, neu_news = pos / total, neg / total, neu / total

    # Map the net sentiment (pos_news - neg_news, in [-1, 1]) onto a 0-100 scale.
    output = ((pos_news - neg_news + 1) * 100) / 2
    # print(output)
    return output, title, desc
Example #8
def searchresults(request):
    user_query = request.GET['search']
    newsapi = NewsApiClient(api_key='e714e075a7534f85b7e0bdfd2330c611')
    all_articles = newsapi.get_everything(q=user_query,
                                          language='en',
                                          sort_by='relevancy')

    return Response(all_articles)
Example #9
def news():
    if request.method == 'GET':
        return render_template('news.html')
    else:
        # request was a POST
        application.vars['keyword'] = request.form['keyword']
        application.vars['earliest'] = request.form['earliest']
        application.vars['latest'] = request.form['latest']

        newsapi = NewsApiClient(api_key='dc919a5aeb324f01b0db89373fd71749')
        keyword = application.vars['keyword']
        oldest = application.vars['earliest']
        latest = application.vars['latest']
        print(keyword)

        articles_page = newsapi.get_everything(q=keyword,
                                               from_param=oldest,
                                               to=latest,
                                               language='en',
                                               sort_by='popularity')
        print(articles_page['totalResults'])
        total = articles_page['totalResults']
        maxpage = math.ceil(total / 20)
        articles = []
        print(maxpage)

        for i in range(1, maxpage + 1):
            articles_page = newsapi.get_everything(q=keyword,
                                                   from_param=oldest,
                                                   to=latest,
                                                   language='en',
                                                   sort_by='popularity',
                                                   page=i)
            articles.extend(articles_page['articles'])

        articles_df = pd.DataFrame(articles)
        print(len(articles_df))
        print(articles_df)
        resp = make_response(articles_df.to_csv())
        resp.headers[
            "Content-Disposition"] = "attachment; filename=export_news.csv"
        resp.headers["Content-Type"] = "text/csv"
        return resp
Example #10
def get_articles(stock):
    newsapi = NewsApiClient(api_key='861ff0ffbaaa4eaa9571ce516cc5e088')

    all_articles = newsapi.get_everything(q=stock,
                                          language='en',
                                          sort_by='publishedAt',
                                          page_size=100)

    print(all_articles)

    return all_articles
Example #11
def get_news():

    # with app.app_context():

    newsapi = NewsApiClient(api_key='29b0d1fda8b6452fb4df7d86a3dc5b9a')
    data = newsapi.get_everything(q='health and fitness',
                                  language='en',
                                  page_size=20)
    articles = data['articles']
    # print(type(articles[0])) It showed that it is of dict type.
    articles_json = json.dumps(articles)
    return articles_json
Example #12
def getInfo(query):
    #Key to access GoogleNews API
    query = query.lower()
    # query = urllib.parse.quote_plus(query)
    newsapi = NewsApiClient(api_key='edf0afe93d6644d198d8539e640134c9')
    # print(query)

    headlines = newsapi.get_top_headlines(q=query, language='en')
    # print(headlines)
    # headlines = newsapi.get_top_headlines(q=query, language='en')

    newsTitles = list()
    newsContent = list()
    newsSources = list()
    newsURL = list()
    # print("number of articles found = " + str(len(headlines['articles'])))

    #Adds all relevant information to separate lists
    numberOfArticles = len(headlines['articles'])
    if numberOfArticles > 5:
        numberOfArticles = 5
    for x in range(numberOfArticles):
        source = headlines['articles'][x]['source']['name']
        if source == "Google News" or source == "Reuters" or source == "Financial Times":
            print(source)
            # x -= 1
            continue
        newsTitles.append(headlines['articles'][x]['title'])
        newsContent.append(headlines['articles'][x]['content'])
        newsSources.append(headlines['articles'][x]['source']['name'])
        newsURL.append(headlines['articles'][x]['url'])

    if len(newsTitles) < 5:
        today = datetime.datetime.today()
        start_day = today - datetime.timedelta(days=1)
        headlines_all = newsapi.get_everything(q=query,
                                               from_param=start_day.strftime('%Y-%m-%d'),
                                               to=today.strftime('%Y-%m-%d'),
                                               language='en',
                                               sort_by='relevancy')
        for x in range(5 - len(newsTitles)):
            source = headlines_all['articles'][x]['source']['name']
            if source == "Google News" or source == "Reuters" or source == "Financial Times":
                print(source)
                # x -= 1
                continue
            newsTitles.append(headlines_all['articles'][x]['title'])
            newsContent.append(headlines_all['articles'][x]['content'])
            newsSources.append(headlines_all['articles'][x]['source']['name'])
            newsURL.append(headlines_all['articles'][x]['url'])

    return newsTitles, newsContent, newsSources, newsURL
Example #13
def search_articles(query_string, domain_blacklist_string,
                    domain_whitelist_string):
    newsapi = NewsApiClient(api_key='391c4cadc42a4a42aaf1ea266df4adfc')

    headlines = newsapi.get_everything(
        q=query_string,
        language='en',
        sort_by='relevancy',
        page_size=100,
        domains=domain_whitelist_string
        # exclude_domains=domain_blacklist_string
    )
    return headlines
Example #14
def get_everything(keyword, fromdate, currentdate):
    newsapi = NewsApiClient(api_key='')
    articles = newsapi.get_everything(q=keyword,
                                      from_param=fromdate,
                                      to=currentdate,
                                      language='en',
                                      sort_by='relevancy',
                                      page_size=20)

    content = []
    if articles['totalResults'] > 20:
        for article in articles['articles']:
            detail = {'title': article['title'], 'url': article['url'], 'source': article['source']['name']}
            content.append(detail)

    return content
Example #15
def newsfeed(area):  #Area specific NewsFeed
    today = datetime.utcnow().date()
    duration = datetime.utcnow().date() - timedelta(days=5)

    #API KEY

    news = NewsApiClient(api_key="0de6787ea7e44a8691ce4a5d556d18dc")

    #News extraction
    top_headlines = news.get_everything(
        q='+{} AND (quarantine OR corona OR covid OR lockdown) NOT positive NOT Deadlier NOT Dead'
        .format(area),
        from_param=duration,
        language='en')

    # sentiment analysis using Textblob

    return sentiment(top_headlines, 0.0)
Example #16
def search():
    k = request.args.get('k', '')
    f = request.args.get('f', '')
    t = request.args.get('t', '')
    c = request.args.get('c', '')
    s = request.args.get('s', '')
    print(k)
    print(f)
    print(t)
    print(c)
    print(s)
    if s == "all":
        s = ""

    newsapi = NewsApiClient(api_key='3061013219ce4282b5d26bdcf8b9f966')
    pageSize = 30
    results = newsapi.get_everything(q=k,
                                     sources=s,
                                     from_param=f,
                                     to=t,
                                     language='en',
                                     page_size=pageSize,
                                     sort_by='publishedAt')

    searchCount = 0
    searchData = []
    for i in results['articles']:
        if (searchCount != 15 and i["publishedAt"] and i["urlToImage"]
                and i["author"] and i["description"] and i["url"]):
            temp = i["source"]
            searchData.append(i["urlToImage"])
            searchData.append(i["title"])
            searchData.append(i["description"])
            searchData.append(i["author"])
            searchData.append(temp["name"])
            searchData.append(i["publishedAt"])
            searchData.append(i["url"])
            searchCount = searchCount + 1

    return jsonify({"results": searchData})
Example #17
def news(request):
    newsapi = NewsApiClient(api_key='149dd9c6ff0c47cfae0d743f73171729')

    news = newsapi.get_everything(q='covid', language='en')
    articles = news['articles']

    desc = []
    titles = []
    img = []
    url = []

    for article in articles:
        titles.append(article['title'])
        desc.append(article['description'])
        img.append(article['urlToImage'])
        url.append(article['url'])

    newslist = zip(titles, desc, img, url)
    context = {"newslist": newslist}

    return render(request, template_name='news.html', context=context)
Example #18
class NewsSources:
    def __init__(self):
        self.db = DBManager()
        self.news_api = NewsApiClient(api_key=newsApiKey)

    def getNewsFromSources(self):
        newsData = self.news_api.get_everything(
            sources=self.db.getNewsApiSourcesIDs(),
            language=appLanguage,
            page_size=pageSize)['articles']
        res = {'articlesData': {}, 'articlesURLs': []}
        for article in newsData:
            articleDict = {
                'title': article['title'],
                'url': article['url'],
                'image': article['urlToImage'],
                'time': article['publishedAt'],
                'newsApiID': article['source']['id']
            }
            res['articlesData'][article['url']] = articleDict
            res['articlesURLs'].append(article['url'])
        return res
Example #19
def get_news_articles(queries, fromdate, todate):
    newsapi = NewsApiClient(api_key='2b7935c2680f46b487d833129210d4c3')
    articles_to_find = 5
    articles = []
    for query in queries:
        all_articles = newsapi.get_everything(q=query,
                                              from_param=fromdate,
                                              to=todate,
                                              sort_by='relevancy',
                                              page=1)
        articles_found = all_articles['articles']
        for found_article in articles_found:
            if found_article not in articles and len(articles) < articles_to_find:
                articles.append(found_article)
        if len(articles) == articles_to_find:
            articles = clean_articles(articles)
            return articles
    # if all the queries together do not return 5 different articles, return what was found
    articles = clean_articles(articles)
    return articles
Example #20
def get_news(topic):

    newsClient = NewsApiClient(api_key="bad068d6ce6c4ccfb30eb5785c360efe")
    # q is search terms, category for category of news, language is english
    # if possible (foreign news may not be english)
    keyWords = topic + " soccer"
    sportsSources = newsClient.get_sources(category="sports")
    # Build a comma-separated string of every sports source id.
    sourceIds = ','.join(s['id'] for s in sportsSources['sources'])
    threeDaysAgo = datetime.now().date() - timedelta(days=3)  # date 3 days ago
    topHeadlines = newsClient.get_everything(q=keyWords,
                                             sources=sourceIds,
                                             language='en',
                                             sort_by='relevancy',
                                             from_param=threeDaysAgo)
    articles = topHeadlines['articles'][:3]
    return json.dumps(articles)
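The source-id string above relies on get_sources(category="sports") returning a 'sources' list of dicts that carry an 'id' field. A minimal sketch for inspecting that structure (the API key below is a placeholder, not one of the keys used in these examples):

from newsapi.newsapi_client import NewsApiClient

# Placeholder key for illustration only.
client = NewsApiClient(api_key="YOUR_API_KEY")

# Each entry carries the id that get_everything(sources=...) expects.
for src in client.get_sources(category="sports")["sources"]:
    print(src["id"], "-", src["name"])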
Example #21
def html_table():
    supplierdf = pd.read_excel('suppp.xlsx')
    supplierdf['name'] = supplierdf['name'].str.strip()
    supplierlist = supplierdf['name'].values.tolist()
    supplierlist = [k.lower() for k in supplierlist]

    newsapi = NewsApiClient(api_key='b320e2b793644396bdbeded93ff9d702')

    # Collect one row per article, tagged with the supplier it was searched for.
    rows = []
    for supplier in supplierlist:
        all_articles = newsapi.get_everything(q=supplier, language='en')
        for article in all_articles['articles']:
            row = dict(article)
            row['Supplier Name'] = supplier
            rows.append(row)
    dj = pd.DataFrame(rows)

    newsdf = dj.drop_duplicates(subset='title', keep="first")
    newsdf = newsdf.reset_index()
    newsdf = newsdf.drop(columns=['index'])
    return render_template('simple.html',
                           tables=[newsdf.to_html(classes='data')],
                           titles=newsdf.columns.values)
Example #22
def get_newsurls(limit=5, query=None):
    """Returns list of news urls from newsapi.org."""

    # initialise newsapi
    KEY = 'key'  # insert your newsapi key
    newsapi = NewsApiClient(api_key=KEY)

    results = list()
    if query is None:  # get top headlines for Singapore news
        top_headlines_results = newsapi.get_top_headlines(language='en',
                                                          country='sg')
        results.extend(top_headlines_results['articles'][:limit])

    else:  # get search results from a query
        search_results = newsapi.get_everything(
            q=query,
            language='en',
            domains='channelnewsasia.com, todayonline.com, straitstimes.com')
        results.extend(search_results['articles'][:limit])

    urls = [article['url'] for article in results]

    return urls
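A short usage sketch for the helper above, assuming a real newsapi key has been placed in KEY; the 'haze' query is purely illustrative:

# Hypothetical calls; the query value is only an example.
headline_urls = get_newsurls(limit=3)              # top Singapore headlines
search_urls = get_newsurls(limit=3, query='haze')  # keyword search
print(headline_urls)
print(search_urls)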
Example #23
    def collect_n_save_news(self):
        company = "Tesla"
        symbol = "TSLA"

        api_key = env("NEWSAPI_KEY")

        newsapi = NewsApiClient(api_key=api_key)

        start_time = self.rounded_to_the_last_30th_minute_epoch()
        end_time = start_time + timedelta(minutes=30)

        news = newsapi.get_everything(
            q=company,
            language="en",
            from_param=start_time,
            to=end_time,
            sort_by="popularity",
            page_size=100,
            page=1,
        )

        self.save_news_metrics(symbol, company, end_time,
                               {"news_count": len(news["articles"])})
Example #24
def get_news ():
  API_KEY = "a621f645307c47129920cf7858d1dffe"
  newsapi = NewsApiClient(api_key=API_KEY)
  keywords = "covid19"
  all_articles = newsapi.get_everything(q=keywords, language="en")
  return all_articles["articles"]
Example #25
class NewsApiHandle:
    """
    This class contains methods to handle everything related to the use of news api
    """
    def __init__(self, API_Key, keyword_list):
        """
        This method initializes the news api client, joins the given list of keywords
        with AND into a single query, runs the query and stores the result in the variable response

        :type keyword_list: list
        :param keyword_list: list of keywords to query the api
        """

        #initialize news client with the api key
        self.news_api = NewsApiClient(api_key=API_Key)

        # the string to place between the keywords
        AND = " AND "

        # add AND in between the keywords in the list
        query_string = AND.join(keyword_list)

        # initialize an empty list of titles
        self.title_list = []

        # initialize an empty list of meta descriptions
        self.descriptions_list = []

        # initialize an empty list of Urls
        self.Urls_list = []

        # initialize an empty list of sources
        self.sources_list = []

        # query the api
        response = self.news_api.get_everything(q=query_string,
                                                sort_by='relevancy')

        # parse at most 5 of the returned articles
        parse_length = min(5, len(response['articles']))

        # for each article parsed, collect its title, description, URL and source name
        for item in range(parse_length):

            # append every title to title list
            self.title_list.append(response["articles"][item]["title"])

            # append every description to description_list
            self.descriptions_list.append(
                response["articles"][item]["description"])

            # append every Urls to Urls_list
            self.Urls_list.append(response["articles"][item]["url"])

            # append every source to source list
            self.sources_list.append(
                response["articles"][item]['source']['name'])

    def get_URLs(self):
        """
        This method returns a maximum of 5 news Urls to extract content from (list)
        """
        return self.Urls_list

    def get_titles(self):
        """
        This method returns a maximum of 5 news titles (list)
        """
        return self.title_list

    def get_descriptions(self):
        """
        This method returns a maximum of 5 descriptions (list)
        """
        return self.descriptions_list

    def get_sources(self):
        """
        This method returns a maximum of 5 news source names (list)
        """
        return self.sources_list
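A minimal usage sketch of the class above; the key and the keyword list are placeholders rather than values taken from these examples:

# Hypothetical instantiation; replace the key and keywords with real values.
handle = NewsApiHandle(API_Key="YOUR_API_KEY",
                       keyword_list=["bitcoin", "regulation"])

# The constructor already ran the query, so the getters simply return
# the collected metadata (at most 5 entries each).
for title, url in zip(handle.get_titles(), handle.get_URLs()):
    print(title, url)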
Example #26
class NewsURL:
    def __init__(self, start_date, end_date):
        self.API_KEY1 = '9382dd6539f448e59de4ab7c8c214f6f'  #김민수
        self.API_KEY2 = '08fe48df23494ab0bb4faa1162fee7fa'  #이명훈
        self.API_KEY3 = '0bc1cc3aff43418ba35488984b6742a4'  #최범석
        self.API_KEY4 = 'f996355abde44786b91bdef6bc92ee62'  #이명훈2
        self.API_KEY5 = '2533fbe4f09e4d9dbc51905dcd13d4a3'  #최범석2
        # Get the source
        self.tech_newsapi = NewsApiClient(api_key=self.API_KEY1)
        self.sources = self.tech_newsapi.get_sources()
        self.general_newsapi_1 = NewsApiClient(api_key=self.API_KEY2)
        self.general_newsapi_2 = NewsApiClient(api_key=self.API_KEY3)
        self.general_newsapi_3 = NewsApiClient(api_key=self.API_KEY4)
        self.google_newsapi = NewsApiClient(api_key=self.API_KEY5)
        # Make the magazine list
        self.general_magazine1 = [
            "ABC News", "Associated Press", "Business Insider", "CBS News",
            "CNN"
        ]
        self.general_magazine2 = [
            "Mashable", "NBC News", "The New York Times", "Reuters",
            "The Economist"
        ]
        self.general_magazine3 = [
            "The Washington Post", "The Washington Times", "Time", "USA Today"
        ]
        self.tech_magazine = [
            "Ars Technica", "Engadget", "Hacker News", "TechCrunch",
            "TechRader", "The Next Web", "The Verge", "Wired"
        ]
        self.today = datetime.date.today()
        self.start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
        self.end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d")
        self.timedelta = int((self.end_date - self.start_date).days) + 1
        # company_list
        self.cor_list = pd.read_csv(
            './company_data/Company.csv')['Name'].tolist()
        if os.path.exists('./source/') == False:
            os.mkdir('./source')
        if os.path.exists('./source/{}'.format(
                self.today.strftime("%Y-%m-%d"))) == False:
            os.mkdir('./source/{}'.format(self.today.strftime("%Y-%m-%d")))
        if os.path.exists('./backup/') == False:
            os.mkdir('./backup')
        if os.path.exists('./backup/{}'.format(
                self.today.strftime("%Y-%m-%d"))) == False:
            os.mkdir('./backup/{}'.format(self.today.strftime("%Y-%m-%d")))
        print("news_crawler start! From: {}, to: {}, {}days".format(
            self.start_date.strftime("%Y-%m-%d"),
            self.end_date.strftime("%Y-%m-%d"), self.timedelta))

    # Get the magazine information
    def make_magazine(self, mode="tech"):
        if mode == "tech":
            magazine = []
            id_list = []
            for s in self.sources['sources']:
                if s['name'] in self.tech_magazine:
                    magazine.append(s)
            for m in magazine:
                id_list.append(m['id'])
        elif mode == "general":
            magazine_1 = list()
            magazine_2 = list()
            magazine_3 = list()
            general_magazine_dict = dict()
            for s in self.sources['sources']:
                if s['name'] in self.general_magazine1:
                    magazine_1.append(s)
                    general_magazine_dict['general_magazine1'] = magazine_1
                elif s['name'] in self.general_magazine2:
                    magazine_2.append(s)
                    general_magazine_dict['general_magazine2'] = magazine_2
                elif s['name'] in self.general_magazine3:
                    magazine_3.append(s)
                    general_magazine_dict['general_magazine3'] = magazine_3
            id_1 = list()
            id_2 = list()
            id_3 = list()
            id_list = dict()
            for gm in [
                    'general_magazine1', 'general_magazine2',
                    'general_magazine3'
            ]:
                print(gm)
                for m in general_magazine_dict[gm]:
                    if gm == 'general_magazine1':
                        id_1.append(m['id'])
                        id_list[gm] = id_1
                    elif gm == 'general_magazine2':
                        id_2.append(m['id'])
                        id_list[gm] = id_2
                    elif gm == 'general_magazine3':
                        id_3.append(m['id'])
                        id_list[gm] = id_3
        # Get the magazine id
        return id_list

    def make_tech_url_list(self):
        # newsapi.get_everything() parameters
        # q: Keywords or phrases to search for
        # sources: A comma-separated string of identifiers (maximum 20) for the news sources or blogs you want headlines from
        # from_param: A date and optional time for the oldest article allowed. default: the oldest according to your plan
        # to: A date and optional time for the newest article allowed. default: the newest according to your plan
        # sort_by: The order to sort the articles in. Possible options: relevancy, popularity, publishedAt
        # page_size: The number of results to return per page. 20 is the default, 100 is the maximum
        # page: Use this to page through the results
        start_time = time.time()
        # Make the empty final data frame
        id_list = self.make_magazine(mode="tech")
        total_df = pd.DataFrame(
            columns=["Magazine", "Date", "Author", "Title", "Url"])
        for id in id_list:
            print(id)
            # Make the empty backup data frame
            backup_df = pd.DataFrame(
                columns=["Magazine", "Date", "Author", "Title", "Url"])
            for i in range(0, self.timedelta):
                date = self.start_date + datetime.timedelta(i)
                date = date.strftime("%Y-%m-%d")
                print(date)
                articles = self.tech_newsapi.get_everything(sources=id,
                                                            from_param=date,
                                                            to=date,
                                                            language="en",
                                                            page_size=100,
                                                            page=1)
                for a in articles['articles']:
                    total_df = total_df.append(
                        {
                            "Magazine": id,
                            "Date": a['publishedAt'],
                            "Author": a['author'],
                            "Title": a['title'],
                            "Url": a['url']
                        },
                        ignore_index=True)
                    backup_df = backup_df.append(
                        {
                            "Magazine": id,
                            "Date": a['publishedAt'],
                            "Author": a['author'],
                            "Title": a['title'],
                            "Url": a['url']
                        },
                        ignore_index=True)
            backup_df.to_csv("./backup/{0}/{0}_{1}.csv".format(
                self.today.strftime("%Y-%m-%d"), id),
                             index=False)
        total_df.to_csv("./source/{}/{}_techurl.csv".format(
            self.today.strftime("%Y-%m-%d"), self.today.strftime("%Y%m%d")),
                        index=False,
                        encoding='utf-8')
        end_time = time.time()
        return "success time:{}".format(end_time - start_time)

    def make_general_url_list(self):
        start_time = time.time()
        # newsapi.get_everything() parameters
        # q: Keywords or phrases to search for
        # sources: A comma-separated string of identifiers (maximum 20) for the news sources or blogs you want headlines from
        # from_param: A date and optional time for the oldest article allowed. default: the oldest according to your plan
        # to: A date and optional time for the newest article allowed. default: the newest according to your plan
        # sort_by: The order to sort the articles in. Possible options: relevancy, popularity, publishedAt
        # page_size: The number of results to return per page. 20 is the default, 100 is the maximum
        # page: Use this to page through the results

        # Make the empty final data frame
        start_date = self.start_date.strftime("%Y-%m-%d")
        end_date = self.end_date.strftime("%Y-%m-%d")
        print("{}~{}".format(start_date, end_date))
        id_dict = self.make_magazine(mode="general")
        total_df = pd.DataFrame(
            columns=["Magazine", "Date", "Author", "Title", "Url", "Company"])
        for gm in [
                'general_magazine1', 'general_magazine2', 'general_magazine3'
        ]:
            id_list = id_dict[gm]
            if gm == 'general_magazine1':
                newsapi = self.general_newsapi_1
            elif gm == 'general_magazine2':
                newsapi = self.general_newsapi_2
            elif gm == 'general_magazine3':
                newsapi = self.general_newsapi_3
            for id in id_list:
                print("Magazine : ", id)
                # Make the empty backup data frame
                backup_df = pd.DataFrame(columns=[
                    "Magazine", "Date", "Author", "Title", "Url", "Company"
                ])
                for query in self.cor_list:
                    print(query)
                    articles = newsapi.get_everything(sources=id,
                                                      q=query,
                                                      from_param=start_date,
                                                      to=end_date,
                                                      language="en",
                                                      page_size=100,
                                                      page=1)
                    for a in articles['articles']:
                        total_df = total_df.append(
                            {
                                "Magazine": id,
                                "Date": a['publishedAt'],
                                "Author": a['author'],
                                "Title": a['title'],
                                "Url": a['url'],
                                "Company": query
                            },
                            ignore_index=True)
                        backup_df = backup_df.append(
                            {
                                "Magazine": id,
                                "Date": a['publishedAt'],
                                "Author": a['author'],
                                "Title": a['title'],
                                "Url": a['url'],
                                "Company": query
                            },
                            ignore_index=True)
                backup_df.to_csv("./backup/{0}/{0}_{1}.csv".format(
                    self.today.strftime("%Y-%m-%d"), id),
                                 index=False)
        total_df.to_csv("./source/{}/{}_genurl.csv".format(
            self.today.strftime("%Y-%m-%d"), self.today.strftime("%Y%m%d")),
                        index=False,
                        encoding='utf-8')
        end_time = time.time()
        return "success time:{}".format(end_time - start_time)

    # crawl the google-news urls
    def make_google_url_list(self):
        start_time = time.time()
        # newsapi.get_everything() parameters
        # q: Keywords or phrases to search for
        # sources: A comma-separated string of identifiers (maximum 20) for the news sources or blogs you want headlines from
        # from_param: A date and optional time for the oldest article allowed. default: the oldest according to your plan
        # to: A date and optional time for the newest article allowed. default: the newest according to your plan
        # sort_by: The order to sort the articles in. Possible options: relevancy, popularity, publishedAt
        # page_size: The number of results to return per page. 20 is the default, 100 is the maximum
        # page: Use this to page through the results

        # Make the empty final data frame
        start_date = self.start_date.strftime("%Y-%m-%d")
        end_date = self.end_date.strftime("%Y-%m-%d")
        print("{}~{}".format(start_date, end_date))
        total_df = pd.DataFrame(
            columns=["Magazine", "Date", "Author", "Title", "Url"])
        for query in self.cor_list:
            print(query)
            articles = self.google_newsapi.get_everything(
                sources='google-news',
                q=query,
                from_param=start_date,
                to=end_date,
                language="en",
                page_size=100,
                page=1)
            print(len(articles['articles']))
            for a in articles['articles']:
                total_df = total_df.append(
                    {
                        "Magazine": "google_news",
                        "Date": a['publishedAt'],
                        "Author": a['author'],
                        "Title": a['title'],
                        "Url": a['url']
                    },
                    ignore_index=True)
        total_df.to_csv("./source/{0}/{0}_googleurl.csv".format(
            self.today.strftime("%Y%m%d")),
                        index=False,
                        encoding='utf-8')
        end_time = time.time()
        return "success time:{}".format(end_time - start_time)
Example #27
from newsapi.newsapi_client import NewsApiClient
import json
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson.natural_language_understanding_v1 import Features, SentimentOptions

authenticator = IAMAuthenticator('cIcEmIFE4K73r7kGwzxbR_M-x1peReu3DM9o0WfgcdlO')
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2019-07-12',
    authenticator=authenticator
)
newsapi = NewsApiClient(api_key="c650b6441dd24b2991ec29a1fd13e76c")
natural_language_understanding.set_service_url('https://api.eu-de.natural-language-understanding.watson.cloud.ibm.com/instances/9d7674b4-e3b9-483d-8d45-be07ba05fc72')
news = newsapi.get_everything(q='Politics', language='en')
for i in news['articles']:
    response=natural_language_understanding.analyze(
        url=i['url'],features=Features(sentiment=SentimentOptions(document=True,targets=None))).get_result()
    print(json.dumps(response, indent=2))
Example #28
from newsapi.newsapi_client import NewsApiClient
import pickle
import pandas as pd
import en_core_web_lg
from spacy.lang.en import punctuation
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

nlp_eng = en_core_web_lg.load()
newsapi = NewsApiClient(api_key='753eb8f70f404f9e8a1a0cb4cce69b2d')
articles = []

for i in range(1, 6):
    temp = newsapi.get_everything(q='coronavirus',
                                  language='en',
                                  from_param='2021-02-25',
                                  to='2021-03-23',
                                  sort_by='relevancy',
                                  page=i)
    articles.append(temp)
# print(articles)

filename = 'articlesCOVID.pck1'
pickle.dump(articles, open(filename, 'wb'))
loaded_model = pickle.load(open(filename, 'rb'))

filepath = 'D:/CPP/Junior Year/Spring 2021/CS 4650/Homework/HW5/articlesCOVID.pck1'
pickle.dump(loaded_model, open(filepath, 'wb'))

dados = []

for i, article in enumerate(articles):
Example #29
class Newsy:

    def __init__(self):

        self.root = tk.Tk()
        self.root.geometry("1920x1080")
        self.root.title("News App")

        self.newsapi = NewsApiClient(api_key=API_KEY)

        self.top_headlines = tk.StringVar()
        self.all_articles = tk.StringVar()

        self.query = None

        self.createAndDisplay()

    def exception_handler(func):

        def wrapper(self, *args, **kwargs):

            try:

                return func(self, *args, **kwargs)

            except Exception:

                self.articles_list.config(
                    text="There was an error in processing your request.")

        return wrapper

    @exception_handler
    def createAndDisplay(self):

        self.articles_list = tk.Label(
            self.root, text="", font=('Helvetica', 15))
        self.articles_list.place(
            relx=0, rely=0.25, relheight='0.7', relwidth='1')

        title = tk.Label(self.root, text="News App", font=('Modern', 40))
        title.place(relx=0.6, rely=0.01)

        top_headlines = self.newsapi.get_top_headlines(
            language='en', country='in')
        all_articles = self.newsapi.get_everything(sources='bbc-news,the-verge', domains='bbc.co.uk,techcrunch.com',
                                                   language='en', sort_by='relevancy', page=5)

        top_headlines = [news['url'] for news in top_headlines['articles']]
        all_articles = [news['url'] for news in all_articles['articles']]

        drop = tk.OptionMenu(self.root, self.top_headlines, *top_headlines)
        drop.place(relx=0, rely=0)

        drop2 = tk.OptionMenu(self.root, self.all_articles, *all_articles)
        drop2.place(relx=0, rely=0.1)

        url_button = tk.Button(self.root, text="Select", bd=1,
                               activebackground="#ffee96", command=self.get_url)
        url_button.place(relx=0.8, rely=0)

        headline = tk.Button(self.root, text="India News", bd=1,
                             activebackground="#ffee96", command=self.get_headlines)
        headline.place(relx=0.3, rely=0.2)

        all_news = tk.Button(self.root, text="World News", bd=1,
                             activebackground="#ffee96", command=self.get_news)
        all_news.place(relx=0.4, rely=0.2)

        self.query = tk.Entry(self.root, font=('Courier', 10), bd=0)
        self.query.place(relx=0.5, rely=0.2)

        search = tk.Button(self.root, text="Search", bd=1,
                           activebackground="#ffee96", command=self.search_query)
        search.place(relx=0.65, rely=0.2)

    @exception_handler
    def get_url(self):

        webbrowser.open(self.top_headlines.get())

        webbrowser.open(self.all_articles.get())

    @exception_handler
    def get_headlines(self):

        articles = self.newsapi.get_top_headlines(
            language='en', country='in')

        self.articles_list.config(
            text=(" ".join([news['title'] + "\n" for news in articles['articles']])))

    @exception_handler
    def get_news(self):

        articles = self.newsapi.get_everything(sources='bbc-news,the-verge', domains='bbc.co.uk,techcrunch.com',
                                               language='en', sort_by='relevancy', page=5)

        self.articles_list.config(text=" ".join(
            [news['title'] + "\n" for news in articles['articles']]))

    @exception_handler
    def search_query(self):

        articles = self.newsapi.get_everything(
            q=self.query.get(), language='en')

        self.articles_list.config(
            text=(" ".join([news['title'] + "\n" for news in articles['articles']])))
Example #30
# https://newsapi.org/docs/client-libraries/python

from newsapi.newsapi_client import NewsApiClient

apikey = 'eda663f8-e934-42a5-88e2-bd75014130d1'

newsapi = NewsApiClient(api_key=apikey)

target_lst = [
    'museums', 'united ways', 'development and relief services',
    'advocacy and education', 'children and family services'
]

lst = []
for topic in target_lst:
    res = newsapi.get_everything(q=topic,
                                 language='en',
                                 sort_by='relevancy',
                                 page=2)
    lst.extend(res['articles'])

print(lst)