예제 #1
0
class Scraper:
    '''
    Downloads newsapi articles, optionally narrowed by category and language.

    Example::

        x = Scraper(api_key='xyz')
        print(x.scrape_all_articles(language='en'))
    '''

    # Class-level placeholders; every instance replaces them in __init__.
    articles = None
    sources = None
    api_key = None

    def __init__(self, api_key) -> None:
        '''
        Creates the Articles and Sources clients for the given key
        :param api_key: API key handed to both clients
        '''
        super().__init__()
        self.api_key = api_key
        self.articles = Articles(API_KEY=self.api_key)
        self.sources = Sources(API_KEY=self.api_key)

    def scrape_articles_for_sources(self, sources):
        '''
        Accepts the list of source names and returns all articles downloaded from the given sources.
        A source whose download fails is skipped; the remaining sources are still processed.
        :param sources: Iterable of source id's
        :return: List of article json objects, containing:
            'author', 'title', 'description', 'url', 'urlToImage', 'publishedAt'
        '''
        articles = []
        for source in sources:
            try:
                # list of json objects
                # author, title, description, url, urlToImage, publishedAt
                articles_for_source = self.articles.get(source=source).articles
            except Exception:  # e.g. the server does not respond
                # Exception (not BaseException) so Ctrl-C / SystemExit still
                # stop a long scrape instead of being silently swallowed.
                continue
            articles.extend(articles_for_source)
        return articles

    def scrape_sources(self, categories=None, language=None):
        '''
        Gets the newsapi sources associated with the given categories (optional) and language (optional)
        :param categories: List of categories (optional); when omitted or empty,
            a single query without a category filter is made
        :param language: Language (optional)
        :return: Set of source id's
        '''
        # No category requested -> one unfiltered query. ``None`` is forwarded
        # for the category exactly like the default ``language`` already is.
        wanted_categories = categories or [None]
        source_ids = set()
        for category in wanted_categories:
            for source in self.sources.get(category, language).sources:
                source_ids.add(source['id'])
        return source_ids

    def scrape_all_articles(self, categories=None, language=None):
        '''
        Scrapes and returns all articles for the given categories and language (parameters are optional)
        :param categories: list of categories (optional)
        :param language: language (optional)
        :return: List of article json objects, containing:
            'author', 'title', 'description', 'url', 'urlToImage', 'publishedAt'
        '''
        return self.scrape_articles_for_sources(
            self.scrape_sources(categories, language))
예제 #2
0
def news():
    """Render ``news.html`` with five article descriptions and a prediction for each.

    Steps, in order:

    1. load the pickled training texts (``X.pkl``) and labels (``y.pkl``);
    2. fit a bag-of-words ``CountVectorizer`` and a ``LogisticRegression`` on them;
    3. download the "top" articles of the ``abc-news-au`` source;
    4. clean and classify the first five descriptions.

    :return: the rendered template, given ``des0``..``des4`` (descriptions)
        and ``pred0``..``pred4`` (the matching ``clf.predict`` results)
    :raises KeyError: presumably, when fewer than five descriptions are
        available (``description[i]`` is a label lookup) -- TODO confirm
    """
    # NOTE(review): pickle.load can execute arbitrary code; these files must
    # come from a trusted location.
    with open('X.pkl', 'rb') as f:
        X = pickle.load(f)
    with open('y.pkl', 'rb') as f:
        y = pickle.load(f)

    # Bag-of-words features for the training texts.
    count_vectorizer = CountVectorizer()
    X = count_vectorizer.fit_transform(X)  # Fit the Data

    # Logistic-regression classifier trained on ALL samples. The previous
    # train_test_split(..., test_size=0.0) is rejected by current scikit-learn
    # (a float test_size must lie strictly between 0 and 1) and its test half
    # was never used, so the split is dropped.
    clf = LogisticRegression()
    clf.fit(X, y)

    # NOTE(review): hard-coded API key -- consider loading it from configuration.
    apikey = 'c9c0b7a1fc944a02bdadda8c09dace91'
    a = Articles(API_KEY=apikey)
    data = a.get(source="abc-news-au", sort_by='top')
    # Expand each entry of the 'articles' column into columns of its own,
    # placed next to the remaining response columns.
    data = pd.DataFrame.from_dict(data)
    data = pd.concat(
        [data.drop(['articles'], axis=1), data['articles'].apply(pd.Series)],
        axis=1)
    description = data['description']

    def pre(x):
        """Clean one description and return the classifier's prediction for it."""
        data1 = str(x)
        data1 = remove_new_lines(data1)
        data1 = remove_stop_words(data1)
        data1 = strip(data1)
        data1 = remove_weird(data1)
        # The vectorizer expects an iterable of documents, not a bare string.
        data1 = np.array(data1).reshape(-1)
        vect = count_vectorizer.transform(data1)
        return clf.predict(vect)

    # The template expects exactly des0..des4 and pred0..pred4.
    context = {}
    for i in range(5):
        context['des%d' % i] = description[i]
        context['pred%d' % i] = pre(description[i])

    return render_template('news.html', **context)
예제 #3
0
def get_news(sources):
    """Fetch the articles of a newsapi source as generic-template elements.

    :param sources: source id forwarded to ``Articles.get(source=...)``
    :return: list with one ``Template.GenericElement`` per article, carrying
        the article's title, description, URL and image URL plus a single
        'Open in web' button that points at the article URL
    """
    # NOTE(review): hard-coded API key -- consider loading it from configuration.
    NEWS_API_KEY = '1bae2e39f2b540f3a15dbbcb269eba9b'
    articles = Articles(API_KEY=NEWS_API_KEY)
    info = articles.get(source=sources)
    # Iterate the articles directly; the previously computed but unused
    # values (an empty scratch list and 'publishedAt') are gone.
    return [
        Template.GenericElement(
            title=article['title'],
            subtitle=article['description'],
            item_url=article['url'],
            image_url=article['urlToImage'],
            buttons=[Template.ButtonWeb(title='Open in web',
                                        url=article['url'])])
        for article in info['articles']
    ]
from newsapi.articles import Articles
from newsapi.sources import Sources
# One API key shared by the article client and the source client.
key = '96af62a035db45bda517a9ca62a25ac3'
a = Articles(API_KEY=key)
s = Sources(API_KEY=key)

# Ask for every source offered by newsapi.
s.all()

# Articles of a single source, then sources narrowed by category,
# language and country.
a.get(source='the-new-york-times')
s.get(category='technology', language='en', country='US')

from newsapi import NewsAPI

# API key handed to the NewsAPI wrapper below.
key = '96af62a035db45bda517a9ca62a25ac3'
# The same (empty) parameter dict is passed to both calls below.
params = {}
api = NewsAPI(key)
# presumably a list of source dicts -- the next line reads [0]['id'] from it
sources = api.sources(params)
# Articles of the first returned source.
articles = api.articles(sources[0]['id'], params)

################ NY Times API #############################################

import sys, csv, json
# Python 2 only: force UTF-8 as the process-wide default string encoding.
# ``reload(sys)`` is what makes ``sys.setdefaultencoding`` reachable again on
# Python 2. On Python 3 neither name exists (the bare ``reload`` call raised
# NameError) and the default encoding is already UTF-8, so the block is skipped.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf8')
"""
About:
Python wrapper for the New York Times Archive API 
https://developer.nytimes.com/article_search_v2.json
"""


class APIKeyException(Exception):
    def __init__(self, message):
예제 #5
0
import newsapi
import requests
import json
import os

from newsapi.articles import Articles
from newsapi.sources import Sources

# One key shared by the article client and the source client.
_API_KEY = "537b165a4f314fedae8cb39788d4d713"
a = Articles(API_KEY=_API_KEY)
s = Sources(API_KEY=_API_KEY)

# One request per outlet, issued in this order; each name receives the
# 'articles' entry of the matching response.
res, bbc, telegraph, guardian, independent, sports = (
    a.get(source=_source_id)['articles']
    for _source_id in (
        "daily-mail",
        "bbc-news",
        "the-telegraph",
        "the-guardian-uk",
        "independent",
        "the-sport-bible",
    )
)

# results = s.get_by_country("gb").sources
# # s.get_by_category("politics")

#resultsString = ''.join(str(e) for e in results)

# filename = 'news_stream.py'

# with open(filename, 'a') as file:
#     for result in independent:
#         print(result['title'])
#         # If you want other things from the tweet object you can specify it here
#         file.write(result['title'] + os.linesep)
예제 #6
0
class ReporterModule(BaseModule):
    """Assistant module that picks a newsapi source and summarises its articles.

    The user is questioned through ``self.assistant`` (``say`` / ``listen``);
    articles and source metadata come from the ``Articles`` and ``Sources``
    clients created in ``__init__``.
    """

    # Replies (upper-cased before comparison) understood as "yes" / "no".
    AFFIRMATIVE = ["YES", "YEAH", "SURE", "YAH", "YA"]
    NEGATIVE = ["NO", "NEGATIVE", "NAH", "NA", "NOPE"]

    def __init__(self, *args):
        """Read the module configuration and create the newsapi clients.

        Without a configured ``newsapi.org_key`` a hint is printed and the
        clients are left as ``None``. ``__init__`` must not return a value:
        the former ``return False`` raised ``TypeError`` on construction.

        :param args: forwarded unchanged to ``BaseModule.__init__``
        """
        super(ReporterModule, self).__init__(*args)
        self.API_KEY = self.get_configuration("newsapi.org_key")
        # A missing/empty limit becomes 0, which get_response treats as
        # "use the caller's threshold".
        limit = self.get_configuration("news_limit")
        self.threshold = int(limit) if limit else 0
        self.sources_url = {}
        if not self.API_KEY:
            print(
                "Kindly look back at the documentation to configure news module properly especially the API keys."
            )
            self.articles = None
            self.sources = None
            return
        self.articles = Articles(self.API_KEY)
        self.sources = Sources(self.API_KEY)
        # presumably preloads the source metadata used by the lookups below
        # -- TODO confirm against newsapi.sources
        self.sources.information()

    def get_all_categories(self):
        """Return the categories reported by the sources client as a list."""
        return list(self.sources.all_categories())

    def get_by_category(self, category):
        """Return (and cache in ``self.sources_url``) ``{name: url}`` for a category.

        :param category: category handed to ``Sources.get_by_category``
        """
        srcs = self.sources.get_by_category(category).sources
        self.sources_url = {src['name']: src['url'] for src in srcs}
        return self.sources_url

    def get_sort_bys_of_source(self, source_name):
        """Return ``sortBysAvailable`` of the first search hit for ``source_name``."""
        return self.sources.search(source_name)[0]['sortBysAvailable']

    def all_sources(self):
        """Return (and cache in ``self.sources_url``) all source names."""
        self.sources_url = self.sources.all_names()
        return self.sources_url

    @staticmethod
    def _source_menu(intro, names):
        """Build the prompt that lists ``names`` and asks the user to pick one."""
        listing = "".join(" %s," % name for name in names)
        return intro + listing + " which one would you like to pick?"

    def get_news(self):
        """Ask for category, source and sort order, then build the news report.

        :return: the text produced by ``get_response`` for the chosen source
        """
        # --- category ------------------------------------------------------
        self.assistant.say(
            "Would you prefer any specific category? If yes then what would it be?"
        )
        category_status = self.assistant.listen().decipher()
        if category_status.upper() in self.NEGATIVE:
            category = False
        else:
            category = self.search(self.get_all_categories(), category_status)

        # --- source --------------------------------------------------------
        self.assistant.say(
            "Any preference you would like to have about source of your news? like CNN "
            "or Time magazine or maybe The hindu?")
        source_status = self.assistant.listen().decipher()
        if source_status.upper() in self.NEGATIVE:
            source = False
        elif category:
            # Offer only the sources of the chosen category.
            sources_available = self.get_by_category(category)
            self.assistant.say(
                self._source_menu("Out of all the sources as follows",
                                  sources_available))
            source_command = self.assistant.listen().decipher()
            source = self.search(list(sources_available), source_command)
        else:
            self.assistant.say(
                "So would you want me to list all the sources around 70 which to be "
                "honest would be a hefty task, so if not, then just let me know of "
                "your source name and I would let you know if it's available or not."
            )
            all_sources_status = self.assistant.listen().decipher()
            sources_available = self.all_sources()
            if all_sources_status.upper() in self.AFFIRMATIVE:
                self.assistant.say(
                    self._source_menu(
                        "Good job, lazy ass, so here are all the available sources as follows",
                        list(sources_available)))
                # The reply to the listing is the source name to look up.
                all_sources_status = self.assistant.listen().decipher()
            source = self.search(list(sources_available), all_sources_status)

        # --- sort order ----------------------------------------------------
        if source:
            sort_bys_available = self.get_sort_bys_of_source(source)
            if len(sort_bys_available) == 1:
                sort_by = sort_bys_available[0]
            else:
                if len(sort_bys_available) == 2:
                    response = "And what kind of news sort would you like? " \
                               "%s or %s?" % (sort_bys_available[0], sort_bys_available[1])
                else:
                    response = "And what kind of news sort would you like? " \
                               "%s or %s, or maybe %s?" % (sort_bys_available[0],
                                                           sort_bys_available[1],
                                                           sort_bys_available[2])
                self.assistant.say(response)
                sort_by_command = self.assistant.listen().decipher()
                sort_by = self.search(sort_bys_available, sort_by_command)
        else:
            self.assistant.say("And what kind of news sort would you like? "
                               "latest or maybe top ones shown in front page?")
            sort_status_command = self.assistant.listen().decipher()
            # The comma after 'popular' was missing, which silently fused the
            # last two options into 'popularlatest' and made 'latest'
            # unreachable.
            sort_by = self.search(['top', 'popular', 'latest'],
                                  sort_status_command)
            # No source chosen: fall back to a default source per sort order.
            default_sources = {"top": "google-news", "latest": "the-telegraph"}
            source = default_sources.get(sort_by.lower(), "time")
        return self.get_response(source, sort_by)

    def handle(self):
        """Build the report for the configured ``news_source`` (no sort order)."""
        source = self.get_configuration("news_source")
        response = self.get_response(source)
        return response

    def get_response(self, source, sort_by=None, threshold=5):
        """Fetch articles of ``source`` and join them into one report string.

        :param source: source name; lower-cased and spaces turned into '-'
        :param sort_by: sort order forwarded to ``Articles.get`` (optional)
        :param threshold: maximum number of articles, used only when no
            non-zero ``news_limit`` was configured
        :return: the report text
        """
        if self.threshold:
            threshold = self.threshold
        source = source.lower().replace(" ", "-")
        articles = self.articles.get(source, sort_by=sort_by).articles
        articles = articles[:threshold]
        # Leave the sort order out instead of saying "None" when none is given.
        sort_label = "%s " % sort_by if sort_by else ""
        parts = ["So the %snews from %s news source are as follows " % (
            sort_label, source)]
        for article in articles:
            if article['title']:
                parts.append("%s, " % article['title'])
            if article['description']:
                parts.append("%s, " % article['description'])
            if article['author']:
                parts.append("was reported by %s. " % article['author'])
            parts.append("and in the other news. ")
        return "".join(parts)

    @staticmethod
    def search(dataset, query):
        """Return the entry of ``dataset`` that best matches ``query``.

        Entry and query are split into words and paired up in order; an
        entry's score is the number of positions at which the paired words
        hold the same character, ignoring case. The first entry with the
        highest score wins.

        :param dataset: list of candidate strings
        :param query: text to match against the candidates
        :return: the best candidate, or ``None`` for an empty ``dataset``
        """
        if not dataset:
            return None
        query_words = query.lower().split()
        scores = []
        for data in dataset:
            score = 0
            # zip() stops at the shorter side, for words and for characters,
            # so the final character of a query word is compared as well.
            for text, wanted in zip(data.lower().split(), query_words):
                score += sum(1 for a, b in zip(text, wanted) if a == b)
            scores.append(score)
        best_index = scores.index(max(scores))
        return dataset[best_index]
예제 #7
0
class ReporterModule(BaseModule):
    """Assistant module that picks a newsapi source and summarises its articles.

    The user is questioned through ``self.assistant`` (``say`` / ``listen``);
    every spoken text is looked up through the translation function ``_``.
    Articles and source metadata come from the ``Articles`` and ``Sources``
    clients created in ``__init__``.
    """

    # Replies (upper-cased before comparison) understood as "yes" / "no".
    AFFIRMATIVE = ["YES", "YEAH", "SURE", "YAH", "YA"]
    NEGATIVE = ["NO", "NEGATIVE", "NAH", "NA", "NOPE"]

    def __init__(self, *args):
        """Read the module configuration and create the newsapi clients.

        Without a configured ``newsapi.org_key`` a hint is printed and the
        clients are left as ``None``. ``__init__`` must not return a value:
        the former ``return False`` raised ``TypeError`` on construction.

        :param args: forwarded unchanged to ``BaseModule.__init__``
        """
        super(ReporterModule, self).__init__(*args)
        self.API_KEY = self.get_configuration("newsapi.org_key")
        # A missing/empty limit becomes 0, which get_response treats as
        # "use the caller's threshold".
        limit = self.get_configuration("news_limit")
        self.threshold = int(limit) if limit else 0
        self.sources_url = {}
        if not self.API_KEY:
            print(_("error.news.configuration"))
            self.articles = None
            self.sources = None
            return
        self.articles = Articles(self.API_KEY)
        self.sources = Sources(self.API_KEY)
        # presumably preloads the source metadata used by the lookups below
        # -- TODO confirm against newsapi.sources
        self.sources.information()

    def get_all_categories(self):
        """Return the categories reported by the sources client as a list."""
        return list(self.sources.all_categories())

    def get_by_category(self, category):
        """Return (and cache in ``self.sources_url``) ``{name: url}`` for a category.

        :param category: category handed to ``Sources.get_by_category``
        """
        srcs = self.sources.get_by_category(category).sources
        self.sources_url = {src['name']: src['url'] for src in srcs}
        return self.sources_url

    def get_sort_bys_of_source(self, source_name):
        """Return ``sortBysAvailable`` of the first search hit for ``source_name``."""
        return self.sources.search(source_name)[0]['sortBysAvailable']

    def all_sources(self):
        """Return (and cache in ``self.sources_url``) all source names."""
        self.sources_url = self.sources.all_names()
        return self.sources_url

    def get_news(self):
        """Ask for category, source and sort order, then build the news report.

        :return: the text produced by ``get_response`` for the chosen source
        """
        # --- category ------------------------------------------------------
        self.assistant.say(_("news.category.ask"))
        category_status = self.assistant.listen().decipher()
        if category_status.upper() in self.NEGATIVE:
            category = False
        else:
            category = self.search(self.get_all_categories(), category_status)

        # --- source --------------------------------------------------------
        self.assistant.say(_("news.sources.ask"))
        source_status = self.assistant.listen().decipher()
        if source_status.upper() in self.NEGATIVE:
            source = False
        elif category:
            # Offer only the sources of the chosen category.
            sources_available = self.get_by_category(category)
            response = _("news.sources.list")
            for source_name in sources_available:
                response += " %s," % source_name
            response += _("news.sources.select")
            self.assistant.say(response)
            source_command = self.assistant.listen().decipher()
            source = self.search(list(sources_available), source_command)
        else:
            self.assistant.say(_("news.sources.all.ask"))
            all_sources_status = self.assistant.listen().decipher()
            sources_available = self.all_sources()
            if all_sources_status.upper() in self.AFFIRMATIVE:
                response = _("news.sources.all")
                for source_name in list(sources_available):
                    response += " %s," % source_name
                response += _("news.sources.select")
                self.assistant.say(response)
                # The reply to the listing is the source name to look up.
                all_sources_status = self.assistant.listen().decipher()
            source = self.search(list(sources_available), all_sources_status)

        # --- sort order ----------------------------------------------------
        if source:
            sort_bys_available = self.get_sort_bys_of_source(source)
            if len(sort_bys_available) == 1:
                sort_by = sort_bys_available[0]
            else:
                if len(sort_bys_available) == 2:
                    response = _("news.sort.two_options").format(
                        sort_bys_available[0], sort_bys_available[1])
                else:
                    response = _("news.sort.three_options").format(
                        sort_bys_available[0],
                        sort_bys_available[1],
                        sort_bys_available[2],
                    )
                self.assistant.say(response)
                sort_by_command = self.assistant.listen().decipher()
                sort_by = self.search(sort_bys_available, sort_by_command)
        else:
            self.assistant.say(_("news.sort.described_options"))
            sort_status_command = self.assistant.listen().decipher()
            # The comma after 'popular' was missing, which silently fused the
            # last two options into 'popularlatest' and made 'latest'
            # unreachable.
            sort_by = self.search(['top', 'popular', 'latest'],
                                  sort_status_command)
            # No source chosen: fall back to a default source per sort order.
            default_sources = {"top": "google-news", "latest": "the-telegraph"}
            source = default_sources.get(sort_by.lower(), "time")
        return self.get_response(source, sort_by)

    def handle(self):
        """Build the report for the configured ``news_source`` (no sort order)."""
        source = self.get_configuration("news_source")
        response = self.get_response(source)
        return response

    def get_response(self, source, sort_by=None, threshold=5):
        """Fetch articles of ``source`` and join them into one report string.

        :param source: source name; lower-cased and spaces turned into '-'
        :param sort_by: sort order forwarded to ``Articles.get`` (optional)
        :param threshold: maximum number of articles, used only when no
            non-zero ``news_limit`` was configured
        :return: the report text
        """
        if self.threshold:
            threshold = self.threshold
        source = source.lower().replace(" ", "-")
        articles = self.articles.get(source, sort_by=sort_by).articles
        articles = articles[:threshold]
        parts = [_("news.report").format(sort_by, source)]
        for article in articles:
            if article['title']:
                parts.append("%s, " % article['title'])
            if article['description']:
                parts.append("%s, " % article['description'])
            if article['author']:
                parts.append(_("news.report.by").format(article['author']))
            parts.append(_("news.report.continue"))
        return "".join(parts)

    @staticmethod
    def search(dataset, query):
        """Return the entry of ``dataset`` that best matches ``query``.

        Entry and query are split into words and paired up in order; an
        entry's score is the number of positions at which the paired words
        hold the same character, ignoring case. The first entry with the
        highest score wins.

        :param dataset: list of candidate strings
        :param query: text to match against the candidates
        :return: the best candidate, or ``None`` for an empty ``dataset``
        """
        if not dataset:
            return None
        query_words = query.lower().split()
        scores = []
        for data in dataset:
            score = 0
            # zip() stops at the shorter side, for words and for characters,
            # so the final character of a query word is compared as well.
            for text, wanted in zip(data.lower().split(), query_words):
                score += sum(1 for a, b in zip(text, wanted) if a == b)
            scores.append(score)
        best_index = scores.index(max(scores))
        return dataset[best_index]
예제 #8
0
import newsapi
import numpy
import pandas as pd
from newsapi.articles import Articles

apikey = '455e01c84ca44ff387187f10f202bed3'
a = Articles(API_KEY=apikey)

# Raw response for the "top" articles of the New York Times source.
data = a.get(source="the-new-york-times", sort_by='top')

# Turn the response into a frame, then spread every entry of the 'articles'
# column into columns of its own, next to the remaining response columns.
data = pd.DataFrame.from_dict(data)
_response_columns = data.drop(['articles'], axis=1)
_article_columns = data['articles'].apply(pd.Series)
data = pd.concat([_response_columns, _article_columns], axis=1)

# Drop the columns that are not needed, so that only the title and the
# description of each article are displayed.
_unused_columns = [
    'status',
    'source',
    'sortBy',
    'author',
    'url',
    'urlToImage',
    'publishedAt',
]
news_df = data.drop(columns=_unused_columns)

# Two separator lines.
_rule = "---------------------------------------------------------------------"
print(_rule)
print(_rule)