Пример #1
0
def get_article_list(page, count, source_title, keywords):
    """Fetch one page of articles and wrap them in an ``ArticleList``.

    The query is restricted to ``lang="eng"``, to the source URI resolved
    from ``source_title`` via ``get_source_uri``, to articles dated up to
    the current local time, and to ``keywords`` looked up with
    ``keywordsLoc="title"``.

    Args:
        page: Page number forwarded to ``RequestArticlesInfo``.
        count: Page size forwarded to ``RequestArticlesInfo``.
        source_title: Title of the news source; converted to a URI with
            ``get_source_uri``.
        keywords: Keywords forwarded unchanged to the query.

    Returns:
        An ``ArticleList`` whose ``articles`` field has been extended with
        one ``ArticleDetail`` per entry in the response.
    """

    def _to_detail(item):
        # Copy the fields of interest from one raw result entry.
        detail = ArticleDetail()
        detail.uri = item['uri']
        detail.title = item['title']
        detail.source = item['source']['title']
        # A falsy image value (e.g. None) is replaced by an empty string.
        detail.imageUrl = item['image'] or ''
        detail.time = item['dateTime']
        return detail

    query = eventregistry.QueryArticlesIter(
        lang="eng",
        sourceUri=get_source_uri(source_title),
        dateEnd=datetime.datetime.now(),
        keywords=keywords,
        keywordsLoc="title",
    )

    # Only the image flag is requested on top of the default article info;
    # body, categories and videos are explicitly switched off.
    info_flags = eventregistry.ArticleInfoFlags(
        body=False, categories=False, image=True, videos=False)
    return_info = eventregistry.ReturnInfo(articleInfo=info_flags)
    query.setRequestedResult(
        eventregistry.RequestArticlesInfo(
            page=page, count=count, returnInfo=return_info))

    response = er.execQuery(query)

    article_list = ArticleList()
    details = [_to_detail(item) for item in response['articles']['results']]
    article_list.articles.extend(details)
    return article_list
Пример #2
0
def get_elon_headlines():
    """Return the titles of articles matching the keywords "Elon Musk".

    The query is executed with ``sortBy="socialScore"`` and
    ``maxItems=10`` against the client returned by ``authenticate()``.

    Returns:
        list: The ``'title'`` value of every article yielded by the query.
    """
    client = authenticate()
    query = er.QueryArticlesIter(keywords="Elon Musk")

    headlines = []
    for item in query.execQuery(client, sortBy="socialScore", maxItems=10):
        headlines.append(item['title'])
    return headlines
    def get_articles(self,
                     keywords=None,
                     concepts=None,
                     categories=None,
                     sources=None,
                     languages=None,
                     date_start=None,
                     date_end=None,
                     sort_by="date",
                     sort_by_asc=False,
                     max_items=-1,
                     save_to_file=None,
                     save_format=None,
                     verbose=False):
        """Get the event registry articles.

        Args:
            keywords (list(str)): The list of keywords the articles
                should contain (Default: None).
            concepts (list(str)): The list of concepts the articles
                should contain (Default: None).
            categories (list(str)): The list of categories the articles
                should be in (Default: None).
            sources (list(str)): The list of sources from which to
                retrieve the articles (Default: None).
            languages (list(str)): The list of languages the articles
                should be written in (Default: None).
            date_start (date): The start date from which the articles
                should be acquired. If None, it starts from the first
                date supported by Event Registry (Default: None).
                NOTE: when `save_to_file` points to an existing, non-empty
                file, this value is replaced by the "date" field of the
                last article stored in that file.
            date_end (date): The end date until which the articles
                should be acquired. If None, it ends at the day of
                collecting (Default: None).
            sort_by (str): The sorting attribute, see https://github.com/EventRegistry/event-registry-python/wiki/Searching-for-articles
                (Default: 'date').
            sort_by_asc (bool): If the documents should be sorted in
                ascending (True) or descending order (False) (Default: False).
            max_items (int): The maximum number of articles to retrieve,
                where -1 means return all matching articles (Default: -1).
            save_to_file (str): The path to which we wish to store the articles.
                If None, the articles are not stored. In addition, if the same
                file is used for multiple queries, the new articles will be
                appended to the existing ones (Default: None).
            save_format (str): The format in which the articles are stored. (Default: None)
                Options:
                    'array' - The articles are wrapped into an array. Should not
                        be used when storing query results into the same file.
                    None - The articles are stored line-by-line in the file.
            verbose (bool): If True, the query parameters are passed to
                `print_query_params` before the query is executed
                (Default: False).

        Returns:
            Iterator: The iterator which goes through all retrieved articles.

        Raises:
            json.JSONDecodeError: If the last non-blank line of an existing
                `save_to_file` is not valid JSON.
            KeyError: If that last stored article has no "date" field.

        """
        # setup the event registry parameters
        er_keywords = ER.QueryItems.AND(keywords) if keywords else None
        er_concepts = (ER.QueryItems.AND(
            [c.get_uri()
             for c in self.get_concepts(concepts)]) if concepts else None)
        er_categories = (ER.QueryItems.AND([
            c.get_uri() for c in self.get_categories(categories)
        ]) if categories else None)
        er_sources = (ER.QueryItems.OR(
            [c.get_uri()
             for c in self.get_sources(sources)]) if sources else None)
        er_lang = ER.QueryItems.OR(languages) if languages else None

        # when saving to an existing file, resume from the date of the last
        # stored article and use it as the start date
        if save_to_file and os.path.isfile(save_to_file):
            last_line = None
            with open(save_to_file) as in_file:
                # stream the file instead of loading every line, and ignore
                # blank lines so a trailing newline does not break parsing
                for line in in_file:
                    if line.strip():
                        last_line = line
            if last_line is not None:
                last_article = json.loads(last_line)
                date_start = last_article["date"]

        if verbose:
            print_query_params({
                "keywords": er_keywords,
                "concepts": er_concepts,
                "categories": er_categories,
                "sources": er_sources,
                "date_start": date_start,
                "date_end": date_end,
                "languages": languages
            })

        # creates the query articles object
        q = ER.QueryArticlesIter(
            keywords=er_keywords,
            conceptUri=er_concepts,
            categoryUri=er_categories,
            sourceUri=er_sources,
            dateStart=date_start,
            dateEnd=date_end,
            lang=er_lang,
        )

        # execute the query and return the iterator
        articles = q.execQuery(self._er,
                               sortBy=sort_by,
                               sortByAsc=sort_by_asc,
                               maxItems=max_items)

        if save_to_file:
            # NOTE(review): `articles` is handed to save_result_in_file before
            # being returned; if that helper iterates it to completion, the
            # caller may receive an already-consumed iterator -- confirm.
            save_result_in_file(articles, save_to_file, save_format)

        # return the articles for other use
        return articles
Пример #4
0
# Set maximum number of articles per day
number_of_articles = 50

# DEFINE df results columns
result = dict()

for company in companies:
    print("- Starting article processing for company :", company)
    # Dictionary
    result.update({company:{}})
    for day in time_frame:
        # QUERY articles related to current company
        print("-- Start article processing for Date: ", day)

        result[company].update({day.strftime('%Y-%m-%d'): []})
        q = ER.QueryArticlesIter(conceptUri=er.getConceptUri(company), lang="eng", dateStart=day.date(),
                                 dateEnd=day.date())
        articles = q.execQuery(er, sortBy=["date", "sourceImportance"], sortByAsc=False, lang=["eng"],
                               returnInfo=ReturnInfo(
                                   articleInfo=ArticleInfoFlags(socialScore=True, originalArticle=True, categories=True,
                                                                concepts=True, sentiment=True, duplicateList=True)),
                               maxItems=number_of_articles, articleBatchSize=50)



        # Iterate over all articles about the current company
        # Calculate Sentiment and save in day`s column and index
        while True:
            try:
                article = next(articles)
            except AssertionError:
                print("Article throws assertion error!")