import datetime

import eventregistry

# `er` (an authenticated eventregistry.EventRegistry client), `get_source_uri`,
# `ArticleList` and `ArticleDetail` are defined elsewhere in this module.


def get_article_list(page, count, source_title, keywords):
    # Build a paged article query restricted to a single source, with the
    # keywords matched against the article title only.
    q = eventregistry.QueryArticles(
        lang="eng",
        sourceUri=get_source_uri(source_title),
        dateEnd=datetime.datetime.now(),
        keywords=keywords,
        keywordsLoc="title")
    q.setRequestedResult(
        eventregistry.RequestArticlesInfo(
            page=page,
            count=count,
            returnInfo=eventregistry.ReturnInfo(
                articleInfo=eventregistry.ArticleInfoFlags(
                    body=False, categories=False, image=True, videos=False))))
    res = er.execQuery(q)

    # Map the raw result dicts onto the ArticleDetail/ArticleList message types.
    article_list = ArticleList()
    article_detail_list = []
    for article in res['articles']['results']:
        a = ArticleDetail()
        a.uri = article['uri']
        a.title = article['title']
        a.source = article['source']['title']
        a.imageUrl = article['image'] if article['image'] else ''
        a.time = article['dateTime']
        article_detail_list.append(a)
    article_list.articles.extend(article_detail_list)
    return article_list
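# The helper get_source_uri() used above is not included in this snippet. A
# minimal sketch of what it might look like, assuming it simply resolves a
# human-readable source title to an Event Registry source URI through the
# shared `er` client (the fallback to an empty string is illustrative, not
# the original implementation):
def get_source_uri(source_title):
    # getNewsSourceUri returns the URI of the best-matching news source,
    # e.g. "bbc.co.uk" for "BBC", or None if nothing matches.
    return er.getNewsSourceUri(source_title) or ''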
import eventregistry as er

# authenticate() is defined elsewhere and is expected to return an
# authenticated eventregistry.EventRegistry client.


def get_elon_headlines():
    """Pull the ten most shared recent headlines about Elon Musk."""
    event_reg = authenticate()
    query = er.QueryArticlesIter(keywords="Elon Musk")
    # execQuery returns an iterator of article dicts; keep only the titles.
    return [
        news['title']
        for news in query.execQuery(event_reg, sortBy="socialScore", maxItems=10)
    ]
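# authenticate() itself is not shown in this snippet. A minimal sketch, assuming
# it only builds an EventRegistry client from an API key held in an environment
# variable (the variable name EVENTREGISTRY_API_KEY is illustrative):
import os

def authenticate():
    # EventRegistry(apiKey=...) creates the client passed to execQuery above.
    return er.EventRegistry(apiKey=os.environ["EVENTREGISTRY_API_KEY"])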
# Module-level dependencies assumed by this method: os, json, eventregistry
# imported as ER, plus the helpers print_query_params() and
# save_result_in_file() defined elsewhere in the package.
def get_articles(self, keywords=None, concepts=None, categories=None,
                 sources=None, languages=None, date_start=None, date_end=None,
                 sort_by="date", sort_by_asc=False, max_items=-1,
                 save_to_file=None, save_format=None, verbose=False):
    """Get the Event Registry articles.

    Args:
        keywords (list(str)): The list of keywords the articles should contain (Default: None).
        concepts (list(str)): The list of concepts the articles should contain (Default: None).
        categories (list(str)): The list of categories the articles should be in (Default: None).
        sources (list(str)): The list of sources from which to retrieve the articles (Default: None).
        languages (list(str)): The list of languages the articles should be written in (Default: None).
        date_start (date): The start date from which the articles should be acquired. If None, it starts from the first date supported by Event Registry (Default: None).
        date_end (date): The end date until which the articles should be acquired. If None, it ends at the day of collecting (Default: None).
        sort_by (str): The sorting attribute, see https://github.com/EventRegistry/event-registry-python/wiki/Searching-for-articles (Default: 'date').
        sort_by_asc (bool): If the documents should be sorted in ascending (True) or descending (False) order (Default: False).
        max_items (int): The maximum number of articles to retrieve, where -1 means return all matching articles (Default: -1).
        save_to_file (str): The path to which we wish to store the articles. If None, the articles are not stored. If the same file is used for multiple queries, the new articles are appended to the existing ones (Default: None).
        save_format (str): The format in which the articles are stored (Default: None).
            Options:
                'array' - The articles are wrapped into an array. Should not be used when storing query results into the same file.
                None - The articles are stored line-by-line in the file.

    Returns:
        Iterator: The iterator which goes through all retrieved articles.

    """
    # set up the Event Registry query parameters
    er_keywords = ER.QueryItems.AND(keywords) if keywords else None
    er_concepts = (ER.QueryItems.AND(
        [c.get_uri() for c in self.get_concepts(concepts)])
        if concepts else None)
    er_categories = (ER.QueryItems.AND(
        [c.get_uri() for c in self.get_categories(categories)])
        if categories else None)
    er_sources = (ER.QueryItems.OR(
        [c.get_uri() for c in self.get_sources(sources)])
        if sources else None)
    er_lang = ER.QueryItems.OR(languages) if languages else None

    # when appending to an existing file, resume from the date of the
    # last stored article instead of the requested start date
    if save_to_file and os.path.isfile(save_to_file):
        with open(save_to_file) as in_file:
            # get all lines
            lines = in_file.readlines()
            if len(lines) > 0:
                last_article = json.loads(lines[-1])
                date_start = last_article["date"]

    if verbose:
        print_query_params({
            "keywords": er_keywords,
            "concepts": er_concepts,
            "categories": er_categories,
            "sources": er_sources,
            "date_start": date_start,
            "date_end": date_end,
            "languages": languages
        })

    # create the query articles object
    q = ER.QueryArticlesIter(
        keywords=er_keywords,
        conceptUri=er_concepts,
        categoryUri=er_categories,
        sourceUri=er_sources,
        dateStart=date_start,
        dateEnd=date_end,
        lang=er_lang,
    )
    # execute the query and return the iterator
    articles = q.execQuery(self._er,
                           sortBy=sort_by,
                           sortByAsc=sort_by_asc,
                           maxItems=max_items)

    if save_to_file:
        save_result_in_file(articles, save_to_file, save_format)

    # return the articles for other use
    return articles
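# A usage sketch for get_articles(). The variable `collector` stands for an
# instance of whatever class defines the method above (the class itself is not
# shown in this snippet), and the query values and file path are illustrative:
from datetime import date

articles = collector.get_articles(
    keywords=["climate", "energy"],
    languages=["eng"],
    date_start=date(2020, 1, 1),
    date_end=date(2020, 1, 31),
    max_items=100,
    save_to_file="articles.jsonl",  # new results are appended on repeat runs
    verbose=True,
)
for article in articles:
    print(article["title"])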
import eventregistry as ER
from eventregistry import ReturnInfo, ArticleInfoFlags

# `er` is an authenticated ER.EventRegistry client; `companies` and `time_frame`
# (one datetime per day) are defined earlier in the script.

# Set the maximum number of articles per company per day
number_of_articles = 50

# Define the results container (later turned into DataFrame columns):
# {company: {date string: [...]}}
result = dict()

for company in companies:
    print("- Starting article processing for company:", company)
    result.update({company: {}})

    for day in time_frame:
        # Query articles related to the current company on the current day
        print("-- Start article processing for date:", day)
        result[company].update({day.strftime('%Y-%m-%d'): []})
        q = ER.QueryArticlesIter(conceptUri=er.getConceptUri(company),
                                 lang="eng",
                                 dateStart=day.date(),
                                 dateEnd=day.date())
        articles = q.execQuery(
            er,
            sortBy=["date", "sourceImportance"],
            sortByAsc=False,
            returnInfo=ReturnInfo(
                articleInfo=ArticleInfoFlags(socialScore=True,
                                             originalArticle=True,
                                             categories=True,
                                             concepts=True,
                                             sentiment=True,
                                             duplicateList=True)),
            maxItems=number_of_articles,
            articleBatchSize=50)

        # Iterate over all articles about the current company:
        # calculate the sentiment and save it in the day's column and index
        while True:
            try:
                article = next(articles)
            except AssertionError:
                print("Article throws assertion error!")
                continue
            except StopIteration:
                # no more articles for this company/day
                break
            # ... sentiment calculation and per-day saving (not shown here)
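# One plausible way to aggregate the sentiment per day once the articles have
# been collected; using the article-level 'sentiment' field returned by Event
# Registry and averaging it is an assumption for illustration, not the
# original implementation:
def daily_mean_sentiment(day_articles):
    """Average the Event Registry 'sentiment' scores of one day's articles."""
    scores = [a["sentiment"] for a in day_articles if a.get("sentiment") is not None]
    return sum(scores) / len(scores) if scores else None

# Example: append each consumed article to its day's list, then aggregate, e.g.
#   result[company][day.strftime('%Y-%m-%d')].append(article)
#   ...
#   mean = daily_mean_sentiment(result[company][day.strftime('%Y-%m-%d')])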