def discover_trendingconcepts(self, top=20):
    ## api request GetTrendingConcepts
    q = evr.GetTrendingConcepts(
        source="pr", count=top, conceptType=['org'],
        returnInfo=evr.ReturnInfo(conceptInfo=evr.ConceptInfoFlags(
            trendingHistory=True)))
    ret = self.api.execQuery(q)

    ## create dic_concepts: concept uri -> trending history
    dic_concepts = {
        concept['uri']: concept['trendingHistory']['news']
        for concept in ret
    }
    print(dic_concepts.keys())

    ## api request QueryEvents: top event for each trending concept
    lst_events = []
    for concept_uri in dic_concepts:
        print(concept_uri)
        q = evr.QueryEvents(conceptUri=concept_uri)
        q.setRequestedResult(evr.RequestEventsInfo(sortBy="rel", count=1))
        event = self.api.execQuery(q)['events']['results']
        if event:
            event = event[0]
            event.update({
                'search_term': concept_uri.replace('http://en.wikipedia.org/wiki/', '')
            })
            event.update({'trendingHistory': dic_concepts[concept_uri]})
            lst_events.append(event)

    ## collect the enriched events into a dataframe
    dtf_concepts = pd.DataFrame(lst_events)
    return dtf_concepts
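# Usage sketch (assumptions, not from the original snippet): discover_trendingconcepts is
# written as a method that expects `evr` to be the eventregistry package, `pd` to be pandas,
# and `self.api` to be an authenticated EventRegistry client. The wrapper class below
# (TrendingFeed) and the API key placeholder are hypothetical, added only to show one way the
# method could be wired up and called.
import eventregistry as evr
import pandas as pd

class TrendingFeed(object):
    def __init__(self, api_key):
        self.api = evr.EventRegistry(apiKey=api_key)

# attach the function above as a method of the hypothetical wrapper
TrendingFeed.discover_trendingconcepts = discover_trendingconcepts

dtf = TrendingFeed(api_key="YOUR_API_KEY").discover_trendingconcepts(top=10)
print(dtf.head())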
def get_article_list(page, count, source_title, keywords):
    q = eventregistry.QueryArticlesIter(lang="eng",
                                        sourceUri=get_source_uri(source_title),
                                        dateEnd=datetime.datetime.now(),
                                        keywords=keywords,
                                        keywordsLoc="title")
    q.setRequestedResult(
        eventregistry.RequestArticlesInfo(
            page=page, count=count,
            returnInfo=eventregistry.ReturnInfo(
                articleInfo=eventregistry.ArticleInfoFlags(
                    body=False, categories=False, image=True, videos=False))))
    res = er.execQuery(q)

    l = ArticleList()
    article_detail_list = []
    for article in res['articles']['results']:
        a = ArticleDetail()
        a.uri = article['uri']
        a.title = article['title']
        a.source = article['source']['title']
        a.imageUrl = article['image'] if article['image'] else ''
        a.time = article['dateTime']
        article_detail_list.append(a)
    l.articles.extend(article_detail_list)
    return l
def get_article_detail(article_uri):
    q = eventregistry.QueryArticle(article_uri)
    q.setRequestedResult(
        eventregistry.RequestArticleInfo(returnInfo=eventregistry.ReturnInfo(
            articleInfo=eventregistry.ArticleInfoFlags(
                body=True, categories=True, image=True, videos=False))))
    res = er.execQuery(q)

    a_proto = ArticleDetail()
    a_json = res[article_uri]["info"]
    a_proto.title = a_json["title"]
    a_proto.body = a_json["body"]
    a_proto.imageUrl = a_json["image"] if a_json["image"] else ""

    cate_str = ""
    for category in a_json["categories"]:
        cate_str += category["label"].split("/")[-1] + "; "
    a_proto.category = cate_str[:-2]
    return a_proto
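# Usage sketch (assumed setup, not from the original snippets): get_article_list and
# get_article_detail rely on module-level globals: the `eventregistry` and `datetime` imports,
# an authenticated client named `er`, a `get_source_uri` helper, and ArticleList/ArticleDetail
# message classes defined elsewhere (e.g. generated protobufs). The API key, source title, and
# keyword below are illustrative placeholders; getNewsSourceUri is one plausible way to
# implement the assumed get_source_uri helper.
import datetime
import eventregistry

er = eventregistry.EventRegistry(apiKey="YOUR_API_KEY")

def get_source_uri(source_title):
    # resolve a human-readable source name to an Event Registry source URI
    return er.getNewsSourceUri(source_title)

article_list = get_article_list(page=1, count=20,
                                source_title="BBC", keywords="climate")
for summary in article_list.articles:
    detail = get_article_detail(summary.uri)
    print(summary.title, "-", detail.category)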
def get_events_info(events_fnm, out_fnm, api_key):
    """Get info for a list of event URIs."""
    er = connect_to_er(api_key)
    with open(events_fnm) as infile:
        event_uri_res = json.load(infile)
    n_events = event_uri_res["uriList"]["count"]
    event_uri_list = event_uri_res["uriList"]["results"]
    print("Downloading info for %d events" % n_events)
    print("Writing output into %s" % out_fnm)

    loc = 0
    step = 200
    mode = "w"
    if os.path.isfile(out_fnm):
        print("file %s already exists - continuing from last downloaded event" % out_fnm)
        with open(out_fnm) as infile:
            lines = infile.readlines()
        if len(lines) > 0:
            last_event = json.loads(lines[-1])
            # check if last event is in the right location
            try:
                assert event_uri_list[len(lines) - 1] == last_event['info']['uri'], \
                    "last event in file not in correct location"
            except AssertionError:
                pdb.set_trace()
            loc = len(lines)
            mode = "a"
            print("starting at event %d" % loc)

    with open(out_fnm, mode) as outfile:
        while loc < n_events:
            end = min(n_events, loc + step)
            print("\rdownloading info for events: %d - %d" % (loc, end), end="")
            sys.stdout.flush()
            batch = event_uri_list[loc:end]
            query = ER.QueryEvent(batch)
            query.setRequestedResult(
                ER.RequestEventInfo(returnInfo=ER.ReturnInfo(
                    eventInfo=ER.EventInfoFlags(title=True, summary=True,
                                                articleCounts=True, concepts=True,
                                                categories=True, location=True,
                                                date=True, commonDates=False,
                                                stories=False, socialScore=True,
                                                imageCount=0))))
            batch_res = er.execQuery(query)
            # dump events into the file in the same order as they are in the batch
            for event_uri in batch:
                outfile.write(json.dumps(batch_res[event_uri]) + '\n')
            loc = end
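# Usage sketch (paths and the API key are placeholders, not from the original code):
# get_events_info expects events_fnm to be a JSON file of the form
# {"uriList": {"count": N, "results": ["eng-...", ...]}} and appends one JSON line per event
# to out_fnm, resuming from the last written event on re-runs. The `connect_to_er` helper and
# the `ER` alias for the eventregistry package are assumed to come from the surrounding module.
get_events_info(events_fnm="data/event_uris.json",
                out_fnm="data/event_info.jsonl",
                api_key="YOUR_API_KEY")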
def fetch_event_articles(api_key, min_articles=500, force=False,
                         save_on_api_fail=True, csv_file=None):
    event_registry = er.EventRegistry(apiKey=api_key, repeatFailedRequestCount=2)

    # Single query to collect event ids
    all_events_gzip_file = op.join('csv', 'events_min%d.csv' % min_articles) + '.gz'
    if not force and op.exists(all_events_gzip_file):
        df_events = pd.read_csv(all_events_gzip_file, compression='gzip')
    else:
        event_data = []
        qei = er.QueryEventsIter(lang='eng', minArticlesInEvent=min_articles,
                                 maxArticlesInEvent=min_articles * 10)
        for event in qei.execQuery(event_registry, maxItems=1001):
            event_data.append(event)
        df_events = pd.DataFrame(event_data)
        df_events.to_csv(all_events_gzip_file, encoding='utf-8', compression='gzip')
        del event_data

    # Uncache csv file.
    if csv_file and not force and op.exists(csv_file):
        print("Loading articles from disk...")
        df_articles = pd.read_csv(csv_file)
    else:
        event_uris = df_events.uri.tolist()
        event_uris = [ev for ev in event_uris if ev[:3] == 'eng']
        print("Downloading articles for %d events..." % len(event_uris))

        # Loop to retrieve all articles for an event.
        return_info = er.ReturnInfo(articleInfo=er.ArticleInfoFlags(
            bodyLen=-1, concepts=True, categories=True, originalArticle=True))
        all_articles = []
        api_failed = False
        for uri in event_uris:
            print("current uri:", uri)
            current_event_data = []
            event_gzip_file = op.join('csv', 'event-%s.csv.gz' % uri)

            if not force and op.exists(event_gzip_file):
                tmp_df = pd.read_csv(event_gzip_file, compression='gzip')
            elif api_failed:
                print("\tSkipping; API failed.")
                continue
            else:
                try:
                    query_iter = er.QueryEventArticlesIter(uri)
                    for article in query_iter.execQuery(
                            event_registry, lang="eng", returnInfo=return_info):
                        current_event_data.append(article)
                except TypeError:
                    # This is how API errors come through.
                    if save_on_api_fail:
                        print("\tWARNING: API failed. Skipping.")
                        api_failed = True  # end loop; we can't continue.
                        continue
                    else:
                        raise

                # Specify columns, so that we skip any empty events.
                tmp_df = pd.DataFrame(current_event_data, columns=[
                    'body', 'categories', 'concepts', 'date', 'dateTime',
                    'eventUri', 'id', 'isDuplicate', 'lang', 'originalArticle',
                    'sim', 'source', 'time', 'title', 'uri', 'url'
                ])
                tmp_df.to_csv(event_gzip_file, encoding='utf-8', compression='gzip')

            if len(tmp_df) == 0:
                print("WARNING: event contains no articles.")
            # print "shape of df: {}".format(tmp_df.shape)
            # print "unique url: {}".format(len(set(tmp_df['url'])))
            all_articles.append(tmp_df)

        # Combine all news articles into a single dataframe.
        df_articles = pd.concat(all_articles)
        csv_file = csv_file or 'articles-min%d.csv' % min_articles
        df_articles.to_csv(csv_file, encoding='utf-8')

    return df_events, df_articles
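# Usage sketch (placeholder key, path, and directory; not from the original code):
# fetch_event_articles caches one gzipped CSV per event under a local csv/ directory, so the
# sketch creates that directory first. `er` is assumed to be the eventregistry package and
# `op` to be os.path, as used in the function above.
import os

os.makedirs('csv', exist_ok=True)
df_events, df_articles = fetch_event_articles(api_key="YOUR_API_KEY",
                                              min_articles=500,
                                              csv_file='articles-min500.csv')
print(len(df_events), "events,", len(df_articles), "articles")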