def discover_trendingconcepts(self, top=20):
        # Request the top trending organisation concepts from press
        # releases, including each concept's trending history.
        q = evr.GetTrendingConcepts(
            source="pr",
            count=top,
            conceptType=['org'],
            returnInfo=evr.ReturnInfo(conceptInfo=evr.ConceptInfoFlags(
                trendingHistory=True)))
        ret = self.api.execQuery(q)
        # Map each concept URI to its trending history in the news.
        dic_concepts = {
            concept['uri']: concept['trendingHistory']['news']
            for concept in ret
        }
        print(dic_concepts.keys())
        # For each trending concept, fetch the single most relevant event.
        lst_events = []
        for concept_uri in dic_concepts:
            print(concept_uri)
            q = evr.QueryEvents(conceptUri=concept_uri)
            q.setRequestedResult(evr.RequestEventsInfo(sortBy="rel", count=1))
            event = self.api.execQuery(q)['events']['results']
            if event:
                event = event[0]
                event.update({
                    'search_term':
                    concept_uri.replace('http://en.wikipedia.org/wiki/', '')
                })
                event.update({'trendingHistory': dic_concepts[concept_uri]})
                lst_events.append(event)
        return lst_events
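The method above relies on a surrounding class that exposes an authenticated client as self.api, with the SDK imported as evr. A minimal sketch of that assumed context (the class name and key handling are hypothetical, not from the original snippet):

import eventregistry as evr

class ConceptDiscovery:
    def __init__(self, api_key):
        # Authenticated client referenced as self.api in the method above.
        self.api = evr.EventRegistry(apiKey=api_key)

    # discover_trendingconcepts(self, top=20) from above would live here.

# Usage sketch:
# events = ConceptDiscovery("YOUR_API_KEY").discover_trendingconcepts(top=10)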
Example #2
def get_article_list(page, count, source_title, keywords):
    # Query English articles from the given source whose titles contain the
    # given keywords, up to the current time.
    q = eventregistry.QueryArticlesIter(lang="eng",
                                        sourceUri=get_source_uri(source_title),
                                        dateEnd=datetime.datetime.now(),
                                        keywords=keywords,
                                        keywordsLoc="title")
    q.setRequestedResult(
        eventregistry.RequestArticlesInfo(
            page=page,
            count=count,
            returnInfo=eventregistry.ReturnInfo(
                articleInfo=eventregistry.ArticleInfoFlags(
                    body=False, categories=False, image=True, videos=False))))
    res = er.execQuery(q)
    article_list = ArticleList()
    article_detail_list = []
    # Copy the fields we need from each returned article into the response.
    for article in res['articles']['results']:
        a = ArticleDetail()
        a.uri = article['uri']
        a.title = article['title']
        a.source = article['source']['title']
        a.imageUrl = article['image'] if article['image'] else ''
        a.time = article['dateTime']
        article_detail_list.append(a)
    article_list.articles.extend(article_detail_list)
    return article_list
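This snippet assumes a module-level er client, protobuf-style ArticleList and ArticleDetail message types, and a get_source_uri helper. A hedged sketch of how the client and helper might look (the helper is a plausible reconstruction, not the original code):

import eventregistry

er = eventregistry.EventRegistry(apiKey="YOUR_API_KEY")

def get_source_uri(source_title):
    # getNewsSourceUri resolves a human-readable source title such as
    # "BBC News" to an Event Registry source URI such as "bbc.co.uk".
    return er.getNewsSourceUri(source_title)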
Example #3
def get_article_detail(article_uri):
    # Fetch full info (body, categories, image) for a single article URI.
    q = eventregistry.QueryArticle(article_uri)
    q.setRequestedResult(
        eventregistry.RequestArticleInfo(returnInfo=eventregistry.ReturnInfo(
            articleInfo=eventregistry.ArticleInfoFlags(
                body=True, categories=True, image=True, videos=False))))
    res = er.execQuery(q)
    a_proto = ArticleDetail()
    a_json = res[article_uri]["info"]
    a_proto.title = a_json["title"]
    a_proto.body = a_json["body"]
    a_proto.imageUrl = a_json["image"] if a_json["image"] else ""
    # Join the last path component of each category label,
    # e.g. "news/Business" -> "Business".
    a_proto.category = "; ".join(
        category["label"].split("/")[-1] for category in a_json["categories"])
    return a_proto
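A possible call site, assuming the same er client is in scope (the article URI below is only a placeholder, not real data):

detail = get_article_detail("123456789")  # placeholder URI
print(detail.title)
print(detail.category)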
Example #4
# Imports assumed by this snippet; connect_to_er is a helper defined elsewhere.
import json
import os
import sys

import eventregistry as ER

def get_events_info(events_fnm, out_fnm, api_key):
    """Get info for list of event URIs."""
    er = connect_to_er(api_key)

    with open(events_fnm) as infile:
        event_uri_res = json.load(infile)
    n_events = event_uri_res["uriList"]["count"]
    event_uri_list = event_uri_res["uriList"]["results"]

    print "Downloading info for %d events" % n_events
    print "Writing output into %s" % out_fnm

    loc = 0
    step = 200

    mode = "w"
    if os.path.isfile(out_fnm):
        print("file %s already exists - continuing from last downloaded event"
              % out_fnm)
        with open(out_fnm) as infile:
            lines = infile.readlines()
            if len(lines) > 0:
                last_event = json.loads(lines[-1])
                # Check that the last saved event matches its position in the
                # URI list before resuming appends.
                assert event_uri_list[len(lines) - 1] == last_event['info']['uri'], \
                    "last event in file not in correct location"
                loc = len(lines)
            mode = "a"
            print("starting at event %d" % loc)

    with open(out_fnm, mode) as outfile:
        while loc < n_events:
            end = min(n_events, loc + step)

            print "\rdownloading info for events: %d - %d" % (loc, end),
            sys.stdout.flush()

            batch = event_uri_list[loc:end]
            query = ER.QueryEvent(batch)
            query.setRequestedResult(
                ER.RequestEventInfo(returnInfo=ER.ReturnInfo(
                    eventInfo=ER.EventInfoFlags(title=True,
                                                summary=True,
                                                articleCounts=True,
                                                concepts=True,
                                                categories=True,
                                                location=True,
                                                date=True,
                                                commonDates=False,
                                                stories=False,
                                                socialScore=True,
                                                imageCount=0))))

            batch_res = er.execQuery(query)

            # dump events into files in the same order as they are in the batch
            for event_uri in batch:
                outfile.write(json.dumps(batch_res[event_uri]) + '\n')

            loc = loc + step
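get_events_info expects events_fnm to contain a raw uriList response from the API. A sketch of how such a file might be produced (the query keyword and file name are illustrative assumptions, and RequestEventsUriList refers to the older SDK this Python 2 snippet targets):

import json
import eventregistry as ER

er = ER.EventRegistry(apiKey="YOUR_API_KEY")
q = ER.QueryEvents(keywords="earthquake")
q.setRequestedResult(ER.RequestEventsUriList())
res = er.execQuery(q)  # {'uriList': {'count': ..., 'results': [...]}}
with open("event_uris.json", "w") as outfile:
    json.dump(res, outfile)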
def fetch_event_articles(api_key,
                         min_articles=500,
                         force=False,
                         save_on_api_fail=True,
                         csv_file=None):

    event_registry = er.EventRegistry(apiKey=api_key,
                                      repeatFailedRequestCount=2)

    # Single query to collect event ids
    all_events_gzip_file = op.join('csv',
                                   'events_min%d.csv' % min_articles) + '.gz'
    if not force and op.exists(all_events_gzip_file):
        df_events = pd.read_csv(all_events_gzip_file, compression='gzip')
    else:
        event_data = []
        qei = er.QueryEventsIter(lang='eng',
                                 minArticlesInEvent=min_articles,
                                 maxArticlesInEvent=min_articles * 10)
        for event in qei.execQuery(event_registry, maxItems=1001):
            event_data.append(event)
        df_events = pd.DataFrame(event_data)
        df_events.to_csv(all_events_gzip_file,
                         encoding='utf-8',
                         compression='gzip')
        del event_data

    # Uncache csv file.
    if not force and csv_file and op.exists(csv_file):
        print("Loading articles from disk...")
        df_articles = pd.read_csv(csv_file)
    else:
        event_uris = df_events.uri.tolist()
        event_uris = [ev for ev in event_uris if ev[:3] == 'eng']
        print("Downloading articles for %d events..." % len(event_uris))

        # Loop to retrieve all articles for an event.
        return_info = er.ReturnInfo(articleInfo=er.ArticleInfoFlags(
            bodyLen=-1, concepts=True, categories=True, originalArticle=True))

        all_articles = []
        api_failed = False
        for uri in event_uris:
            print "current uri: ", uri
            current_event_data = []

            event_gzip_file = op.join('csv', 'event-%s.csv.gz' % uri)
            if not force and op.exists(event_gzip_file):
                tmp_df = pd.read_csv(event_gzip_file, compression='gzip')
            elif api_failed:
                # A previous request already failed; don't hit the API again.
                print("\tSkipping; API failed.")
                continue
            else:
                try:
                    query_iter = er.QueryEventArticlesIter(uri)
                    for article in query_iter.execQuery(
                            event_registry, lang="eng",
                            returnInfo=return_info):
                        current_event_data.append(article)
                except TypeError:
                    # This is how API errors come through.
                    if save_on_api_fail:
                        print("\tWARNING: API failed. Skipping.")
                        api_failed = True  # end loop; we can't continue.
                        continue
                    else:
                        raise

                # Specify columns, so that we skip any empty events.
                tmp_df = pd.DataFrame(current_event_data,
                                      columns=[
                                          'body', 'categories', 'concepts',
                                          'date', 'dateTime', 'eventUri', 'id',
                                          'isDuplicate', 'lang',
                                          'originalArticle', 'sim', 'source',
                                          'time', 'title', 'uri', 'url'
                                      ])
                tmp_df.to_csv(event_gzip_file,
                              encoding='utf-8',
                              compression='gzip')

            if len(tmp_df) == 0:
                print("WARNING: event contains no articles.")
            all_articles.append(tmp_df)

        # Combine all news articles into a single dataframe.
        df_articles = pd.concat(all_articles)
        csv_file = csv_file or 'articles-min%d.csv' % min_articles
        df_articles.to_csv(csv_file, encoding='utf-8')

    return df_events, df_articles
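fetch_event_articles assumes the aliases used throughout the snippet: er for eventregistry, op for os.path, pd for pandas, plus a csv/ directory for its cache files. A sketch of the assumed setup and a call (the API key and cache path are placeholders):

import os
import os.path as op

import eventregistry as er
import pandas as pd

# Ensure the cache directory the function writes to exists.
os.makedirs('csv', exist_ok=True)
df_events, df_articles = fetch_event_articles(api_key="YOUR_API_KEY",
                                              csv_file='articles-min500.csv')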