def topic_focal_set_sentences_compare(topics_id, focal_sets_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    all_focal_sets = topic_focal_sets(user_mediacloud_key(), topics_id, snapshots_id)
    # need the timespan info, to find the appropriate timespan within each focus
    base_snapshot_timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                         snapshots_id=snapshots_id)
    # if they have a focus selected, we need to find the appropriate overall timespan
    if foci_id is not None:
        timespan = topic_timespan(topics_id, snapshots_id, foci_id, timespans_id)
        base_timespan = None
        for t in base_snapshot_timespans:
            if timespans_match(timespan, t):
                base_timespan = t
    else:
        base_timespan = None
        for t in base_snapshot_timespans:
            if t['timespans_id'] == int(timespans_id):
                base_timespan = t
                logger.info('base timespan = %s', timespans_id)
    if base_timespan is None:
        return json_error_response("Couldn't find the timespan you specified")
    # iterate through to find the one of interest
    focal_set = None
    for fs in all_focal_sets:
        if int(fs['focal_sets_id']) == int(focal_sets_id):
            focal_set = fs
    if focal_set is None:
        return json_error_response('Invalid Focal Set Id')
    # collect the sentence counts for each focus
    for focus in focal_set['foci']:
        # find the matching timespan within this focus
        snapshot_timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                        snapshots_id=snapshots_id, foci_id=focus['foci_id'])
        timespan = None
        for t in snapshot_timespans:
            if timespans_match(t, base_timespan):
                timespan = t
                logger.info('matching in focus %s, timespan = %s', focus['foci_id'], t['timespans_id'])
        if timespan is None:
            return json_error_response("Couldn't find a matching timespan in the " + focus['name'] + " focus")
        data = topic_sentence_counts(user_mediacloud_key(), topics_id, snapshots_id=snapshots_id,
                                     timespans_id=timespan['timespans_id'], foci_id=focus['foci_id'])
        focus['sentence_counts'] = data
    return jsonify(focal_set)
def story_counts_by_snapshot(topics_id):
    user_mc = user_mediacloud_client(user_mediacloud_key())
    snapshots = user_mc.topicSnapshotList(topics_id)
    counts = {}
    for s in snapshots:
        # get the count of stories in the overall timespan for this snapshot
        timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                        snapshots_id=s['snapshots_id'], foci_id=None)
        try:
            total = timespans[0]['story_count']
        except mediacloud.error.MCException:
            total = 0
        except IndexError:
            # this snapshot doesn't have any timespans (ie. it failed to generate correctly)
            total = 0
        # search by tag to find out how many stories were spidered
        spidered = 0
        try:
            spidered = apicache.topic_story_count(
                user_mediacloud_key(), topics_id, snapshots_id=s['snapshots_id'], foci_id=None,
                timespans_id=timespans[0]['timespans_id'],
                q="* AND tags_id_stories:{}".format(TAG_SPIDERED_STORY))['count']
        except mediacloud.error.MCException:
            spidered = 0
        except IndexError:
            # this snapshot doesn't have any timespans (ie. it failed to generate correctly)
            spidered = 0
        seeded = total - spidered
        counts[s['snapshots_id']] = {'total': total, 'spidered': spidered, 'seeded': seeded}
    return jsonify(counts)
def topic_w2v_timespan_embeddings(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # Retrieve embeddings for overall topic
    overall_word_counts = apicache.topic_word_counts(user_mediacloud_key(), topics_id, num_words=50,
                                                     snapshots_id=snapshots_id, timespans_id=None,
                                                     foci_id=foci_id, q=q)
    overall_words = [x['term'] for x in overall_word_counts]
    overall_embeddings = {x['term']: (x['google_w2v_x'], x['google_w2v_y']) for x in overall_word_counts}
    # Retrieve top words for each timespan
    timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id, snapshots_id, foci_id)
    # Retrieve embeddings for each timespan
    jobs = [{
        'api_key': user_mediacloud_key(),
        'topics_id': topics_id,
        'snapshots_id': snapshots_id,
        'foci_id': foci_id,
        'overall_words': overall_words,
        'overall_embeddings': overall_embeddings,
        'q': q,
        'timespan': t,
    } for t in timespans]
    embeddings_by_timespan = _get_all_timespan_embeddings(jobs)
    return jsonify({'list': embeddings_by_timespan})
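# The helper _get_all_timespan_embeddings is referenced above but not defined in this file. A minimal sketch of
# what it plausibly does, assuming the jobs are fanned out over a multiprocessing pool the same way the
# Pool-based variant of this view (below) does; the per-job worker _get_timespan_embeddings named here is
# hypothetical, not the actual implementation:
from multiprocessing import Pool

def _get_all_timespan_embeddings(jobs):
    # process each timespan job in parallel, one job dict per timespan
    pool = Pool(processes=WORD2VEC_TIMESPAN_POOL_PROCESSES)
    embeddings = pool.map(_get_timespan_embeddings, jobs)  # _get_timespan_embeddings: hypothetical worker
    pool.terminate()
    return embeddings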
def get_top_countries_by_story_tag_counts(topics_id, num_countries):
    tag_country_counts = []
    # get the total stories for a topic
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # get the top countries by the story tag counts with the overall timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id)
    overall_timespan = [t for t in timespans if t['period'] == "overall"]
    overall_timespan = next(iter(overall_timespan))
    timespan_query = "timespans_id:{}".format(overall_timespan['timespans_id'])
    top_geo_tags = _cached_topic_tag_counts(user_mediacloud_key(), topics_id, GEO_TAG_SET, GEO_SAMPLE_SIZE,
                                            timespan_query)
    # make sure the geo tag is in the geo_tags whitelist (is a country)
    country_tag_counts = [r for r in top_geo_tags
                          if int(r['tag'].split('_')[1]) in list(COUNTRY_GEONAMES_ID_TO_APLHA3.keys())]
    country_tag_counts = country_tag_counts[:num_countries]
    # for each country, set up the requisite info for the UI
    for tag in country_tag_counts:
        tag_country_counts.append({
            'label': tag['label'],
            'geo_tag': tag['tag'],
            'tags_id': tag['tags_id'],
            'count': tag['count'],
            'pct': float(tag['count']) / float(total_stories),  # story_tag_count / total stories in the topic
        })
    return tag_country_counts
def topic_w2v_timespan_embeddings(topics_id):
    args = {
        'snapshots_id': request.args.get('snapshotId'),
        'foci_id': request.args.get('focusId'),
        'q': request.args.get('q'),
    }
    # Retrieve embeddings for overall topic
    overall_word_counts = topic_word_counts(user_mediacloud_key(), topics_id, num_words=50, **args)
    overall_words = [x['term'] for x in overall_word_counts]
    overall_embeddings = {x['term']: (x['google_w2v_x'], x['google_w2v_y']) for x in overall_word_counts}
    # Retrieve top words for each timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id, args['snapshots_id'],
                                           args['foci_id'])
    # Retrieve embeddings for each timespan
    p = Pool(processes=WORD2VEC_TIMESPAN_POOL_PROCESSES)
    func = partial(grab_timespan_embeddings, user_mediacloud_key(), topics_id, args, overall_words,
                   overall_embeddings)
    ts_embeddings = p.map(func, timespans)
    return jsonify({'list': ts_embeddings})
def get_top_themes_by_story_tag_counts(topics_id, num_themes):
    user_mc_key = user_mediacloud_key()
    nyt_counts = []
    # get the overall timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id)
    overall_timespan = [t for t in timespans if t['period'] == "overall"]
    overall_timespan = next(iter(overall_timespan))
    timespan_query = "timespans_id:{}".format(overall_timespan['timespans_id'])
    # get the top themes by the story counts with the overall timespan
    top_nyt_tags = _cached_topic_tag_counts(user_mediacloud_key(), topics_id, NYT_LABELS_TAG_SET_ID,
                                            TAG_COUNT_SAMPLE_SIZE, timespan_query)
    # get the total stories for a topic
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    top_nyt_tags = top_nyt_tags[:num_themes]
    # for each theme, set up the requisite info for the UI
    for tag in top_nyt_tags:
        nyt_counts.append({
            'label': tag['label'],
            'geo_tag': tag['tag'],
            'tags_id': tag['tags_id'],
            'count': tag['count'],
            'pct': float(tag['count']) / float(total_stories),  # story_tag_count / total stories in the topic
        })
    return nyt_counts
def _find_overall_timespan(topics_id, snapshots_id):
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                      snapshots_id=snapshots_id)
    for timespan in selected_snapshot_timespans:
        if timespan['period'] == 'overall':
            return timespan
    raise RuntimeError('Missing overall timespan in snapshot {} (topic {})!'.format(snapshots_id, topics_id))
def topic_compare_subtopic_top_words(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    selected_focal_sets_id = request.args['focal_sets_id']
    word_count = int(request.args['word_count']) if 'word_count' in request.args else 20
    # first we need to figure out which timespan they are working on
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                      snapshots_id=snapshots_id)
    selected_timespan = None
    for t in selected_snapshot_timespans:
        if t['timespans_id'] == int(timespans_id):
            selected_timespan = t
    try:
        focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id,
                                             selected_focal_sets_id)
    except ValueError:
        return json_error_response('Invalid Focal Set Id')
    timespans = apicache.matching_timespans_in_foci(topics_id, selected_timespan, focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_word_counts(user_mediacloud_key(), topics_id,
                                          timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['top_words'] = data
    # stitch together the counts to download now
    data = []
    headers = [f['name'] for f in focal_set['foci']]
    for idx in range(0, word_count):
        row = {f['name']: "{} ({})".format(f['top_words'][idx]['term'], f['top_words'][idx]['count'])
               for f in focal_set['foci']}
        data.append(row)
    return csv.stream_response(data, headers, 'topic-{}-subtopic-{}-{}-top-words-comparison'.format(
        topics_id, focal_set['name'], selected_focal_sets_id))
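# apicache.matching_timespans_in_foci is used above but not defined in this file. A hedged sketch, assuming it
# mirrors the inline matching loop in topic_focal_set_sentences_compare above (list each focus's timespans and
# keep the one that matches the selected timespan). The 'snapshots_id' key on the timespan dict and the function
# body are assumptions for illustration, not the actual apicache implementation:
def matching_timespans_in_foci(topics_id, timespan_to_match, foci):
    timespans = []
    for focus in foci:
        # find the timespan within this focus that matches the selected one
        snapshot_timespans = apicache.cached_topic_timespan_list(
            user_mediacloud_key(), topics_id, snapshots_id=timespan_to_match['snapshots_id'],
            foci_id=focus['foci_id'])
        match = None
        for t in snapshot_timespans:
            if apicache.is_timespans_match(t, timespan_to_match):
                match = t
        timespans.append(match)
    return timespans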
def topic_compare_subtopic_top_words(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    selected_focal_sets_id = request.args['focal_sets_id']
    word_count = int(request.args['word_count']) if 'word_count' in request.args else 20
    # first we need to figure out which timespan they are working on
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                      snapshots_id=snapshots_id)
    selected_timespan = None
    for t in selected_snapshot_timespans:
        if t['timespans_id'] == int(timespans_id):
            selected_timespan = t
    focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, selected_focal_sets_id)
    timespans = apicache.matching_timespans_in_foci(topics_id, selected_timespan, focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_word_counts(user_mediacloud_key(), topics_id,
                                          timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['top_words'] = data
    # stitch together the counts to download now
    data = []
    headers = [f['name'] for f in focal_set['foci']]
    for idx in range(0, word_count):
        row = {f['name']: u"{} ({})".format(f['top_words'][idx]['term'], f['top_words'][idx]['count'])
               for f in focal_set['foci']}
        data.append(row)
    return csv.stream_response(data, headers, 'topic-{}-subtopic-{}-{}-top-words-comparison'.format(
        topics_id, focal_set['name'], selected_focal_sets_id))
def _find_overall_timespan(topics_id, snapshots_id):
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                      snapshots_id=snapshots_id)
    for timespan in selected_snapshot_timespans:
        if timespan['period'] == 'overall':
            return timespan
    raise RuntimeError('Missing overall timespan in snapshot {} (topic {})!'.format(snapshots_id, topics_id))
def base_snapshot_timespan(topics_id):
    # find the timespan matching this one in the base snapshot (ie. with no foci_id)
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    base_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                  snapshots_id=snapshots_id, foci_id=None)
    timespan = apicache.topic_timespan(topics_id, snapshots_id, foci_id, timespans_id)  # the selected timespan
    for t in base_snapshot_timespans:
        if apicache.is_timespans_match(timespan, t):
            return t
    raise ValueError("Can't find a timespan in the base snapshot matching the one specified")
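# apicache.is_timespans_match (and the bare timespans_match used in the sentence-count variant above) is not
# defined in this file. A minimal sketch, assuming timespan dicts expose 'period', 'start_date', and 'end_date'
# keys, is a plain field comparison; the real helper may compare more or different fields:
def is_timespans_match(timespan_a, timespan_b):
    # two timespans "match" if they cover the same period and date range
    return (timespan_a['period'] == timespan_b['period']
            and timespan_a['start_date'] == timespan_b['start_date']
            and timespan_a['end_date'] == timespan_b['end_date'])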
def topic_timespan_list(topics_id, snapshots_id):
    ignored_snapshots_id, _timespans_id, foci_id, _q = filters_from_args(request.args)
    timespans = apicache.cached_topic_timespan_list(topics_id, snapshots_id, foci_id)
    # add the focal_set type to the timespan so we can use that in the client (ie. decide what to show or not
    # based on what type of focal_set this timespan is part of)
    focal_sets = apicache.topic_focal_sets_list(user_mediacloud_key(), topics_id, snapshots_id)
    for t in timespans:
        for fs in focal_sets:
            for f in fs['foci']:
                if f['foci_id'] == t['foci_id']:
                    t['focal_set'] = fs
                    t['focus'] = f
                    break
    return jsonify({'list': timespans})
def get_topic_info_per_snapshot_timespan(topic_id):
    if not is_user_logged_in():
        local_mc = mc
    else:
        local_mc = user_admin_mediacloud_client()
    snapshots = {
        'list': local_mc.topicSnapshotList(topic_id),
    }
    most_recent_running_snapshot = {}
    overall_timespan = {}
    for snp in snapshots['list']:
        if snp['searchable'] == 1 and snp['state'] == "completed":
            most_recent_running_snapshot = snp
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topic_id,
                                           most_recent_running_snapshot['snapshots_id'])
    for ts in timespans:
        if ts['period'] == "overall":
            overall_timespan = ts
    return {'snapshot': most_recent_running_snapshot, 'timespan': overall_timespan}
def topic_w2v_timespan_embeddings(topics_id):
    args = {
        'snapshots_id': request.args.get('snapshotId'),
        'foci_id': request.args.get('focusId'),
        'q': request.args.get('q'),
    }
    # Retrieve embeddings for overall topic
    overall_word_counts = topic_word_counts(user_mediacloud_key(), topics_id, num_words=50, **args)
    overall_words = [x['term'] for x in overall_word_counts]
    overall_embeddings = {x['term']: (x['google_w2v_x'], x['google_w2v_y']) for x in overall_word_counts}
    # Retrieve top words for each timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id, args['snapshots_id'],
                                           args['foci_id'])
    # Retrieve embeddings for each timespan
    p = Pool(processes=WORD2VEC_TIMESPAN_POOL_PROCESSES)
    func = partial(grab_timespan_embeddings, user_mediacloud_key(), topics_id, args, overall_words,
                   overall_embeddings)
    ts_embeddings = p.map(func, timespans)
    return jsonify({'list': ts_embeddings})
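# grab_timespan_embeddings is referenced via functools.partial above but its body is not shown. From the
# partial(...) call plus p.map(func, timespans), its signature must be
# (api_key, topics_id, args, overall_words, overall_embeddings, timespan). A hedged sketch of a worker with
# that shape, reusing topic_word_counts as the surrounding code does; the body and returned dict layout are
# assumptions for illustration, not the actual implementation:
def grab_timespan_embeddings(api_key, topics_id, args, overall_words, overall_embeddings, timespan,
                             num_words=50):
    # fetch the top words for just this timespan
    word_counts = topic_word_counts(api_key, topics_id, num_words=num_words,
                                    timespans_id=timespan['timespans_id'], **args)
    # keep only words that also appear in the overall topic, so they share embedding coordinates
    words = [w for w in word_counts if w['term'] in overall_words]
    for w in words:
        w['w2v_x'], w['w2v_y'] = overall_embeddings[w['term']]
    return {'timespan': timespan, 'words': words}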
def get_topic_info_per_snapshot_timespan(topic_id):
    local_mc = user_admin_mediacloud_client()
    snapshots = {
        'list': local_mc.topicSnapshotList(topic_id),
    }
    most_recent_running_snapshot = {}
    overall_timespan = {}
    for snp in snapshots['list']:
        if snp['searchable'] == 1 and snp['state'] == "completed":
            most_recent_running_snapshot = snp
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topic_id,
                                           most_recent_running_snapshot['snapshots_id'])
    for ts in timespans:
        if ts['period'] == "overall":
            overall_timespan = ts
    return {'snapshot': most_recent_running_snapshot, 'timespan': overall_timespan}
def get_top_countries_by_story_tag_counts(topics_id, num_countries):
    tag_country_counts = []
    # get the total stories for a topic
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # get the top countries by the story tag counts with the overall timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id)
    overall_timespan = [t for t in timespans if t['period'] == "overall"]
    overall_timespan = next(iter(overall_timespan))
    timespan_query = "timespans_id:{}".format(overall_timespan['timespans_id'])
    top_geo_tags = _cached_topic_tag_counts(user_mediacloud_key(), topics_id, GEO_TAG_SET, GEO_SAMPLE_SIZE,
                                            timespan_query)
    # make sure the geo tag is in the geo_tags whitelist (is a country)
    country_tag_counts = [r for r in top_geo_tags
                          if int(r['tag'].split('_')[1]) in COUNTRY_GEONAMES_ID_TO_APLHA3.keys()]
    country_tag_counts = country_tag_counts[:num_countries]
    # for each country, set up the requisite info for the UI
    for tag in country_tag_counts:
        tag_country_counts.append({
            'label': tag['label'],
            'geo_tag': tag['tag'],
            'tags_id': tag['tags_id'],
            'count': tag['count'],
            'pct': float(tag['count']) / float(total_stories),  # story_tag_count / total stories in the topic
        })
    return tag_country_counts
def topic_focal_set_split_stories_compare(topics_id, focal_sets_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # need the timespan info, to find the appropriate timespan within each focus
    base_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                  snapshots_id=snapshots_id)
    # if they have a focus selected, we need to find the appropriate overall timespan
    if foci_id is not None:
        timespan = apicache.topic_timespan(topics_id, snapshots_id, foci_id, timespans_id)
        base_timespan = None
        for t in base_snapshot_timespans:
            if apicache.is_timespans_match(timespan, t):
                base_timespan = t
    else:
        base_timespan = None
        for t in base_snapshot_timespans:
            if t['timespans_id'] == int(timespans_id):
                base_timespan = t
                logger.info('base timespan = %s', timespans_id)
    if base_timespan is None:
        return json_error_response("Couldn't find the timespan you specified")
    # iterate through to find the one of interest
    focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, focal_sets_id)
    if focal_set is None:
        return json_error_response('Invalid Focal Set Id')
    # collect the story split counts for each focus
    timespans = apicache.matching_timespans_in_foci(topics_id, base_timespan, focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_split_story_counts(user_mediacloud_key(), topics_id, snapshots_id=snapshots_id,
                                                 timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['split_story_counts'] = data
    return jsonify(focal_set)
def topic_timespan_list(topics_id, snapshots_id):
    foci_id = request.args.get('focusId')
    timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id, snapshots_id, foci_id)
    return jsonify({'list': timespans})
def topic_timespan_list(topics_id, snapshots_id):
    foci_id = request.args.get('focusId')
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id, snapshots_id, foci_id)
    return jsonify({'list': timespans})