def topic_compare_subtopic_top_words(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    selected_focal_sets_id = request.args['focal_sets_id']
    word_count = int(request.args['word_count']) if 'word_count' in request.args else 20
    # first we need to figure out which timespan they are working on
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                      snapshots_id=snapshots_id)
    selected_timespan = None
    for t in selected_snapshot_timespans:
        if t['timespans_id'] == int(timespans_id):
            selected_timespan = t
    focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, selected_focal_sets_id)
    timespans = apicache.matching_timespans_in_foci(topics_id, selected_timespan, focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_word_counts(user_mediacloud_key(), topics_id,
                                          timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['top_words'] = data
    # stitch together the counts to download now
    data = []
    headers = [f['name'] for f in focal_set['foci']]
    for idx in range(0, word_count):
        row = {f['name']: u"{} ({})".format(f['top_words'][idx]['term'], f['top_words'][idx]['count'])
               for f in focal_set['foci']}
        data.append(row)
    return csv.stream_response(data, headers, 'topic-{}-subtopic-{}-{}-top-words-comparison'.format(
        topics_id, focal_set['name'], selected_focal_sets_id))

def map_files(topics_id):
    files = {
        'wordMap': 'unsupported',
        'linkMap': 'not rendered'
    }
    if access_public_topic(topics_id) or is_user_logged_in():
        snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
        map_type = MAP_TYPES[0]  # no linkMaps yet
        prefix = _get_file_prefix(map_type, topics_id, timespans_id)
        lock_filename = prefix + ".lock"
        rendered_filename = prefix + ".gexf"
        # check if rendered file is there
        is_rendered = os.path.isfile(os.path.join(DATA_DIR, rendered_filename))
        if is_rendered:
            status = 'rendered'
        else:
            lockfile_path = os.path.join(DATA_DIR, lock_filename)
            is_generating = os.path.isfile(lockfile_path)
            if not is_generating:
                status = 'starting'
                _start_generating_map_file(map_type, topics_id, timespans_id)
            else:
                status = 'generating'
        files[map_type] = status
        return jsonify(files)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})

def topic_compare_subtopic_top_words(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    selected_focal_sets_id = request.args['focal_sets_id']
    word_count = int(request.args['word_count']) if 'word_count' in request.args else 20
    # first we need to figure out which timespan they are working on
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                      snapshots_id=snapshots_id)
    selected_timespan = None
    for t in selected_snapshot_timespans:
        if t['timespans_id'] == int(timespans_id):
            selected_timespan = t
    try:
        focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, selected_focal_sets_id)
    except ValueError:
        return json_error_response('Invalid Focal Set Id')
    timespans = apicache.matching_timespans_in_foci(topics_id, selected_timespan, focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_word_counts(user_mediacloud_key(), topics_id,
                                          timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['top_words'] = data
    # stitch together the counts to download now
    data = []
    headers = [f['name'] for f in focal_set['foci']]
    for idx in range(0, word_count):
        row = {f['name']: "{} ({})".format(f['top_words'][idx]['term'], f['top_words'][idx]['count'])
               for f in focal_set['foci']}
        data.append(row)
    return csv.stream_response(data, headers, 'topic-{}-subtopic-{}-{}-top-words-comparison'.format(
        topics_id, focal_set['name'], selected_focal_sets_id))

def topic_split_story_counts(user_mc_key, topics_id, **kwargs):
    '''
    Return story counts over time, based on the filters.
    '''
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    timespan = topic_timespan(topics_id, snapshots_id, foci_id, timespans_id)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q,
        'fq': timespan['fq']
    }
    merged_args.update(kwargs)  # passed-in args override anything pulled from the request.args
    # and make sure to ignore undateable stories
    undateable_query_part = "-(tags_id_stories:{})".format(STORY_UNDATEABLE_TAG)  # doesn't work if the query includes parens!
    if (merged_args['q'] is not None) and (len(merged_args['q']) > 0):
        merged_args['q'] = "(({}) AND {})".format(merged_args['q'], undateable_query_part)
    else:
        merged_args['q'] = "* AND {}".format(undateable_query_part)
    results = _cached_topic_split_story_counts(user_mc_key, topics_id, **merged_args)
    results['counts'] = add_missing_dates_to_split_story_counts(
        results['counts'],
        datetime.strptime(timespan['start_date'], mc.SENTENCE_PUBLISH_DATE_FORMAT),
        datetime.strptime(timespan['end_date'], mc.SENTENCE_PUBLISH_DATE_FORMAT))
    return results

def map_files(topics_id):
    files = {'wordMap': 'unsupported', 'linkMap': 'not rendered'}
    if access_public_topic(topics_id) or is_user_logged_in():
        snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
        map_type = MAP_TYPES[0]  # no linkMaps yet
        status = None
        prefix = _get_file_prefix(map_type, topics_id, timespans_id)
        lock_filename = prefix + ".lock"
        rendered_filename = prefix + ".gexf"
        # check if rendered file is there
        is_rendered = os.path.isfile(os.path.join(DATA_DIR, rendered_filename))
        if is_rendered:
            status = 'rendered'
        else:
            is_generating = os.path.isfile(os.path.join(DATA_DIR, lock_filename))
            if not is_generating:
                _start_generating_map_file(map_type, topics_id, timespans_id)
            status = 'generating'
        files[map_type] = status
        return jsonify(files)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})

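# A minimal sketch of the lock-file convention that map_files() polls. The real
# _start_generating_map_file() lives elsewhere in this codebase; the helper name
# and body below are illustrative assumptions, not the verified implementation.
def _start_generating_map_file_sketch(map_type, topics_id, timespans_id):
    prefix = _get_file_prefix(map_type, topics_id, timespans_id)
    lock_path = os.path.join(DATA_DIR, prefix + ".lock")
    open(lock_path, 'w').close()  # the bare .lock file is what marks "generating"
    # ... enqueue the actual rendering job here; on completion it should write
    # prefix + ".gexf" and delete the .lock file, flipping the status to 'rendered'
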
def topic_words(topics_id):
    sample_size = request.args['sample_size'] if 'sample_size' in request.args else WORD_COUNT_SAMPLE_SIZE
    if access_public_topic(topics_id):
        results = apicache.topic_word_counts(TOOL_API_KEY, topics_id, sample_size=sample_size,
                                             snapshots_id=None, timespans_id=None, foci_id=None, q=None)
    elif is_user_logged_in():
        # grab the top words, respecting all the filters
        results = apicache.topic_word_counts(user_mediacloud_key(), topics_id, sample_size=sample_size)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    totals = []  # important so that these get reset on the client when they aren't requested
    logger.debug(request.args)
    if (is_user_logged_in()) and ('withTotals' in request.args) and (request.args['withTotals'] == "true"):
        # return along with the results for the overall timespan, to facilitate comparison
        snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
        overall_timespan = _find_overall_timespan(topics_id, snapshots_id)
        totals = apicache.topic_word_counts(user_mediacloud_key(), topics_id, sample_size=sample_size,
                                            timespans_id=overall_timespan['timespans_id'], foci_id=None, q=None)
    response = {
        'list': results[:WORD_COUNT_UI_NUM_WORDS],
        'totals': totals[:WORD_COUNT_UI_NUM_WORDS],
        'sample_size': str(sample_size)
    }
    return jsonify(response)

def topic_similar_words(topics_id, word):
    # no need for user-specific cache on this
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    results = _word2vec_topic_similar_words(topics_id, snapshots_id, [word])
    if len(results):
        return results[0]['results']
    return []

def topic_word_counts(user_mc_key, topics_id, **kwargs):
    '''
    Return sampled word counts based on filters.
    '''
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q,
        'sample_size': WORD_COUNT_SAMPLE_SIZE,
        'num_words': WORD_COUNT_UI_NUM_WORDS
    }
    merged_args.update(kwargs)  # passed-in args override anything pulled from the request.args
    word_data = _cached_topic_word_counts(user_mc_key, topics_id, **merged_args)
    words = [w['term'] for w in word_data]
    # and now add in word2vec model position data
    google_word2vec_data = _cached_word2vec_google_2d_results(words)
    for i in range(len(google_word2vec_data)):
        word_data[i]['google_w2v_x'] = google_word2vec_data[i]['x']
        word_data[i]['google_w2v_y'] = google_word2vec_data[i]['y']
    topic_word2vec_data = _word2vec_topic_2d_results(topics_id, snapshots_id, words)
    for i in range(len(topic_word2vec_data)):
        word_data[i]['w2v_x'] = topic_word2vec_data[i]['x']
        word_data[i]['w2v_y'] = topic_word2vec_data[i]['y']
    return word_data

def topic_w2v_timespan_embeddings(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # Retrieve embeddings for the overall topic
    overall_word_counts = apicache.topic_word_counts(user_mediacloud_key(), topics_id, num_words=50,
                                                     snapshots_id=snapshots_id, timespans_id=None,
                                                     foci_id=foci_id, q=q)
    overall_words = [x['term'] for x in overall_word_counts]
    overall_embeddings = {x['term']: (x['google_w2v_x'], x['google_w2v_y']) for x in overall_word_counts}
    # Retrieve top words for each timespan
    timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id, snapshots_id, foci_id)
    # Retrieve embeddings for each timespan
    jobs = [{
        'api_key': user_mediacloud_key(),
        'topics_id': topics_id,
        'snapshots_id': snapshots_id,
        'foci_id': foci_id,
        'overall_words': overall_words,
        'overall_embeddings': overall_embeddings,
        'q': q,
        'timespan': t,
    } for t in timespans]
    embeddings_by_timespan = _get_all_timespan_embeddings(jobs)
    return jsonify({'list': embeddings_by_timespan})

def media_outlinks_csv(topics_id, media_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    return stream_story_list_csv(user_mediacloud_key(), 'media-' + media_id + '-outlinks', topics_id,
                                 link_from_media_id=media_id, timespans_id=timespans_id, q=q)

def stream_media_list_csv(user_mc_key, topic, filename, **kwargs):
    filename = topic['name'] + '-' + filename
    # we have to make a separate call for the media info if the user wants to include the media metadata
    include_media_metadata = ('media_metadata' in kwargs) and (kwargs['media_metadata'] == '1')
    # if the focusId is a URL Sharing subtopic, then we have platform-specific post/author/channel share counts
    include_platform_url_shares = kwargs['include_platform_url_shares'] if 'include_platform_url_shares' in kwargs else False
    # if this topic includes platforms, then we have URL sharing counts (post/author/channel) for each platform
    include_all_url_shares = kwargs['include_all_url_shares'] if 'include_all_url_shares' in kwargs else False
    params = kwargs.copy()
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'timespans_id': timespans_id,
        'snapshots_id': snapshots_id,
        'foci_id': foci_id,
        'q': q,
        'sort': request.args.get('sort') if 'sort' in request.args else None,
    }
    params.update(merged_args)
    # do a check to see if the user has added in a real query or not
    if 'q' in params:
        params['q'] = params['q'] if params['q'] not in [None, '', 'null', 'undefined'] else None
    params['limit'] = 1000  # an arbitrary value to let us page through with big topics (note: this is the page size)
    # set up the dict keys / column headers that the user cares about for this download
    props = list(TOPIC_MEDIA_CSV_PROPS)  # copy so the += below doesn't mutate the module-level list
    if include_platform_url_shares:
        props += ['post_count', 'channel_count', 'author_count']
    if include_all_url_shares:
        # if the user requested to download all the url sharing counts by platform, we need to grab the config
        # for that, which is held in the platform seed query objects
        topic_seed_queries = topic['topic_seed_queries']
        extra_columns = []
        for tsq in topic_seed_queries:
            prefix = platform_csv_column_header_prefix(tsq)
            extra_columns += [prefix + 'post_count', prefix + 'channel_count', prefix + 'author_count']
        props += extra_columns
        params['topic_seed_queries'] = topic_seed_queries
    if include_media_metadata:
        props += ['media_pub_country', 'media_pub_state', 'media_language', 'media_about_country', 'media_media_type']
    timestamped_filename = csv.safe_filename(filename)
    headers = {
        "Content-Disposition": "attachment;filename=" + timestamped_filename
    }
    return Response(_stream_media_by_page(user_mc_key, topic['topics_id'], props, **params),
                    mimetype='text/csv; charset=utf-8', headers=headers)

def base_snapshot_timespan(topics_id):
    # find the timespan matching this one in the base snapshot (ie. with no foci_id)
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    base_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                  snapshots_id=snapshots_id, foci_id=None)
    timespan = apicache.topic_timespan(topics_id, snapshots_id, foci_id, timespans_id)  # the selected timespan
    for t in base_snapshot_timespans:
        if apicache.is_timespans_match(timespan, t):
            return t
    raise ValueError("Can't find a timespan in the base snapshot matching the one specified")

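# Hypothetical call site for base_snapshot_timespan(): anchor a cross-subtopic
# comparison on the focus-free timespan, then look up the equivalent timespan in
# each focus. The helper name below is invented for illustration.
def _timespan_ids_across_foci_sketch(topics_id, focal_set):
    base_timespan = base_snapshot_timespan(topics_id)  # raises ValueError if no match
    timespans = apicache.matching_timespans_in_foci(topics_id, base_timespan, focal_set['foci'])
    return [t['timespans_id'] for t in timespans if t is not None]
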
def topic_focal_set_sentences_compare(topics_id, focal_sets_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    all_focal_sets = topic_focal_sets(user_mediacloud_key(), topics_id, snapshots_id)
    # need the timespan info, to find the appropriate timespan within each focus
    base_snapshot_timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id, snapshots_id=snapshots_id)
    base_timespan = None
    # if they have a focus selected, we need to find the appropriate overall timespan
    if foci_id is not None:
        timespan = topic_timespan(topics_id, snapshots_id, foci_id, timespans_id)
        for t in base_snapshot_timespans:
            if timespans_match(timespan, t):
                base_timespan = t
    else:
        for t in base_snapshot_timespans:
            if t['timespans_id'] == int(timespans_id):
                base_timespan = t
        logger.info('base timespan = %s', timespans_id)
    if base_timespan is None:
        return json_error_response("Couldn't find the timespan you specified")
    # iterate through to find the one of interest
    focal_set = None
    for fs in all_focal_sets:
        if int(fs['focal_sets_id']) == int(focal_sets_id):
            focal_set = fs
    if focal_set is None:
        return json_error_response('Invalid Focal Set Id')
    # collect the sentence counts for each focus
    for focus in focal_set['foci']:
        # find the matching timespan within this focus
        snapshot_timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                        snapshots_id=snapshots_id, foci_id=focus['foci_id'])
        timespan = None
        for t in snapshot_timespans:
            if timespans_match(t, base_timespan):
                timespan = t
                logger.info('matching in focus %s, timespan = %s', focus['foci_id'], t['timespans_id'])
        if timespan is None:
            return json_error_response("Couldn't find a matching timespan in the " + focus['name'] + " focus")
        data = topic_sentence_counts(user_mediacloud_key(), topics_id, snapshots_id=snapshots_id,
                                     timespans_id=timespan['timespans_id'], foci_id=focus['foci_id'])
        focus['sentence_counts'] = data
    return jsonify(focal_set)

def topic_focal_set_list(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    include_story_counts = request.args.get('includeStoryCounts')
    focal_sets = apicache.topic_focal_sets_list(user_mediacloud_key(), topics_id, snapshots_id)
    # now mark the ones that are the magically added URL sharing platform ones
    for fs in focal_sets:
        fs['is_url_sharing'] = is_url_sharing_focal_set(fs)
    if include_story_counts and (include_story_counts == u'1'):
        _add_story_counts_to_foci(topics_id, focal_sets)
    return jsonify(focal_sets)

def topic_media_csv(topics_id):
    sort = validated_sort(request.args.get('sort'))
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    return _stream_media_list_csv(user_mediacloud_key(), 'media-for-topic-' + topics_id, topics_id, sort=sort,
                                  snapshots_id=snapshots_id, timespans_id=timespans_id, foci_id=foci_id, q=q)

def matching_timespans_in_foci(topics_id, timespan_to_match, foci):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    timespans = []
    for focus in foci:
        # find the matching timespan within this focus
        snapshot_timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                        snapshots_id=snapshots_id, foci_id=focus['foci_id'])
        timespan = _matching_timespan(timespan_to_match, snapshot_timespans)
        timespans.append(timespan)
        # if timespan is None:
        #     return json_error_response('Couldn\'t find a matching timespan in the ' + focus.name + ' focus')
    return timespans

def topic_sentence_sample(user_mc_key, topics_id, sample_size=1000, **kwargs):
    '''
    Return a sample of sentences based on the filters.
    '''
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q
    }
    merged_args.update(kwargs)  # passed-in args override anything pulled from the request.args
    return _cached_topic_sentence_sample(user_mc_key, topics_id, sample_size, **merged_args)

def topic_sentence_sample(user_mc_key, topics_id, sample_size=1000, **kwargs):
    """
    Return a sample of sentences based on the filters.
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q
    }
    merged_args.update(kwargs)  # passed-in args override anything pulled from the request.args
    return _cached_topic_sentence_sample(user_mc_key, sample_size, **merged_args)

def topic_tag_counts(user_mc_key, topics_id, tag_sets_id):
    """
    Get a breakdown of the most-used tags within a set within a single timespan.
    This supports just timespans_id and q from the request, because it has to use sentenceFieldCount,
    not a topicSentenceFieldCount method that takes filters (which doesn't exist).
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    timespan_query = "timespans_id:{}".format(timespans_id)
    if (q is None) or (len(q) == 0):
        query = timespan_query
    else:
        query = "({}) AND ({})".format(q, timespan_query)
    return _cached_topic_tag_counts(user_mc_key, topics_id, tag_sets_id, query)

def topic_tag_counts(user_mc_key, topics_id, tag_sets_id, sample_size=None):
    '''
    Get a breakdown of the most-used tags within a set within a single timespan.
    This supports just timespans_id and q from the request, because it has to use sentenceFieldCount,
    not a topicSentenceFieldCount method that takes filters (which doesn't exist).
    '''
    # return []  # SUPER HACK!
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    timespan_query = "timespans_id:{}".format(timespans_id)
    if (q is None) or (len(q) == 0):
        query = timespan_query
    else:
        query = "({}) AND ({})".format(q, timespan_query)
    return _cached_topic_tag_counts(user_mc_key, topics_id, tag_sets_id, sample_size, query)

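# Worked example of the Solr query topic_tag_counts() builds (the values are illustrative):
#   q="ebola", timespans_id=123  ->  "(ebola) AND (timespans_id:123)"
#   q=None,    timespans_id=123  ->  "timespans_id:123"
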
def topic_story_count(user_mc_key, topics_id, **kwargs):
    """
    Return filtered story count within topic.
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q
    }
    merged_args.update(kwargs)  # passed-in args override anything pulled from the request.args
    return _cached_topic_story_count(user_mc_key, topics_id, **merged_args)

def topic_focal_set_split_stories_compare(topics_id, focal_sets_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # need the timespan info, to find the appropriate timespan within each focus
    try:
        base_timespan = base_snapshot_timespan(topics_id)
        focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, focal_sets_id)
    except ValueError as e:
        return json_error_response(str(e))
    # collect the story split counts for each focus
    timespans = apicache.matching_timespans_in_foci(topics_id, base_timespan, focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_split_story_counts(user_mediacloud_key(), topics_id, snapshots_id=snapshots_id,
                                                 timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['split_story_counts'] = data
    return jsonify(focal_set)

def topic_media_list(user_mc_key, topics_id, **kwargs):
    """
    Return sorted media list based on filters.
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q,
        'sort': request.args.get('sort'),
        'limit': request.args.get('limit'),
        'link_id': request.args.get('linkId'),
    }
    merged_args.update(kwargs)  # passed-in args override anything pulled from the request.args
    return _cached_topic_media(user_mc_key, topics_id, **merged_args)

def topic_media_list(user_mc_key, topics_id, **kwargs):
    '''
    Return sorted media list based on filters.
    '''
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q,
        'sort': validated_sort(request.args.get('sort')),
        'limit': request.args.get('limit'),
        'link_id': request.args.get('linkId'),
    }
    merged_args.update(kwargs)  # passed-in args override anything pulled from the request.args
    return _cached_topic_media_list_with_metadata(user_mc_key, topics_id, **merged_args)

def matching_timespans_in_foci(topics_id, timespan_to_match, foci):
    """
    For cross-subtopic analysis within a subtopic set, we need to identify the timespan that has the same
    date range in each subtopic within the set. This helper does that annoying work for you.
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    timespans = []
    for focus in foci:
        # find the matching timespan within this focus
        snapshot_timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                        snapshots_id=snapshots_id, foci_id=focus['foci_id'])
        timespan = _matching_timespan(timespan_to_match, snapshot_timespans)
        timespans.append(timespan)
        # if timespan is None:
        #     return json_error_response('Couldn\'t find a matching timespan in the ' + focus.name + ' focus')
    return timespans

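# _matching_timespan() is defined elsewhere in this codebase; a plausible
# implementation compares the date range and period, along the lines of the
# sketch below (an assumption, not the verified original).
def _matching_timespan_sketch(timespan_to_match, candidates):
    for t in candidates:
        if (t['start_date'] == timespan_to_match['start_date']) and \
                (t['end_date'] == timespan_to_match['end_date']) and \
                (t['period'] == timespan_to_match['period']):
            return t
    return None  # callers must handle a focus that has no equivalent timespan
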
def topic_timespan_list(topics_id, snapshots_id):
    ignored_snapshots_id, _timespans_id, foci_id, _q = filters_from_args(request.args)
    timespans = apicache.cached_topic_timespan_list(topics_id, snapshots_id, foci_id)
    # add the focal_set type to the timespan so we can use that in the client (ie. decide what to show or not,
    # based on what type of focal_set this timespan is part of)
    focal_sets = apicache.topic_focal_sets_list(user_mediacloud_key(), topics_id, snapshots_id)
    for t in timespans:
        for fs in focal_sets:
            for f in fs['foci']:
                if f['foci_id'] == t['foci_id']:
                    t['focal_set'] = fs
                    t['focus'] = f
                    break
    return jsonify({'list': timespans})

def topic_sentence_sample(user_mc_key, sample_size=1000, **kwargs):
    """
    Return a sample of sentences based on the filters. A topic ID isn't needed because there is no
    topicSentenceList endpoint. Random sentence samples are pulled by using the timespans_id with a
    regular sentenceList call.
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q
    }
    merged_args.update(kwargs)  # passed-in args override anything pulled from the request.args
    return _cached_topic_sentence_sample(user_mc_key, sample_size, **merged_args)

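# Hypothetical call site: because the timespans_id filter alone scopes the query
# to the topic, a caller can sample the currently filtered timespan directly:
#   sentences = topic_sentence_sample(user_mediacloud_key(), sample_size=500)
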
def topic_story_list(user_mc_key, topics_id, **kwargs):
    '''
    Return sorted story list based on filters.
    '''
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q,
        'sort': validated_sort(request.args.get('sort')),
        'limit': request.args.get('limit'),
        'link_id': request.args.get('linkId'),
    }
    merged_args.update(kwargs)  # passed-in args override anything pulled from the request.args
    results = _cached_topic_story_list(user_mc_key, topics_id, **merged_args)
    if merged_args['limit']:  # TODO: remove this (force limit as workaround to back-end bug)
        results['stories'] = results['stories'][:int(merged_args['limit'])]
    return results

def _add_story_counts_to_foci(topics_id, focal_sets):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # need the timespan info, to find the appropriate timespan within each focus
    try:
        base_timespan = base_snapshot_timespan(topics_id)
    except ValueError as e:
        return json_error_response(str(e))
    # now find the story count for each focus in each set
    for fs in focal_sets:
        timespans = apicache.matching_timespans_in_foci(topics_id, base_timespan, fs['foci'])
        for idx in range(0, len(timespans)):
            timespan = timespans[idx]
            focus = fs['foci'][idx]
            foci_story_count = apicache.topic_story_count(user_mediacloud_key(), topics_id,
                                                          snapshots_id=snapshots_id,
                                                          timespans_id=timespan['timespans_id'],
                                                          q=q,
                                                          foci_id=focus['foci_id'])['count']
            focus['story_count'] = foci_story_count
    return jsonify(focal_sets)

def topic_story_list(user_mc_key, topics_id, **kwargs):
    # Return sorted story list based on filters.
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # these are the arguments supported by the low-level API method
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q,
        'sort': request.args.get('sort'),
        'limit': request.args.get('limit'),
        'link_id': request.args.get('linkId'),
    }
    # make sure not to add in other parameters from kwargs that aren't supported by the API method
    for k in TOPIC_STORY_LIST_API_PARAMS:
        if (k in merged_args) and (k in kwargs):
            merged_args[k] = kwargs[k]
    results = _cached_topic_story_list(user_mc_key, topics_id, **merged_args)
    if merged_args['limit']:  # TODO: remove this (this enforces the limit as a workaround to a back-end bug)
        results['stories'] = results['stories'][:int(merged_args['limit'])]
    return results

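# Illustrative effect of the whitelist loop above: if TOPIC_STORY_LIST_API_PARAMS
# contains 'sort', then kwargs={'sort': 'inlink', 'media_metadata': '1'} would
# pass 'sort' through to the API call and silently drop 'media_metadata'.
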
def topic_tag_coverage(topics_id, tags_id):
    '''
    Useful for seeing how many stories in the topic are tagged with a specific tag.
    '''
    # respect any query filter the user has set
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    query_with_tag = add_to_user_query("tags_id_stories:{}".format(tags_id))
    # now get the counts
    if access_public_topic(topics_id):
        total = topic_story_count(TOOL_API_KEY, topics_id)
        tagged = topic_story_count(TOOL_API_KEY, topics_id, q=query_with_tag)  # force a count with just the query
    elif is_user_logged_in():
        total = topic_story_count(user_mediacloud_key(), topics_id)
        tagged = topic_story_count(user_mediacloud_key(), topics_id, q=query_with_tag)  # force a count with just the query
    else:
        return None
    return {'counts': {'count': tagged['count'], 'total': total['count']}}

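# Hypothetical consumer of topic_tag_coverage(): reduce the returned counts to a
# coverage ratio. The helper name is invented for illustration.
def _tag_coverage_ratio_sketch(topics_id, tags_id):
    coverage = topic_tag_coverage(topics_id, tags_id)
    if coverage is None:  # not logged in and the topic isn't public
        return None
    counts = coverage['counts']
    return counts['count'] / counts['total'] if counts['total'] > 0 else 0.0
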
def topic_sentence_counts(user_mc_key, topics_id, **kwargs):
    '''
    Return sentence counts over time, based on the filters.
    '''
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q
    }
    merged_args.update(kwargs)  # passed-in args override anything pulled from the request.args
    # and make sure to ignore undateable stories
    undateable_query_part = "NOT tags_id_stories:{}".format(STORY_UNDATEABLE_TAG)  # doesn't work if the query includes parens!
    if merged_args['q'] is not None:
        merged_args['q'] += " AND {}".format(undateable_query_part)
    else:
        merged_args['q'] = "* AND {}".format(undateable_query_part)
    return _cached_topic_sentence_counts(user_mc_key, topics_id, **merged_args)

def topic_word_counts(user_mc_key, topics_id, **kwargs):
    '''
    Return sampled word counts based on filters.
    '''
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q,
        'sample_size': 1000
    }
    merged_args.update(kwargs)  # passed-in args override anything pulled from the request.args
    word_data = _cached_topic_word_counts(user_mc_key, topics_id, **merged_args)
    words = [w['term'] for w in word_data]
    word2vec_data = _cached_word2vec_google_2d_results(words)
    for i in range(len(word2vec_data)):
        word_data[i]['google_w2v_x'] = word2vec_data[i]['x']
        word_data[i]['google_w2v_y'] = word2vec_data[i]['y']
    return word_data

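# Hypothetical consumer: after topic_word_counts() runs, every word dict carries
# 2D coordinates from the pre-trained Google News word2vec projection, ready for
# a scatter plot:
#   for w in topic_word_counts(user_mc_key, topics_id)[:20]:
#       print(w['term'], w['count'], w['google_w2v_x'], w['google_w2v_y'])
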
def topic_focal_set_split_stories_compare(topics_id, focal_sets_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # need the timespan info, to find the appropriate timespan within each focus
    base_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                  snapshots_id=snapshots_id)
    base_timespan = None
    # if they have a focus selected, we need to find the appropriate overall timespan
    if foci_id is not None:
        timespan = apicache.topic_timespan(topics_id, snapshots_id, foci_id, timespans_id)
        for t in base_snapshot_timespans:
            if apicache.is_timespans_match(timespan, t):
                base_timespan = t
    else:
        for t in base_snapshot_timespans:
            if t['timespans_id'] == int(timespans_id):
                base_timespan = t
        logger.info('base timespan = %s', timespans_id)
    if base_timespan is None:
        return json_error_response("Couldn't find the timespan you specified")
    # iterate through to find the one of interest
    focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, focal_sets_id)
    if focal_set is None:
        return json_error_response('Invalid Focal Set Id')
    # collect the story split counts for each focus
    timespans = apicache.matching_timespans_in_foci(topics_id, base_timespan, focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_split_story_counts(user_mediacloud_key(), topics_id, snapshots_id=snapshots_id,
                                                 timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['split_story_counts'] = data
    return jsonify(focal_set)

def topic_word_counts(user_mc_key, topics_id, **kwargs):
    # Return sampled word counts based on filters.
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q,
        'sample_size': WORD_COUNT_SAMPLE_SIZE,
        'num_words': WORD_COUNT_UI_NUM_WORDS
    }
    merged_args.update(kwargs)  # passed-in args override anything pulled from the request.args
    word_data = cached_topic_word_counts(user_mc_key, topics_id, **merged_args)
    words = [w['term'] for w in word_data]
    # and now add in word2vec model position data
    google_word2vec_data = _cached_word2vec_google_2d_results(words)
    for i in range(len(google_word2vec_data)):
        word_data[i]['google_w2v_x'] = google_word2vec_data[i]['x']
        word_data[i]['google_w2v_y'] = google_word2vec_data[i]['y']
    topic_word2vec_data = _word2vec_topic_2d_results(topics_id, snapshots_id, words)
    for i in range(len(topic_word2vec_data)):
        word_data[i]['w2v_x'] = topic_word2vec_data[i]['x']
        word_data[i]['w2v_y'] = topic_word2vec_data[i]['y']
    return word_data

def timespan_files_list(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    results = apicache.topic_timespan_files_list(topics_id, timespans_id)
    return jsonify(results)

def media_stories_csv(topics_id, media_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    return stream_story_list_csv(user_mediacloud_key(), 'media-' + media_id + '-stories', topics_id,
                                 media_id=media_id, timespans_id=timespans_id, q=q)