def api_explorer_story_split_count_csv():
    filename = 'stories-over-time'
    data = request.form
    if 'searchId' in data:
        solr_q, solr_fq = parse_as_sample(data['searchId'], data['index'])
        filename = filename  # don't have this info + current_query['q']
        SAMPLE_SEARCHES = load_sample_searches()
        queries = SAMPLE_SEARCHES[data['searchId']]['queries']
    else:
        queries = json.loads(data['queries'])
        label = " ".join([q['label'] for q in queries])
        filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        solr_q, solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*', media_ids=q['sources'],
                                                     tags_ids=q['collections'])
        story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        story_count_results.append({
            'date': q['startDate'],
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            'ratio': float(story_counts['total']) / float(story_counts['normalized_total'])
        })
    props = ['date', 'query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)
def topic_compare_subtopic_top_words(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    selected_focal_sets_id = request.args['focal_sets_id']
    word_count = request.args['word_count'] if 'word_count' in request.args else 20
    # first we need to figure out which timespan they are working on
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                      snapshots_id=snapshots_id)
    selected_timespan = None
    for t in selected_snapshot_timespans:
        if t['timespans_id'] == int(timespans_id):
            selected_timespan = t
    focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, selected_focal_sets_id)
    timespans = apicache.matching_timespans_in_foci(topics_id, selected_timespan, focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_word_counts(user_mediacloud_key(), topics_id,
                                          timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['top_words'] = data
    # stitch together the counts to download now
    data = []
    headers = [f['name'] for f in focal_set['foci']]
    for idx in range(0, word_count):
        row = {f['name']: u"{} ({})".format(f['top_words'][idx]['term'], f['top_words'][idx]['count'])
               for f in focal_set['foci']}
        data.append(row)
    return csv.stream_response(data, headers, 'topic-{}-subtopic-{}-{}-top-words-comparison'.format(
        topics_id, focal_set['name'], selected_focal_sets_id))
def api_collection_source_representation_csv(collection_id):
    user_mc = user_mediacloud_client()
    info = user_mc.tag(collection_id)
    source_representation = apicache.collection_source_representation(user_mediacloud_key(), collection_id)
    props = ['media_id', 'media_name', 'media_url', 'stories', 'sample_size', 'story_pct']
    filename = info['label'] + "-source sentence counts.csv"
    return csv.stream_response(source_representation, props, filename)
def api_download_sources_template():
    filename = "Collection_Template_for_sources.csv"
    what_type_download = SOURCES_TEMPLATE_PROPS_EDIT
    return csv.stream_response(what_type_download, what_type_download, filename)
def stream_story_count_csv(fn, search_id_or_query_list):
    '''
    Helper method to stream a list of story counts back to the client as a csv. Any args you pass in
    will simply be passed on to a call to topicStoryList.
    '''
    # if we have a search id, we load the samples from our sample searches file
    filename = ''
    story_count_results = []
    SAMPLE_SEARCHES = load_sample_searches()
    try:
        search_id = int(search_id_or_query_list)
        if search_id >= 0:
            SAMPLE_SEARCHES = load_sample_searches()
            sample_queries = SAMPLE_SEARCHES[search_id]['queries']
            for query in sample_queries:
                solr_query = prep_simple_solr_query(query)
                storyList = cached_story_count(solr_query)
                query_and_story_count = {'query': query['label'], 'count': storyList['count']}
                story_count_results.append(query_and_story_count)
    except Exception as e:
        custom_queries = json.loads(search_id_or_query_list)
        for query in custom_queries:
            solr_query = parse_query_with_keywords(query)
            filename = fn + query['q']
            storyList = cached_story_count(solr_query)
            query_and_story_count = {'query': query['label'], 'count': storyList['count']}
            story_count_results.append(query_and_story_count)
    props = ['query', 'count']
    return csv.stream_response(story_count_results, props, filename)
def topic_compare_subtopic_top_words(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    selected_focal_sets_id = request.args['focal_sets_id']
    word_count = request.args['word_count'] if 'word_count' in request.args else 20
    # first we need to figure out which timespan they are working on
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                      snapshots_id=snapshots_id)
    selected_timespan = None
    for t in selected_snapshot_timespans:
        if t['timespans_id'] == int(timespans_id):
            selected_timespan = t
    try:
        focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, selected_focal_sets_id)
    except ValueError:
        return json_error_response('Invalid Focal Set Id')
    timespans = apicache.matching_timespans_in_foci(topics_id, selected_timespan, focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_word_counts(user_mediacloud_key(), topics_id,
                                          timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['top_words'] = data
    # stitch together the counts to download now
    data = []
    headers = [f['name'] for f in focal_set['foci']]
    for idx in range(0, word_count):
        row = {f['name']: "{} ({})".format(f['top_words'][idx]['term'], f['top_words'][idx]['count'])
               for f in focal_set['foci']}
        data.append(row)
    return csv.stream_response(data, headers, 'topic-{}-subtopic-{}-{}-top-words-comparison'.format(
        topics_id, focal_set['name'], selected_focal_sets_id))
def api_metadata_download(collection_id):
    all_media = media_with_tag(user_mediacloud_key(), collection_id)
    metadata_items = []
    for media_source in all_media:
        for tag in media_source['media_source_tags']:
            if is_metadata_tag_set(tag['tag_sets_id']):
                found = False
                for dictItem in metadata_items:
                    if dictItem['metadataId'] == tag['tag_sets_id']:
                        temp = dictItem['tagged']
                        dictItem.update({'tagged': temp + 1})
                        found = True
                if not found:
                    metadata_items.append({
                        'metadataCoverage': tag['tag_set'],
                        'metadataId': tag['tag_sets_id'],
                        'tagged': 1
                    })
    for i in metadata_items:
        temp = len(all_media) - i['tagged']
        i.update({'notTagged': temp})
    props = ['metadataCoverage', 'tagged', 'notTagged']
    filename = "metadataCoverageForCollection" + collection_id + ".csv"
    return csv.stream_response(metadata_items, props, filename)
def api_download_sources_template():
    filename = "media cloud collection upload template.csv"
    what_type_download = SOURCE_LIST_CSV_EDIT_PROPS
    return csv.stream_response(what_type_download, what_type_download, filename)
def entities_csv(topics_id, type_entity):
    tag_type = CLIFF_PEOPLE if type_entity == 'people' else CLIFF_ORGS
    top_tag_counts = topic_tag_counts(user_mediacloud_key(), topics_id, tag_type)
    data = process_tags_for_coverage(topics_id, top_tag_counts)
    return csv.stream_response(top_tag_counts, ENTITY_DOWNLOAD_COLUMNS,
                               'topic-{}-entities-{}'.format(topics_id, type_entity))
def story_words_csv(topics_id, stories_id):
    query = add_to_user_query('stories_id:' + stories_id)
    ngram_size = request.args['ngram_size'] if 'ngram_size' in request.args else 1  # default to word count
    word_counts = topic_ngram_counts(user_mediacloud_key(), topics_id, ngram_size, q=query,
                                     num_words=WORD_COUNT_DOWNLOAD_NUM_WORDS)
    return csv.stream_response(word_counts, WORD_COUNT_DOWNLOAD_COLUMNS,
                               'topic-{}-story-{}-sampled-ngrams-{}-word'.format(topics_id, stories_id, ngram_size))
def explorer_story_count_csv():
    filename = u'total-story-count'
    data = request.form
    if 'searchId' in data:
        # TODO: don't load this query twice because that is kind of dumb
        sample_searches = load_sample_searches()
        queries = sample_searches[data['searchId']]['queries']
    else:
        queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        solr_q, solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*', media_ids=q['sources'],
                                                     tags_ids=q['collections'])
        story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        story_count_results.append({
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            'ratio': float(story_counts['total']) / float(story_counts['normalized_total'])
        })
    props = ['query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)
def explorer_story_count_csv():
    filename = 'total-story-count'
    data = request.form
    if 'searchId' in data:
        # TODO: don't load this query twice because that is kind of dumb
        sample_searches = load_sample_searches()
        queries = sample_searches[data['searchId']]['queries']
    else:
        queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        solr_q, solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*', media_ids=q['sources'],
                                                     tags_ids=q['collections'])
        story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        story_count_results.append({
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            'ratio': float(story_counts['total']) / float(story_counts['normalized_total'])
        })
    props = ['query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)
def explorer_story_count_csv():
    filename = 'total-story-count'
    data = request.form
    if 'searchId' in data:
        queries = SAMPLE_SEARCHES[data['searchId']]['queries']
    else:
        queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        if (len(q['collections']) == 0) and only_queries_reddit(q['sources']):
            start_date, end_date = parse_query_dates(q)
            story_counts = pushshift.reddit_submission_normalized_and_split_story_count(
                query=q['q'],
                start_date=start_date,
                end_date=end_date,
                subreddits=pushshift.NEWS_SUBREDDITS)
        else:
            solr_q, solr_fq = parse_query_with_keywords(q)
            solr_open_query = concatenate_query_for_solr(solr_seed_query='*', media_ids=q['sources'],
                                                         tags_ids=q['collections'])
            story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        story_count_results.append({
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            'ratio': float(story_counts['total']) / float(story_counts['normalized_total'])
        })
    props = ['query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)
def story_tags_csv(stories_id):
    # in the download include all entity types
    admin_mc = user_admin_mediacloud_client()
    if stories_id in [None, 'NaN']:
        return jsonify({'error': 'bad value'})
    story = admin_mc.story(stories_id, text=True)  # Note - this call doesn't pull cliff places
    props = ['tags_id', 'tag', 'tag_sets_id', 'tag_set']
    return csv.stream_response(story['story_tags'], props, 'story-' + str(stories_id) + '-all-tags-and-tag-sets')
def collection_source_sentence_counts_csv(collection_id):
    user_mc = user_admin_mediacloud_client()
    info = user_mc.tag(collection_id)
    results = _cached_media_with_sentence_counts(user_mediacloud_key(), collection_id)
    props = ['media_id', 'name', 'url', 'sentence_count', 'sentence_pct']
    filename = info['label'] + "-source sentence counts.csv"
    return csv.stream_response(results, props, filename)
def stream_story_samples_csv(filename, stories):
    '''
    Helper method to stream a list of stories back to the client as a csv. Any args you pass in
    will simply be passed on to a call to topicStoryList.
    '''
    props = ['stories_id', 'publish_date', 'title', 'url', 'media_name', 'media_id', 'language']
    return csv.stream_response(stories, props, filename)
def story_entities_csv(topics_id, stories_id):
    # in the download include all entity types
    entities = cached_entities(user_mediacloud_key(), stories_id)
    if entities is None:
        # none means not processed by corenlp, but for download just make it empty
        entities = []
    props = ['type', 'name', 'words']
    return csv.stream_response(entities, props, 'story-' + str(stories_id) + '-entities')
def topic_provider_words_csv(topics_id):
    optional_args = _parse_words_optional_arguments()
    results = apicache.topic_ngram_counts(user_mediacloud_key(), topics_id, **optional_args)
    file_name = 'topic-{}-sampled-ngrams-{}-word'.format(topics_id, optional_args['ngram_size'])
    return csv.stream_response(results, apicache.WORD_COUNT_DOWNLOAD_COLUMNS, file_name)
def _stream_topic_split_story_counts_csv(results, filename):
    clean_results = [{'date': trim_solr_date(item['date']), 'stories': item['count']}
                     for item in results['counts']]
    sorted_results = sorted(clean_results, key=itemgetter('date'))
    props = ['date', 'stories']
    return csv.stream_response(sorted_results, props, filename)
def topic_words_csv(topics_id):
    query = apicache.add_to_user_query(None)
    sample_size = request.args['sample_size'] if 'sample_size' in request.args else WORD_COUNT_SAMPLE_SIZE
    ngram_size = request.args['ngram_size'] if 'ngram_size' in request.args else 1  # default to word count
    word_counts = apicache.topic_ngram_counts(user_mediacloud_key(), topics_id, ngram_size=ngram_size, q=query,
                                              num_words=WORD_COUNT_DOWNLOAD_NUM_WORDS, sample_size=sample_size)
    return csv.stream_response(word_counts, apicache.WORD_COUNT_DOWNLOAD_COLUMNS,
                               'topic-{}-sampled-ngrams-{}-word'.format(topics_id, ngram_size))
def stream_split_stories_csv(user_mc_key, filename, item_id, which):
    response = {
        'story_splits': apicache.last_year_split_story_count(user_mc_key, [which + ":" + str(item_id)])['counts']
    }
    clean_results = [{'date': trimSolrDate(item['date']), 'numStories': item['count']}
                     for item in response['story_splits']]
    clean_results = sorted(clean_results, key=itemgetter('date'))
    props = ['date', 'numStories']
    return csv.stream_response(clean_results, props, filename)
def media_words_csv(topics_id, media_id):
    query = apicache.add_to_user_query('media_id:' + media_id)
    ngram_size = request.args['ngram_size'] if 'ngram_size' in request.args else 1  # default to word count
    word_counts = apicache.topic_ngram_counts(user_mediacloud_key(), topics_id, ngram_size=ngram_size, q=query,
                                              num_words=WORD_COUNT_DOWNLOAD_NUM_WORDS,
                                              sample_size=WORD_COUNT_DOWNLOAD_SAMPLE_SIZE)
    return csv.stream_response(word_counts, apicache.WORD_COUNT_DOWNLOAD_COLUMNS,
                               'topic-{}-media-{}-sampled-ngrams-{}-word'.format(topics_id, media_id, ngram_size))
def api_collection_source_representation_csv(collection_id):
    user_mc = user_mediacloud_client()
    info = user_mc.tag(collection_id)
    source_representation = apicache.collection_source_representation(user_mediacloud_key(), collection_id,
                                                                      sample_size=500,
                                                                      fq="publish_date:[NOW-90DAY TO NOW]")
    props = ['media_id', 'media_name', 'media_url', 'stories', 'sample_size', 'story_pct']
    filename = info['label'] + "-source sentence counts.csv"
    return csv.stream_response(source_representation, props, filename)
def stream_wordcount_csv(filename, q, fq, ngram_size=1, sample_size=WORD_COUNT_SAMPLE_SIZE):
    # use bigger values for CSV download
    num_words = WORD_COUNT_DOWNLOAD_NUM_WORDS
    word_counts = query_wordcount(q, fq, ngram_size, num_words, sample_size)
    for w in word_counts:
        w['sample_size'] = sample_size
        w['ratio'] = float(w['count']) / float(sample_size)
    props = ['term', 'stem', 'count', 'sample_size', 'ratio', 'google_w2v_x', 'google_w2v_y']
    return csv.stream_response(word_counts, props, filename)
def explorer_geo_csv():
    filename = 'sampled-geographic-coverage'
    data = request.form
    query_object = json.loads(data['q'])
    solr_q, solr_fq = parse_query_with_keywords(query_object)
    filename = file_name_for_download(query_object['label'], filename)
    data = apicache.top_tags_with_coverage(solr_q, solr_fq, tags.GEO_TAG_SET)
    props = ['tags_id', 'label', 'count', 'pct']
    return csv.stream_response(data['results'], props, filename)
def story_subreddit_shares_csv(stories_id):
    story = mc.story(stories_id)
    submissions_by_sub = pushshift.reddit_url_submissions_by_subreddit(story['url'])
    props = ['name', 'value']
    column_names = ['subreddit', 'submissions']
    return csv.stream_response(submissions_by_sub, props, 'story-' + str(stories_id) + '-subreddit',
                               column_names=column_names)
def stream_sentence_count_csv(user_mc_key, filename, topics_id, **kwargs):
    results = topic_sentence_counts(user_mc_key, topics_id, **kwargs)
    clean_results = [{'date': date, 'numFound': count}
                     for date, count in results['split'].iteritems()
                     if date not in ['gap', 'start', 'end']]
    sorted_results = sorted(clean_results, key=itemgetter('date'))
    props = ['date', 'numFound']
    return csv.stream_response(sorted_results, props, filename)
def stream_topic_split_story_counts_csv(user_mc_key, filename, topics_id, **kwargs):
    results = topic_split_story_counts(user_mc_key, topics_id, **kwargs)
    clean_results = [{'date': item['date'], 'stories': item['count']} for item in results['counts']]
    sorted_results = sorted(clean_results, key=itemgetter('date'))
    props = ['date', 'stories']
    return csv.stream_response(sorted_results, props, filename)
def explorer_entities_csv(tag_sets_id):
    tag_set = base_apicache.tag_set(tag_sets_id)
    filename = 'sampled-{}'.format(tag_set['label'])
    data = request.form
    query_object = json.loads(data['q'])
    solr_q, solr_fq = parse_query_with_keywords(query_object)
    filename = file_name_for_download(query_object['label'], filename)
    top_tag_counts = apicache.top_tags_with_coverage(solr_q, solr_fq, tag_sets_id,
                                                     TAG_COUNT_DOWNLOAD_LENGTH)['results']
    return csv.stream_response(top_tag_counts, ENTITY_DOWNLOAD_COLUMNS, filename)
def stream_split_stories_csv(user_mc_key, filename, q):
    response = {
        'story_splits': apicache.split_story_count(user_mc_key, q)['counts']
    }
    clean_results = [{'date': trim_solr_date(item['date']), 'numStories': item['count']}
                     for item in response['story_splits']]
    clean_results = sorted(clean_results, key=itemgetter('date'))
    props = ['date', 'numStories']
    return csv.stream_response(clean_results, props, filename)
def story_tags_csv(stories_id):
    # in the download include all entity types
    admin_mc = user_admin_mediacloud_client()
    if stories_id in [None, 'NaN']:
        return jsonify({'error': 'bad value'})
    story = admin_mc.story(stories_id, text=True)  # Note - this call doesn't pull cliff places
    props = ['tags_id', 'tag', 'tag_sets_id', 'tag_set']
    return csv.stream_response(story['story_tags'], props, 'story-' + str(stories_id) + '-all-tags-and-tag-sets')
def stream_sentence_count_csv(user_mc_key, filename, item_id, which):
    response = {}
    response['sentencecounts'] = cached_recent_sentence_counts(user_mc_key, [which + ":" + str(item_id)])
    clean_results = [{'date': date, 'numFound': count}
                     for date, count in response['sentencecounts'].iteritems()
                     if date not in ['gap', 'start', 'end']]
    clean_results = sorted(clean_results, key=itemgetter('date'))
    props = ['date', 'numFound']
    return csv.stream_response(clean_results, props, filename)
def explorer_entities_csv(tag_sets_id):
    tag_set = apicache.tag_set(tag_sets_id)
    filename = 'sampled-{}'.format(tag_set['label'])
    data = request.form
    if 'searchId' in data:
        solr_q, solr_fq = parse_as_sample(data['searchId'], data['index'])
    else:
        query_object = json.loads(data['q'])
        solr_q, solr_fq = parse_query_with_keywords(query_object)
        filename = file_name_for_download(query_object['label'], filename)
    top_tag_counts = apicache.top_tags_with_coverage(solr_q, solr_fq, tag_sets_id,
                                                     TAG_COUNT_DOWNLOAD_LENGTH)['results']
    return csv.stream_response(top_tag_counts, ENTITY_DOWNLOAD_COLUMNS, filename)
def topic_word_associated_words_csv(topics_id, word):
    query = apicache.add_to_user_query(word)
    ngram_size = request.args['ngram_size'] if 'ngram_size' in request.args else 1  # default to word count
    word_counts = apicache.topic_ngram_counts(user_mediacloud_key(), topics_id, ngram_size=ngram_size, q=query)
    return csv.stream_response(word_counts, apicache.WORD_COUNT_DOWNLOAD_COLUMNS,
                               'topic-{}-{}-sampled-ngrams-{}-word'.format(topics_id, word, ngram_size))
def stream_split_stories_csv(user_mc_key, filename, item_id, which):
    response = {
        'story_splits': apicache.last_year_split_story_count(user_mc_key, [which + ":" + str(item_id)])['counts']
    }
    clean_results = [{'date': trimSolrDate(item['date']), 'numStories': item['count']}
                     for item in response['story_splits']]
    clean_results = sorted(clean_results, key=itemgetter('date'))
    props = ['date', 'numStories']
    return csv.stream_response(clean_results, props, filename)
def collection_source_story_split_historical_counts_csv(collection_id):
    results = _collection_source_story_split_historical_counts(collection_id)
    date_cols = None
    # TODO verify this
    for source in results:
        if date_cols is None:
            date_cols = sorted([s['date'] for s in source['splits_over_time']])
        for day in source['splits_over_time']:
            source[day['date']] = day['count']
        del source['splits_over_time']
    props = ['media_id', 'media_name', 'media_url', 'total_stories', 'splits_over_time'] + date_cols
    filename = "{} - source content count".format(collection_id)
    return csv.stream_response(results, props, filename)
def explorer_geo_csv():
    filename = u'sampled-geographic-coverage'
    data = request.form
    if 'searchId' in data:
        solr_q, solr_fq = parse_as_sample(data['searchId'], data['index'])
    else:
        query_object = json.loads(data['q'])
        solr_q, solr_fq = parse_query_with_keywords(query_object)
        filename = file_name_for_download(query_object['label'], filename)
    data = apicache.top_tags_with_coverage(solr_q, solr_fq, tags.GEO_TAG_SET)
    data['results'] = _filter_for_countries(data['results'])
    props = ['label', 'count', 'pct', 'alpha3', 'iso-a2', 'geonamesId', 'tags_id', 'tag']
    return csv.stream_response(data['results'], props, filename)
def stream_wordcount_csv(filename, q, fq, ngram_size=1):
    # use bigger values for CSV download
    num_words = WORD_COUNT_DOWNLOAD_LENGTH
    sample_size = WORD_COUNT_SAMPLE_SIZE
    word_counts = query_wordcount(q, fq, ngram_size, num_words, sample_size)
    for w in word_counts:
        w['sample_size'] = sample_size
        w['ratio'] = float(w['count']) / float(sample_size)
    props = ['term', 'stem', 'count', 'sample_size', 'ratio', 'google_w2v_x', 'google_w2v_y']
    return csv.stream_response(word_counts, props, filename)
def collection_source_story_split_historical_counts_csv(collection_id):
    results = _collection_source_story_split_historical_counts(collection_id)
    date_cols = None
    # TODO verify this
    for source in results:
        if date_cols is None:
            date_cols = sorted([s['date'] for s in source['splits_over_time']])
        for day in source['splits_over_time']:
            source[day['date']] = day['count']
        del source['splits_over_time']
    props = ['media_id', 'media_name', 'media_url', 'total_stories', 'splits_over_time'] + date_cols
    filename = "{} - source content count".format(collection_id)
    return csv.stream_response(results, props, filename)
def stream_geo_csv(fn, search_id_or_query, index):
    filename = ''
    # TODO: there is duplicate code here...
    SAMPLE_SEARCHES = load_sample_searches()
    try:
        search_id = int(search_id_or_query)
        if search_id >= 0:
            SAMPLE_SEARCHES = load_sample_searches()
            current_search = SAMPLE_SEARCHES[search_id]['queries']
            solr_query = parse_query_with_args_and_sample_search(search_id, current_search)
            if int(index) < len(current_search):
                start_date = current_search[int(index)]['startDate']
                end_date = current_search[int(index)]['endDate']
                filename = fn + current_search[int(index)]['q']
    except Exception as e:
        # so far, we will only be fielding one keyword csv query at a time, so we can use index of 0
        query = json.loads(search_id_or_query)
        current_query = query[0]
        solr_query = parse_query_with_keywords(current_query)
        filename = fn + current_query['q']
    res = cached_geotags(solr_query)
    res = [r for r in res if int(r['tag'].split('_')[1]) in COUNTRY_GEONAMES_ID_TO_APLHA3.keys()]
    for r in res:
        geonamesId = int(r['tag'].split('_')[1])
        if geonamesId not in COUNTRY_GEONAMES_ID_TO_APLHA3.keys():   # only include countries
            continue
        r['geonamesId'] = geonamesId
        r['alpha3'] = COUNTRY_GEONAMES_ID_TO_APLHA3[geonamesId]
        # WTF: why is the API returning this as a string and not a number?
        r['count'] = (float(r['count']) / float(tag_utl.GEO_SAMPLE_SIZE))
        for hq in HIGHCHARTS_KEYS:
            if hq['properties']['iso-a3'] == r['alpha3']:
                r['iso-a2'] = hq['properties']['iso-a2']
                r['value'] = r['count']
    props = ['label', 'count']
    return csv.stream_response(res, props, filename)
def api_metadata_download(collection_id):
    all_media = media_with_tag(user_mediacloud_key(), collection_id)
    metadata_counts = {}  # from tag_sets_id to info
    for media_source in all_media:
        for metadata_label, info in media_source['metadata'].items():
            if metadata_label not in metadata_counts:  # lazily populate counts
                metadata_counts[metadata_label] = {
                    'metadataCoverage': metadata_label,
                    'tagged': 0
                }
            if info is not None:
                metadata_counts[metadata_label]['tagged'] += 1
    for item_info in list(metadata_counts.values()):
        temp = len(all_media) - item_info['tagged']
        item_info.update({'notTagged': temp})
    props = ['metadataCoverage', 'tagged', 'notTagged']
    filename = "metadataCoverageForCollection" + collection_id + ".csv"
    return csv.stream_response(list(metadata_counts.values()), props, filename,
                               ['metadata category', 'sources with info', 'sources missing info'])
def topic_geo_tag_counts_csv(topics_id):
    tags = _geo_tag_counts(user_mediacloud_key(), topics_id)
    data = process_tags_for_coverage(topics_id, tags)
    return stream_response(tags, ['tags_id', 'tag', 'label', 'count', 'pct'],
                           "topic-{}-geo-tag-counts".format(topics_id))
def stream_geo_csv(user_mc_key, filename, item_id, which):
    info = {}
    info = cached_geotag_count(user_mc_key, which + ":" + str(item_id))
    props = ['label', 'count']
    return csv.stream_response(info, props, filename)
def topic_nyt_tag_counts_csv(topics_id):
    tags = _nyt_tag_counts(user_mediacloud_key(), topics_id)
    return stream_response(tags['entities'], ['tags_id', 'tag', 'label', 'count', 'pct'],
                           "topic-{}-nyt-label-counts".format(topics_id))
def stream_feed_csv(filename, media_id):
    response = cached_feed(media_id)
    props = ['name', 'type', 'url']
    return csv.stream_response(response, props, filename)
def story_entities_csv(stories_id):
    # in the download include all entity types
    entities = entities_from_mc_or_cliff(stories_id)
    props = ['type', 'name', 'frequency']
    return csv.stream_response(entities, props, 'story-' + str(stories_id) + '-entities')
def story_nyt_themes_csv(stories_id):
    results = nyt_themes_from_mc_or_labeller(stories_id)
    themes = results['descriptors600']
    props = ['label', 'score']
    return csv.stream_response(themes, props, 'story-' + str(stories_id) + '-nyt-themes')
def stream_topic_split_story_counts_csv(user_mc_key, filename, topics_id, **kwargs):
    results = apicache.topic_split_story_counts(user_mc_key, topics_id, **kwargs)
    clean_results = [{'date': trimSolrDate(item['date']), 'stories': item['count']}
                     for item in results['counts']]
    sorted_results = sorted(clean_results, key=itemgetter('date'))
    props = ['date', 'stories']
    return csv.stream_response(sorted_results, props, filename)
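# The split-story-count helpers above (stream_topic_split_story_counts_csv, stream_split_stories_csv,
# _stream_topic_split_story_counts_csv) all repeat the same clean/sort/stream steps, and stream_geo_csv
# even notes "there is duplicate code here". Below is a minimal sketch of one way that pattern could be
# consolidated; the helper name _stream_date_counts_csv and its count_column parameter are hypothetical
# and not part of the existing codebase. It assumes the same trim_solr_date and csv.stream_response
# utilities the functions above already rely on.
from operator import itemgetter  # same module-level import the existing helpers depend on


def _stream_date_counts_csv(counts, filename, count_column='stories'):
    # counts is assumed to be the list of {'date': ..., 'count': ...} dicts returned under
    # results['counts'] by the split-story-count API calls used above
    clean_results = [{'date': trim_solr_date(item['date']), count_column: item['count']} for item in counts]
    sorted_results = sorted(clean_results, key=itemgetter('date'))
    return csv.stream_response(sorted_results, ['date', count_column], filename)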