Example #1
def topic_compare_subtopic_top_words(topics_id):
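    # Compare the top words across every focus (subtopic) in the selected focal
    # set, then stitch the counts together and stream them as a CSV download.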
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    selected_focal_sets_id = request.args['focal_sets_id']
    word_count = int(request.args['word_count']) if 'word_count' in request.args else 20
    # first we need to figure out which timespan they are working on
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                      snapshots_id=snapshots_id)
    selected_timespan = None
    for t in selected_snapshot_timespans:
        if t['timespans_id'] == int(timespans_id):
            selected_timespan = t
    try:
        focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, selected_focal_sets_id)
    except ValueError:
        return json_error_response('Invalid Focal Set Id')
    timespans = apicache.matching_timespans_in_foci(topics_id, selected_timespan, focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_word_counts(user_mediacloud_key(), topics_id,
                                          timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['top_words'] = data
    # stitch together the counts to download now
    data = []
    headers = [f['name'] for f in focal_set['foci']]
    for idx in range(0, word_count):
        row = {f['name']: "{} ({})".format(f['top_words'][idx]['term'], f['top_words'][idx]['count'])
               for f in focal_set['foci']}
        data.append(row)
    return csv.stream_response(data, headers,
                               'topic-{}-subtopic-{}-{}-top-words-comparison'.format(
                                   topics_id, focal_set['name'], selected_focal_sets_id))
Example #2
def collection_update(collection_id):
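    # Update the collection's metadata, then diff the submitted source list
    # against its current members so only the changed tags are added or removed.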
    user_mc = user_admin_mediacloud_client()
    label = '{}'.format(request.form['name'])
    description = request.form['description']
    static = request.form['static'] if 'static' in request.form else None
    show_on_stories = request.form['showOnStories'] if 'showOnStories' in request.form else None
    show_on_media = request.form['showOnMedia'] if 'showOnMedia' in request.form else None

    formatted_name = format_name_from_label(label)

    source_ids = []
    if len(request.form['sources[]']) > 0:
        source_ids = [int(sid) for sid in request.form['sources[]'].split(',')]
    # first update the collection
    updated_collection = user_mc.updateTag(collection_id, formatted_name, label, description,
                                           is_static=(static == 'true'),
                                           show_on_stories=(show_on_stories == 'true'),
                                           show_on_media=(show_on_media == 'true'))
    # get the sources in the collection first, then remove and add as needed
    existing_source_ids = [int(m['media_id']) for m in media_with_tag(user_mediacloud_key(), collection_id)]
    source_ids_to_remove = list(set(existing_source_ids) - set(source_ids))
    source_ids_to_add = [sid for sid in source_ids if sid not in existing_source_ids]
    # logger.debug(existing_source_ids)
    # logger.debug(source_ids_to_add)
    # logger.debug(source_ids_to_remove)
    # then go through and tag all the sources specified with the new collection id
    tags_to_add = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_ADD) for sid in source_ids_to_add]
    tags_to_remove = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_REMOVE) for sid in source_ids_to_remove]
    tags = tags_to_add + tags_to_remove
    if len(tags) > 0:
        user_mc.tagMedia(tags)
        apicache.invalidate_collection_source_representation_cache(user_mediacloud_key(), collection_id)
    return jsonify(updated_collection['tag'])
Example #3
def get_top_themes_by_story_tag_counts(topics_id, num_themes):
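    # Rank the NYT-label theme tags for this topic by story count (over the
    # "overall" timespan) and annotate each with its share of all topic stories.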
    nyt_counts = []

    # get the overall timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id)
    overall_timespan = [t for t in timespans if t['period'] == "overall"]
    overall_timespan = next(iter(overall_timespan))
    timespan_query = "timespans_id:{}".format(overall_timespan['timespans_id'])

    # get the top themes by story count over the overall timespan
    top_nyt_tags = _cached_topic_tag_counts(user_mediacloud_key(), topics_id, NYT_LABELS_TAG_SET_ID,
                                            TAG_COUNT_SAMPLE_SIZE, timespan_query)
    # get the total stories for a topic
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']

    top_nyt_tags = top_nyt_tags[:num_themes]
    # for each theme, set up the requisite info for the UI
    for tag in top_nyt_tags:
        nyt_counts.append({
            'label': tag['label'],
            'geo_tag': tag['tag'],
            'tags_id': tag['tags_id'],
            'count': tag['count'],
            'pct': float(tag['count']) / float(total_stories),  # tag story count / total topic story count
        })

    return nyt_counts
Example #4
def topic_words(topics_id):
    sample_size = request.args['sample_size'] if 'sample_size' in request.args else WORD_COUNT_SAMPLE_SIZE

    if access_public_topic(topics_id):
        results = apicache.topic_word_counts(TOOL_API_KEY, topics_id, sample_size=sample_size,
                                             snapshots_id=None, timespans_id=None, foci_id=None, q=None)
    elif is_user_logged_in():
        # grab the top words, respecting all the filters
        results = apicache.topic_word_counts(user_mediacloud_key(), topics_id, sample_size=sample_size)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})

    totals = []  # important so that these get reset on the client when they aren't requested
    logger.debug(request.args)
    if (is_user_logged_in()) and ('withTotals' in request.args) and (request.args['withTotals'] == "true"):
        # return along with the results for the overall timespan, to facilitate comparison
        snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
        overall_timespan = _find_overall_timespan(topics_id, snapshots_id)
        totals = apicache.topic_word_counts(user_mediacloud_key(), topics_id, sample_size=sample_size,
                                            timespans_id=overall_timespan['timespans_id'], foci_id=None, q=None)

    response = {
        'list': results[:WORD_COUNT_UI_NUM_WORDS],
        'totals': totals[:WORD_COUNT_UI_NUM_WORDS],
        'sample_size': str(sample_size)
    }
    return jsonify(response)
Example #5
def get_top_countries_by_story_tag_counts(topics_id, num_countries):
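    # Same pattern as the themes endpoint above: rank geo tags by story count over
    # the "overall" timespan, keeping only tags whose geonames id maps to a country.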
    tag_country_counts = []

    # get the total stories for a topic
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']

    # get the top countries by story tag count over the overall timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id)
    overall_timespan = [t for t in timespans if t['period'] == "overall"]
    overall_timespan = next(iter(overall_timespan))
    timespan_query = "timespans_id:{}".format(overall_timespan['timespans_id'])
    top_geo_tags = _cached_topic_tag_counts(user_mediacloud_key(), topics_id, GEO_TAG_SET, GEO_SAMPLE_SIZE, timespan_query)
    
    # make sure the geo tag is in the geo_tags whitelist (is a country)
    country_tag_counts = [r for r in top_geo_tags if
                          int(r['tag'].split('_')[1]) in list(COUNTRY_GEONAMES_ID_TO_APLHA3.keys())]
    country_tag_counts = country_tag_counts[:num_countries]

    # for each country, set up the requisite info for UI
    for tag in country_tag_counts:
        tag_country_counts.append({
            'label': tag['label'],
            'geo_tag': tag['tag'],
            'tags_id': tag['tags_id'],
            'count': tag['count'],
            'pct': float(tag['count']) / float(total_stories),  # tag story count / total topic story count
        })
    return tag_country_counts
Example #6
def media_type_coverage(topics_id):
    media_type_tags = tags_in_tag_set(TOOL_API_KEY, TAG_SETS_ID_MEDIA_TYPE)
    # grab the total stories
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # count the stories in any media tagged with a media-type tag
    tags_ids = " ".join(str(tag['tags_id']) for tag in media_type_tags)
    query_clause = "tags_id_media:({})".format(tags_ids)
    tagged_story_count = topic_story_count(user_mediacloud_key(), topics_id, q=query_clause)['count']
    return jsonify({'counts': {'count': tagged_story_count, 'total': total_stories}})
Example #7
def _public_safe_topic_story_count(topics_id, q):
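    # Pick the right API key for the access level: the shared tool key for
    # public topics, the user's own key when logged in; otherwise refuse.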
    if access_public_topic(topics_id):
        total = apicache.topic_story_count(TOOL_API_KEY, topics_id, q=apicache.add_to_user_query(None))
        # force a count with just the query
        matching = apicache.topic_story_count(TOOL_API_KEY, topics_id, q=apicache.add_to_user_query(q))
    elif is_user_logged_in():
        total = apicache.topic_story_count(user_mediacloud_key(), topics_id, q=apicache.add_to_user_query(None))
        # force a count with just the query
        matching = apicache.topic_story_count(user_mediacloud_key(), topics_id, q=apicache.add_to_user_query(q))
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    return jsonify({'counts': {'count': matching['count'], 'total': total['count']}})
Example #8
def api_collection_details(collection_id):
    add_in_sources = False
    if ('getSources' in request.args) and (request.args['getSources'] == 'true'):
        add_in_sources = True

    user_mc = user_mediacloud_client()
    info = user_mc.tag(collection_id)
    add_user_favorite_flag_to_collections([info])
    info['id'] = collection_id
    info['tag_set'] = _tag_set_info(user_mediacloud_key(), info['tag_sets_id'])
    if add_in_sources:
        media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
        info['sources'] = media_in_collection
    analytics_db.increment_count(analytics_db.TYPE_COLLECTION, collection_id, analytics_db.ACTION_SOURCE_MGR_VIEW)
    return jsonify({'results': info})
Example #9
def api_collection_source_representation_csv(collection_id):
    user_mc = user_mediacloud_client()
    info = user_mc.tag(collection_id)
    source_representation = apicache.collection_source_representation(user_mediacloud_key(), collection_id)
    props = ['media_id', 'media_name', 'media_url', 'stories', 'sample_size', 'story_pct']
    filename = info['label'] + "-source sentence counts.csv"
    return csv.stream_response(source_representation, props, filename)
Example #10
def collection_wordcount_csv(collection_id):
    solr_q = 'tags_id_media:' + str(collection_id)
    solr_fq = None
    # add in the publish_date clause if there is one
    if ('q' in request.args) and (len(request.args['q']) > 0):
        solr_fq = request.args['q']
    return stream_wordcount_csv(user_mediacloud_key(), 'wordcounts-Collection-' + collection_id, solr_q, solr_fq)
Example #11
def api_collection_sources_csv(collection_id):
    user_mc = user_mediacloud_client()
    collection = user_mc.tag(collection_id)    # not cached because props can change often
    all_media = media_with_tag(user_mediacloud_key(), collection_id)
    file_prefix = "Collection {} ({}) - sources ".format(collection_id, collection['tag'])
    properties_to_include = SOURCE_LIST_CSV_EDIT_PROPS
    return csv.download_media_csv(all_media, file_prefix, properties_to_include)
Example #12
def api_metadata_values(tag_sets_id):
    '''
    Source metadata is encoded in various tag sets - this returns the set and the list of
    available tags you can use
    '''
    data = tags_in_tag_set(user_mediacloud_key(), tag_sets_id, False, True)  # use the file-based cache here
    return jsonify(data)
Example #13
def _find_overall_timespan(topics_id, snapshots_id):
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                      snapshots_id=snapshots_id)
    for timespan in selected_snapshot_timespans:
        if timespan['period'] == 'overall':
            return timespan
    raise RuntimeError('Missing overall timespan in snapshot {} (topic {})!'.format(snapshots_id, topics_id))
Example #14
def media(topics_id, media_id):
    user_mc = user_admin_mediacloud_client()
    combined_media_info = apicache.topic_media_list(user_mediacloud_key(), topics_id, media_id=media_id)['media'][0]
    media_info = user_mc.media(media_id)
    for key in list(media_info.keys()):
        if key not in list(combined_media_info.keys()):
            combined_media_info[key] = media_info[key]
    return jsonify(combined_media_info)
Example #15
def media_words_csv(topics_id, media_id):
    query = apicache.add_to_user_query('media_id:'+media_id)
    ngram_size = int(request.args['ngram_size']) if 'ngram_size' in request.args else 1  # default to word count
    word_counts = apicache.topic_ngram_counts(user_mediacloud_key(), topics_id, ngram_size=ngram_size, q=query,
                                              num_words=WORD_COUNT_DOWNLOAD_NUM_WORDS,
                                              sample_size=WORD_COUNT_DOWNLOAD_SAMPLE_SIZE)
    return csv.stream_response(word_counts, apicache.WORD_COUNT_DOWNLOAD_COLUMNS,
                               'topic-{}-media-{}-sampled-ngrams-{}-word'.format(topics_id, media_id, ngram_size))
Example #16
def media_type_story_counts(topics_id):
    tag_story_counts = []
    media_type_tags = tags_in_tag_set(TOOL_API_KEY, TAG_SETS_ID_MEDIA_TYPE)
    # grab the total stories
    total_stories = topic_story_count(user_mediacloud_key(), topics_id)['count']
    # make a count for each tag based on media_id
    for tag in media_type_tags:
        query_clause = "tags_id_media:{}".format(tag['tags_id'])
        tagged_story_count = topic_story_count(user_mediacloud_key(), topics_id, q=query_clause)['count']
        tag_story_counts.append({
            'label': tag['label'],
            'tags_id': tag['tags_id'],
            'count': tagged_story_count,
            'pct': float(tagged_story_count)/float(total_stories)
        })

    return jsonify({'story_counts': tag_story_counts})
Example #17
def _collection_source_story_split_historical_counts(collection_id):
    media_list = media_with_tag(user_mediacloud_key(), collection_id)
    jobs = [{'media': m} for m in media_list]
    # fetch in parallel to make things faster
    pool = Pool(processes=HISTORICAL_COUNT_POOL_SIZE)
    results = pool.map(_source_story_split_count_worker, jobs)  # blocks until they are all done
    pool.terminate()  # extra safe garbage collection
    return results
Example #18
def topic_media(topics_id):
    if access_public_topic(topics_id):
        media_list = topic_media_list(TOOL_API_KEY, topics_id, snapshots_id=None, timespans_id=None, foci_id=None, sort=None, limit=None, link_id=None)
    elif is_user_logged_in():
        media_list = topic_media_list(user_mediacloud_key(), topics_id)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})

    return jsonify(media_list)
Example #19
def cached_media_tags(tag_sets_id):
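    # For each tag in the set, cache the list of tagged media and precompute a
    # "media_id:(...)" query string callers can use to filter stories to those sources.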
    partisanship_tags = cached_tags_in_tag_set(tag_sets_id)
    for tag in partisanship_tags:
        media = media_with_tag(user_mediacloud_key(), tag['tags_id'], True)  # cache this list
        media_ids = [str(m['media_id']) for m in media]  # as strings so we can join them into a query string below
        tag['media'] = media
        tag['media_ids'] = media_ids
        tag['media_query'] = "media_id:({})".format(" ".join(media_ids))
    return partisanship_tags
Example #21
def media_source_words(media_id):
    solr_q = 'media_id:'+str(media_id)
    solr_fq = None
    if ('q' in request.args) and (len(request.args['q']) > 0):
        solr_fq = request.args['q']
    info = {
        'wordcounts': word_count(user_mediacloud_key(), solr_q, solr_fq)
    }
    return jsonify({'results': info})
Example #22
def remove_sources_from_collection(collection_id):
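    # Remove the given sources from the collection; the remaining members are
    # re-tagged with TAG_ACTION_ADD, which should be a no-op for sources that
    # already carry the tag.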
    source_ids_to_remove = request.form['sources[]'].split(',')
    source_ids_to_remove = [int(s) for s in source_ids_to_remove]
    user_mc = user_admin_mediacloud_client()
    # get the sources in the collection first, then remove and add as needed
    existing_source_ids = [int(m['media_id']) for m in media_with_tag(user_mediacloud_key(), collection_id)]
    source_ids_to_remain = list(set(existing_source_ids) - set(source_ids_to_remove))

    media_to_remove = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_REMOVE) for sid in source_ids_to_remove]
    media_to_remain = [MediaTag(sid, tags_id=collection_id, action=TAG_ACTION_ADD) for sid in
                       source_ids_to_remain]
    current_media = media_to_remove + media_to_remain

    results = {}
    if len(current_media) > 0:
        results = user_mc.tagMedia(current_media)

    apicache.invalidate_collection_source_representation_cache(user_mediacloud_key(), collection_id)
    return jsonify(results)
Example #23
def topic_split_story_count(topics_id):
    if access_public_topic(topics_id):
        results = apicache.topic_split_story_counts(TOOL_API_KEY, topics_id, snapshots_id=None, timespans_id=None,
                                                    foci_id=None, q=None)
    elif is_user_logged_in():
        results = apicache.topic_split_story_counts(user_mediacloud_key(), topics_id)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})

    return jsonify({'results': results})
Example #25
def media(topics_id, media_id):
    user_mc = user_admin_mediacloud_client()
    combined_media_info = apicache.topic_media_list(
        user_mediacloud_key(), topics_id, media_id=media_id)['media'][0]
    media_info = user_mc.media(media_id)
    for key in media_info.keys():
        if key not in combined_media_info.keys():
            combined_media_info[key] = media_info[key]
    return jsonify(combined_media_info)
Example #26
def api_metadata_values(tag_sets_id):
    '''
    Source metadata is encoded in various tag sets - this returns the set and the list of
    available tags you can use
    '''
    data = tags_in_tag_set(user_mediacloud_key(), tag_sets_id, False,
                           True)  # use the file-based cache here
    data['short_list'] = get_metadata_defaults(tag_sets_id)
    return jsonify(data)
Example #28
def api_collection_sources(collection_id):
    int(collection_id)  # will raise a ValueError if the id isn't numeric
    results = {
        'tags_id': collection_id
    }
    media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
    add_user_favorite_flag_to_sources(media_in_collection)
    results['sources'] = media_in_collection
    return jsonify(results)
Example #29
def _cached_tag_coverage_pct(query, tag_sets_id):
    user_mc = user_mediacloud_client()
    story_count = source_story_count(user_mediacloud_key(), query)
    tagged_story_counts = user_mc.storyTagCount(solr_query=query, tag_sets_id=tag_sets_id)
    # sum tagged articles because there are different versions
    tagged_sum = sum([tag_info['count'] for tag_info in tagged_story_counts])
    # compute coverage ratio (protect against div by zero)
    ratio = float(tagged_sum) / float(story_count) if story_count > 0 else 0
    return ratio
Example #31
def media_stories(topics_id, media_id):
    sort = validated_sort(request.args.get('sort'))
    limit = request.args.get('limit')
    stories = apicache.topic_story_list(user_mediacloud_key(),
                                        topics_id,
                                        media_id=media_id,
                                        sort=sort,
                                        limit=limit)
    return jsonify(stories)
Example #32
def topic_timespan_list(topics_id, snapshots_id):
    ignored_snapshots_id, timespans_id, foci_id, q = filters_from_args(
        request.args)
    timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(),
                                                    topics_id, snapshots_id,
                                                    foci_id)
    # add the focal_set type to the timespan so we can use that in the client (i.e. decide what to show or not
    # based on what type of focal_set this timespan is part of)
    focal_sets = apicache.topic_focal_sets_list(user_mediacloud_key(),
                                                topics_id, snapshots_id)
    for t in timespans:
        for fs in focal_sets:
            for f in fs['foci']:
                if f['foci_id'] == t['foci_id']:
                    t['focal_set'] = fs
                    t['focus'] = f
                    break
    return jsonify({'list': timespans})
Example #33
def api_collection_set(tag_sets_id):
    '''
    Return a list of all the (public only or public and private, depending on user role) collections in a tag set.  Not cached because this can change, and load time isn't terrible.
    :param tag_sets_id: the tag set to query for public collections
    :return: dict of info and list of collections in it
    '''
    if user_has_auth_role(ROLE_MEDIA_EDIT):
        info = apicache.tag_set_with_private_collections(
            user_mediacloud_key(), tag_sets_id)
    else:
        info = apicache.tag_set_with_public_collections(
            user_mediacloud_key(), tag_sets_id)

    add_user_favorite_flag_to_collections(info['tags'])
    # rename to make more sense here
    info['collections'] = info['tags']
    del info['tags']
    return jsonify(info)
Example #35
def collection_source_split_stories(collection_id):
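    # Return story counts for the whole collection plus a "partial" series that,
    # when separate_spidered is requested, excludes stories tagged 8875452
    # (presumably the spidered-story tag).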
    collections_query = "tags_id_media:{}".format(collection_id)
    exclude_spidered_stories = " tags_id_media:{} AND NOT tags_id_stories:{}".format(str(collection_id), 8875452)\
        if 'separate_spidered' in request.args else collections_query
    interval = 'day'  # default, and not currently passed to the calls above

    all_results = apicache.last_year_split_story_count(user_mediacloud_key(), collections_query)
    # identical to all_results when separate_spidered wasn't requested
    non_spidered_results = apicache.last_year_split_story_count(user_mediacloud_key(), exclude_spidered_stories)

    all_stories = {
        'total_story_count': all_results['total_story_count'],
        'list': all_results['counts'],
    }
    partial_stories = {
        'total_story_count': non_spidered_results['total_story_count'],
        'list': non_spidered_results['counts'],
    }
    return jsonify({'results': {'all_stories': all_stories, 'partial_stories': partial_stories, 'interval': interval}})
Example #36
def api_collection_set(tag_sets_id):
    """
    Return a list of all the (public only or public and private, depending on user role) collections in a tag set.
    Not cached because this can change, and load time isn't terrible.
    :param tag_sets_id: the tag set to query for public collections
    :return: dict of info and list of collections in it
    """
    if user_has_auth_role(ROLE_MEDIA_EDIT):
        info = apicache.tag_set_with_private_collections(user_mediacloud_key(), tag_sets_id)
    else:
        info = apicache.tag_set_with_public_collections(user_mediacloud_key(), tag_sets_id)

    add_user_favorite_flag_to_collections(info['tags'])
    # rename to make more sense here
    for t in info['tags']:
        t['sort_key'] = t['label'] if t['label'] else t['tag']
    info['collections'] = sorted(info['tags'], key=itemgetter('sort_key'))
    del info['tags']
    return jsonify(info)
Example #37
def api_collections_by_ids():
    collection_ids = request.args['coll[]'].split(',')
    sources_list = []
    for tags_id in collection_ids:
        all_media = media_with_tag(user_mediacloud_key(), tags_id)
        info = [{'media_id': m['media_id'], 'name': m['name'], 'url': m['url'], 'public_notes': m['public_notes']} for m
                in all_media]
        add_user_favorite_flag_to_sources(info)
        sources_list += info
    return jsonify({'results': sources_list})
Example #38
def topic_stories(topics_id):
    if access_public_topic(topics_id):
        stories = topic_story_list(TOOL_API_KEY, topics_id, snapshots_id=None, timespans_id=None, foci_id=None, q=None)
    elif is_user_logged_in():
        stories = topic_story_list(user_mediacloud_key(), topics_id)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})

    return jsonify(stories)
Example #39
def topic_stories_csv(topics_id):
    as_attachment = True
    fb_data = False
    if 'attach' in request.args:
        as_attachment = int(request.args['attach']) == 1
    if 'fbData' in request.args:
        fb_data = int(request.args['fbData']) == 1
    user_mc = user_admin_mediacloud_client()
    topic = user_mc.topic(topics_id)
    return stream_story_list_csv(user_mediacloud_key(), topic['name']+'-stories', topics_id, as_attachment=as_attachment, fb_data=fb_data)
Example #41
def retweet_partisanship_story_counts(topics_id):
    # TODO: add in overall timespan id here so it works in different snapshots
    tag_story_counts = []
    partisanship_tags = _cached_media_tags(
        TAG_SETS_ID_RETWEET_PARTISANSHIP_2016)
    # grab the total stories
    try:
        total_stories = topic_story_count(user_mediacloud_key(),
                                          topics_id)['count']
    except mediacloud.error.MCException:
        total_stories = 0
    # make a count for each tag
    for tag in partisanship_tags:
        try:
            tagged_story_count = topic_story_count(user_mediacloud_key(),
                                                   topics_id,
                                                   q=tag['query'])['count']
            pct = float(tagged_story_count) / float(total_stories)
        except (ZeroDivisionError, mediacloud.error.MCException):
            tagged_story_count = 0
            pct = 0
        tag_story_counts.append({
            'label': tag['label'],
            'tags_id': tag['tags_id'],
            'count': tagged_story_count,
            'pct': pct
        })
    # order them the way a person would expect (left to center to right)
    ordered_tag_story_counts = [
        [t for t in tag_story_counts if t['tags_id'] == tags_id][0]
        for tags_id in (9360520, 9360521, 9360522, 9360523, 9360524)
    ]
    return jsonify({'story_counts': ordered_tag_story_counts})
Example #42
def base_snapshot_timespan(topics_id):
    # find the timespan matching this one in the base snapshot (i.e. with no foci_id)
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    base_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                  snapshots_id=snapshots_id, foci_id=None)
    timespan = apicache.topic_timespan(topics_id, snapshots_id, foci_id, timespans_id)  # the selected timespan
    for t in base_snapshot_timespans:
        if apicache.is_timespans_match(timespan, t):
            return t
    raise ValueError("Can't find a timespan in the base snapshot matching the one specified")
Example #43
def topic_media(topics_id):
    if access_public_topic(topics_id):
        media_list = apicache.topic_media_list(TOOL_API_KEY, topics_id, snapshots_id=None, timespans_id=None,
                                               foci_id=None, sort=None, limit=None, link_id=None)
    elif is_user_logged_in():
        media_list = apicache.topic_media_list(user_mediacloud_key(), topics_id)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})

    return jsonify(media_list)
Example #46
def story_counts(topics_id):
    if access_public_topic(topics_id):
        local_key = TOOL_API_KEY
    elif is_user_logged_in():
        local_key = user_mediacloud_key()
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    total = topic_story_count(local_key, topics_id, timespans_id=None, q=None)
    filtered = topic_story_count(local_key, topics_id)  # force a count with just the query
    return jsonify({'counts': {'count': filtered['count'], 'total': total['count']}})
Example #47
def url_sharing_focal_set(topics_id, snapshots_id):
    """
    Return the focal_set that is marked as the auto-generated "URL Sharing" one.
    :param topics_id:
    :param snapshots_id:
    :return: a focal set, or None if the topic doesn't have one
    """
    focal_sets = topic_focal_sets_list(user_mediacloud_key(), topics_id, snapshots_id)
    url_sharing_focal_sets = [fs for fs in focal_sets if is_url_sharing_focal_set(fs)]
    return url_sharing_focal_sets[0] if len(url_sharing_focal_sets) > 0 else None
Example #48
def story_counts(topics_id):
    if access_public_topic(topics_id):
        local_key = TOOL_API_KEY
    elif is_user_logged_in():
        local_key = user_mediacloud_key()
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
    total = apicache.topic_story_count(local_key, topics_id, timespans_id=None, snapshots_id=None, q=None, foci_id=None)
    filtered = apicache.topic_story_count(local_key, topics_id)
    return jsonify({'counts': {'count': filtered['count'], 'total': total['count']}})
Example #49
def api_collection_sources_csv(collection_id):
    user_mc = user_admin_mediacloud_client()
    all_media = media_with_tag(user_mediacloud_key(), collection_id)
    for src in all_media:
        for tag in src['media_source_tags']:
            if is_metadata_tag_set(tag['tag_sets_id']):
                format_metadata_fields(src, tag['tag_sets_id'], tag['tag'])
    file_prefix = "Collection_Sourcelist_Template_for_" + collection_id + "_"
    what_type_download = COLLECTIONS_TEMPLATE_PROPS_EDIT
    return csv.download_media_csv(all_media, file_prefix, what_type_download)
Example #50
def api_media_source_split_stories(media_id):
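    # Same pattern as the collection version: the "partial" series excludes
    # stories tagged 8875452 (presumably the spidered-story tag) when
    # separate_spidered is requested.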
    media_query = 'media_id:' + str(media_id)
    exclude_spidered_stories = " media_id:{} AND NOT tags_id_stories:{}".format(str(media_id), 8875452) if 'separate_spidered' in request.args else media_query

    health = _cached_media_source_health(user_mediacloud_key(), media_id)

    all_results = apicache.last_year_split_story_count(user_mediacloud_key(), media_query)
    # identical to all_results when separate_spidered wasn't requested
    non_spidered_results = apicache.last_year_split_story_count(user_mediacloud_key(), exclude_spidered_stories)

    all_stories = {
        'total_story_count': all_results['total_story_count'],
        'health': health,
        'list': all_results['counts'],
    }
    partial_stories = {
        'total_story_count': non_spidered_results['total_story_count'],
        'health': health,
        'list': non_spidered_results['counts'],
    }
    return jsonify({'results': {'all_stories': all_stories, 'partial_stories': partial_stories}})
Example #51
def topic_media_csv(topics_id):
    sort = validated_sort(request.args.get('sort'))
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    return _stream_media_list_csv(user_mediacloud_key(),
                                  'media-for-topic-' + topics_id,
                                  topics_id,
                                  sort=sort,
                                  snapshots_id=snapshots_id,
                                  timespans_id=timespans_id,
                                  foci_id=foci_id,
                                  q=q)
Example #52
def collection_source_split_stories(collection_id):
    q = "tags_id_media:{}".format(collection_id)
    results = apicache.last_year_split_story_count(user_mediacloud_key(), q)
    interval = 'day'  # default, and not currently passed to the calls above
    return jsonify({
        'results': {
            'list': results['counts'],
            'total_story_count': results['total_story_count'],
            'interval': interval
        }
    })
Example #53
def topic_focal_set_list(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    include_story_counts = request.args.get('includeStoryCounts')
    focal_sets = apicache.topic_focal_sets_list(user_mediacloud_key(),
                                                topics_id, snapshots_id)
    # now mark the ones that are the magically added URL sharing platform ones
    for fs in focal_sets:
        fs['is_url_sharing'] = is_url_sharing_focal_set(fs)
    if include_story_counts == '1':
        _add_story_counts_to_foci(topics_id, focal_sets)
    return jsonify(focal_sets)
Example #54
def _generate_network_of_frames(topics_id, timespans_id, num_of_sources, out_name, top_media_sort,
                                remove_media_list=None, remove_word_list=None, generate_word_lists=False,
                                include_media_list=None, media_attribs=None, num_words=None):
    if remove_media_list is None:
        remove_media_list = []
    if remove_word_list is None:
        remove_word_list = []
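    # Build a network linking each top media source in the timespan to its top
    # words, export it as GEXF and D3-friendly JSON (plus optional word list
    # files), and return the GEXF document as a string.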

    # media_attribs, when provided, names attributes on each media source that
    # should be copied onto its node in the network
    if include_media_list is None:
        media_sources_md = topic_media_list(user_mediacloud_key(), topics_id, timespans_id=timespans_id,
                                 limit=num_of_sources + len(remove_media_list), sort=top_media_sort)['media']
    else:
        media_sources_md = include_media_list

    for r in remove_media_list:
        media_sources_md = _remove_media_source(r, media_sources_md)

    top_words = _build_top_words(media_sources_md, topics_id, timespans_id, remove_word_list, num_words)
    top_words = _clean_top_words(top_words, remove_word_list)

    frame_network = _build_network(top_words, media_sources_md, media_attribs)

    _export_gexf_network(frame_network, '%s.gexf' % out_name)
    _export_d3_network(frame_network, '%s' % out_name)
    
    if generate_word_lists:
        with open('%s.txt' % out_name, 'w', encoding="utf-8") as wl:
            all_words = []
            media_sources = {ms['media_id']: ms['name'] for ms in media_sources_md}
            for ms in top_words:
                wl.write("\n\n{} (media id: {}):\n".format(media_sources[ms], ms))
                for w in top_words[ms]:
                    all_words.append(w['term'])
                    wl.write("- {} ({})\n".format(w['term'], w['count']))
                wl.write("\n")
    
    s = "\n".join(nx.generate_gexf(frame_network))  # join the gexf lines into one document string
    return s
Example #55
def topic_tag_coverage(topics_id, tags_id):
    '''
    Useful for seeing how many stories in the topic are tagged with a specific tag
    '''
    if isinstance(tags_id, list):   # doesn't respect duck-typing, but a quick fix
        tags_id_str = "({})".format(" ".join([str(tid) for tid in tags_id]))
    else:
        tags_id_str = str(tags_id)
    # respect any query filter the user has set
    query_with_tag = add_to_user_query("tags_id_stories:{}".format(tags_id_str))
    # now get the counts
    if access_public_topic(topics_id):
        total = topic_story_count(TOOL_API_KEY, topics_id)
        tagged = topic_story_count(TOOL_API_KEY, topics_id, q=query_with_tag)  # force a count with just the query
    elif is_user_logged_in():
        total = topic_story_count(user_mediacloud_key(), topics_id)
        tagged = topic_story_count(user_mediacloud_key(), topics_id, q=query_with_tag)   # force a count with just the query
    else:
        return None
    return {'counts': {'count': tagged['count'], 'total': total['count']}}
Example #56
def topic_focal_set_split_stories_compare(topics_id, focal_sets_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # need the timespan info, to find the appropriate timespan with each focus
    try:
        base_timespan = base_snapshot_timespan(topics_id)
        focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id,
                                             snapshots_id, focal_sets_id)
    except ValueError as e:
        return json_error_response(str(e))
    # collect the story split counts for each focus
    timespans = apicache.matching_timespans_in_foci(topics_id, base_timespan,
                                                    focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_split_story_counts(
            user_mediacloud_key(),
            topics_id,
            snapshots_id=snapshots_id,
            timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['split_story_counts'] = data
    return jsonify(focal_set)
Example #58
def topic_tag_coverage(topics_id, tags_id):
    '''
    Useful for seeing how many stories in the topic are tagged with a specific tag
    '''
    # respect any query filter the user has set
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    query_with_tag = add_to_user_query("tags_id_stories:{}".format(tags_id))
    # now get the counts
    if access_public_topic(topics_id):
        total = topic_story_count(TOOL_API_KEY, topics_id)
        tagged = topic_story_count(
            TOOL_API_KEY, topics_id,
            q=query_with_tag)  # force a count with just the query
    elif is_user_logged_in():
        total = topic_story_count(user_mediacloud_key(), topics_id)
        tagged = topic_story_count(
            user_mediacloud_key(), topics_id,
            q=query_with_tag)  # force a count with just the query
    else:
        return None
    return {'counts': {'count': tagged['count'], 'total': total['count']}}
Example #59
def _source_story_split_count_job(info):
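    # Worker: fetch split (over-time) story counts for one source (the 360 is
    # presumably a look-back window in days) and package them for the UI.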
    source = info['media']
    q = "media_id:{}".format(source['media_id'])
    split_stories = apicache.split_story_count(user_mediacloud_key(), q, 360)
    source_data = {
        'media_id': source['media_id'],
        'media_name': source['name'],
        'media_url': source['url'],
        'total_story_count': split_stories['total_story_count'],
        'splits_over_time': split_stories['counts'],
    }
    return source_data
Example #60
def get_topic_story_links_csv(topics_id):
    user_mc = user_mediacloud_client()
    topic = user_mc.topic(topics_id)
    # page through results for the timespan
    props = [
        'stories_id', 'publish_date', 'title', 'url', 'language', 'ap_syndicated',
        'themes', 'subtopics',
        'inlink_count', 'facebook_share_count', 'outlink_count', 'media_inlink_count',
        'media_id', 'media_name', 'media_url',
        # 'media_pub_country', 'media_pub_state', 'media_language', 'media_about_country', 'media_media_type'
    ]
    return stream_story_link_list_csv(user_mediacloud_key(), topic['name'] + '-stories', topics_id)