Example #1
def topic_compare_subtopic_top_words(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    selected_focal_sets_id = request.args['focal_sets_id']
    word_count = int(request.args['word_count']) if 'word_count' in request.args else 20
    # first we need to figure out which timespan they are working on
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id, snapshots_id=snapshots_id)
    selected_timespan = None
    for t in selected_snapshot_timespans:
        if t['timespans_id'] == int(timespans_id):
            selected_timespan = t
    focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, selected_focal_sets_id)
    timespans = apicache.matching_timespans_in_foci(topics_id, selected_timespan, focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_word_counts(user_mediacloud_key(), topics_id,
                                          timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['top_words'] = data
    # stitch together the counts to download now
    data = []
    headers = [f['name'] for f in focal_set['foci']]
    for idx in range(0, word_count):
        row = {f['name']: u"{} ({})".format(f['top_words'][idx]['term'], f['top_words'][idx]['count'])
               for f in focal_set['foci']}
        data.append(row)
    return csv.stream_response(data, headers,
                               'topic-{}-subtopic-{}-{}-top-words-comparison'.format(
                                   topics_id, focal_set['name'], selected_focal_sets_id))
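The helper filters_from_args appears in nearly every example here but is never shown. A minimal sketch of what it presumably does, assuming the standard topic filter query parameters (the exact parameter names are an assumption):

def filters_from_args(args):
    # Hypothetical sketch: pull the four standard topic filters out of a
    # Flask request.args mapping, returning None for any that are absent.
    return (args.get('snapshotId'), args.get('timespanId'),
            args.get('focusId'), args.get('q'))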
Example #2
def map_files(topics_id):
    files = { 
        'wordMap': 'unsupported',
        'linkMap': 'not rendered'
    }

    if access_public_topic(topics_id) or is_user_logged_in():
        snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
        map_type = MAP_TYPES[0]  # no linkMaps yet
        prefix = _get_file_prefix(map_type, topics_id, timespans_id)
        lock_filename = prefix+".lock"
        rendered_filename = prefix+".gexf"
        # check if rendered file is there
        is_rendered = os.path.isfile(os.path.join(DATA_DIR, rendered_filename))
        # logger.warn(os.path.join(DATA_DIR,rendered_filename))
        # logger.warn(is_rendered)
        if is_rendered:
            status = 'rendered'
        else:
            lockfile_path = os.path.join(DATA_DIR, lock_filename)
            is_generating = os.path.isfile(lockfile_path)
            if not is_generating:
                status = 'starting'
                _start_generating_map_file(map_type, topics_id, timespans_id)
            else:
                status = 'generating'
        files[map_type] = status
        return jsonify(files)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
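The lock-file check above is a simple guard against kicking off duplicate render jobs. A sketch of how _start_generating_map_file might create the lock before queueing the work (the file naming and job hand-off are assumptions, not the project's actual implementation):

import os

def _start_generating_map_file(map_type, topics_id, timespans_id):
    # Hypothetical sketch: touch the .lock file first so concurrent requests
    # see 'generating', then hand the real rendering work to a background job.
    prefix = _get_file_prefix(map_type, topics_id, timespans_id)
    lock_path = os.path.join(DATA_DIR, prefix + ".lock")
    open(lock_path, 'a').close()  # create the lock marker
    # ...queue the actual GEXF rendering job here...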
Example #3
def topic_compare_subtopic_top_words(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    selected_focal_sets_id = request.args['focal_sets_id']
    word_count = int(request.args['word_count']) if 'word_count' in request.args else 20
    # first we need to figure out which timespan they are working on
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                      snapshots_id=snapshots_id)
    selected_timespan = None
    for t in selected_snapshot_timespans:
        if t['timespans_id'] == int(timespans_id):
            selected_timespan = t
    try:
        focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, selected_focal_sets_id)
    except ValueError:
        return json_error_response('Invalid Focal Set Id')
    timespans = apicache.matching_timespans_in_foci(topics_id, selected_timespan, focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_word_counts(user_mediacloud_key(), topics_id,
                                          timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['top_words'] = data
    # stitch together the counts to download now
    data = []
    headers = [f['name'] for f in focal_set['foci']]
    for idx in range(0, word_count):
        row = {f['name']: "{} ({})".format(f['top_words'][idx]['term'], f['top_words'][idx]['count'])
               for f in focal_set['foci']}
        data.append(row)
    return csv.stream_response(data, headers,
                               'topic-{}-subtopic-{}-{}-top-words-comparison'.format(
                                   topics_id, focal_set['name'], selected_focal_sets_id))
Example #4
def topic_split_story_counts(user_mc_key, topics_id, **kwargs):
    '''
    Return split story counts over time, based on filters.
    '''
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    timespan = topic_timespan(topics_id, snapshots_id, foci_id, timespans_id)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q,
        'fq': timespan['fq']
    }
    merged_args.update(
        kwargs
    )  # passed-in args override anything pulled from the request.args
    # and make sure to ignore undateable stories
    undateable_query_part = "-(tags_id_stories:{})".format(
        STORY_UNDATEABLE_TAG)  # doesn't work if the query includes parens!!!
    if (merged_args['q'] is not None) and (len(merged_args['q']) > 0):
        merged_args['q'] = "(({}) AND {})".format(merged_args['q'],
                                                  undateable_query_part)
    else:
        merged_args['q'] = "* AND {}".format(undateable_query_part)
    results = _cached_topic_split_story_counts(user_mc_key, topics_id,
                                               **merged_args)
    results['counts'] = add_missing_dates_to_split_story_counts(
        results['counts'],
        datetime.strptime(timespan['start_date'],
                          mc.SENTENCE_PUBLISH_DATE_FORMAT),
        datetime.strptime(timespan['end_date'],
                          mc.SENTENCE_PUBLISH_DATE_FORMAT))
    return results
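add_missing_dates_to_split_story_counts fills in the gaps so charts don't skip days with zero stories. A plausible sketch, assuming each count entry is a dict with a 'date' string and a 'count' (the key names and date format are assumptions):

from datetime import timedelta

def add_missing_dates_to_split_story_counts(counts, start, end):
    # Hypothetical sketch: insert a zero-count entry for every day in
    # [start, end] that the API response skipped over.
    by_date = {c['date'][:10]: c for c in counts}
    filled = []
    day = start
    while day <= end:
        key = day.strftime('%Y-%m-%d')
        filled.append(by_date.get(key, {'date': key, 'count': 0}))
        day += timedelta(days=1)
    return filled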
Example #5
def map_files(topics_id):
    files = {'wordMap': 'unsupported', 'linkMap': 'not rendered'}

    if access_public_topic(topics_id) or is_user_logged_in():
        snapshots_id, timespans_id, foci_id, q = filters_from_args(
            request.args)
        map_type = MAP_TYPES[0]  # no linkMaps yet
        status = None
        prefix = _get_file_prefix(map_type, topics_id, timespans_id)
        lock_filename = prefix + ".lock"
        rendered_filename = prefix + ".gexf"
        # check if rendered file is there
        is_rendered = os.path.isfile(os.path.join(DATA_DIR, rendered_filename))
        # logger.warn(os.path.join(DATA_DIR, rendered_filename))
        # logger.warn(is_rendered)
        if is_rendered:
            status = 'rendered'
        else:
            is_generating = os.path.isfile(
                os.path.join(DATA_DIR, lock_filename))
            if not is_generating:
                _start_generating_map_file(map_type, topics_id, timespans_id)
            status = 'generating'
        files[map_type] = status
        return jsonify(files)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})
Example #6
def topic_words(topics_id):
    sample_size = request.args.get('sample_size', WORD_COUNT_SAMPLE_SIZE)

    if access_public_topic(topics_id):
        results = apicache.topic_word_counts(TOOL_API_KEY, topics_id, sample_size=sample_size,
                                             snapshots_id=None, timespans_id=None, foci_id=None, q=None)
    elif is_user_logged_in():
        # grab the top words, respecting all the filters
        results = apicache.topic_word_counts(user_mediacloud_key(), topics_id, sample_size=sample_size)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})

    totals = []  # important so that these get reset on the client when they aren't requested
    logger.debug(request.args)
    if (is_user_logged_in()) and ('withTotals' in request.args) and (request.args['withTotals'] == "true"):
        # return along with the results for the overall timespan, to facilitate comparison
        snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
        overall_timespan = _find_overall_timespan(topics_id, snapshots_id)
        totals = apicache.topic_word_counts(user_mediacloud_key(), topics_id, sample_size=sample_size,
                                            timespans_id=overall_timespan['timespans_id'], foci_id=None, q=None)

    response = {
        'list': results[:WORD_COUNT_UI_NUM_WORDS],
        'totals': totals[:WORD_COUNT_UI_NUM_WORDS],
        'sample_size': str(sample_size)
    }
    return jsonify(response)
Example #7
def topic_split_story_counts(user_mc_key, topics_id, **kwargs):
    '''
    Return split story counts over time, based on filters.
    '''
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    timespan = topic_timespan(topics_id, snapshots_id, foci_id, timespans_id)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q,
        'fq': timespan['fq']
    }
    merged_args.update(kwargs)    # passed-in args override anything pulled from the request.args
    # and make sure to ignore undateable stories
    undateable_query_part = "-(tags_id_stories:{})".format(STORY_UNDATEABLE_TAG)   # doesn't work if the query includes parens!!!
    if (merged_args['q'] is not None) and (len(merged_args['q']) > 0):
        merged_args['q'] = "(({}) AND {})".format(merged_args['q'], undateable_query_part)
    else:
        merged_args['q'] = "* AND {}".format(undateable_query_part)
    results = _cached_topic_split_story_counts(user_mc_key, topics_id, **merged_args)
    results['counts'] = add_missing_dates_to_split_story_counts(
        results['counts'],
        datetime.strptime(timespan['start_date'], mc.SENTENCE_PUBLISH_DATE_FORMAT),
        datetime.strptime(timespan['end_date'], mc.SENTENCE_PUBLISH_DATE_FORMAT))
    return results
Example #8
def topic_similar_words(topics_id, word):
    # no need for user-specific cache on this
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    results = _word2vec_topic_similar_words(topics_id, snapshots_id, [word])
    if len(results):
        return results[0]['results']
    return []
Example #9
def topic_similar_words(topics_id, word):
    # no need for user-specific cache on this
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    results = _word2vec_topic_similar_words(topics_id, snapshots_id, [word])
    if len(results):
        return results[0]['results']
    return []
Example #10
def topic_word_counts(user_mc_key, topics_id, **kwargs):
    '''
    Return sampled word counts based on filters.
    '''
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q,
        'sample_size': WORD_COUNT_SAMPLE_SIZE,
        'num_words': WORD_COUNT_UI_NUM_WORDS
    }
    merged_args.update(
        kwargs
    )  # passed-in args override anything pulled from the request.args
    word_data = _cached_topic_word_counts(user_mc_key, topics_id,
                                          **merged_args)
    words = [w['term'] for w in word_data]
    # and now add in word2vec model position data
    google_word2vec_data = _cached_word2vec_google_2d_results(words)
    for i in range(len(google_word2vec_data)):
        word_data[i]['google_w2v_x'] = google_word2vec_data[i]['x']
        word_data[i]['google_w2v_y'] = google_word2vec_data[i]['y']
    topic_word2vec_data = _word2vec_topic_2d_results(topics_id, snapshots_id,
                                                     words)
    for i in range(len(topic_word2vec_data)):
        word_data[i]['w2v_x'] = topic_word2vec_data[i]['x']
        word_data[i]['w2v_y'] = topic_word2vec_data[i]['y']
    return word_data
Example #11
def topic_w2v_timespan_embeddings(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # Retrieve embeddings for overall topic
    overall_word_counts = apicache.topic_word_counts(user_mediacloud_key(),
                                                     topics_id,
                                                     num_words=50,
                                                     snapshots_id=snapshots_id,
                                                     timespans_id=None,
                                                     foci_id=foci_id,
                                                     q=q)
    overall_words = [x['term'] for x in overall_word_counts]
    overall_embeddings = {
        x['term']: (x['google_w2v_x'], x['google_w2v_y'])
        for x in overall_word_counts
    }

    # Retrieve top words for each timespan
    timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(),
                                                    topics_id, snapshots_id,
                                                    foci_id)

    # Retrieve embeddings for each timespan
    jobs = [{
        'api_key': user_mediacloud_key(),
        'topics_id': topics_id,
        'snapshots_id': snapshots_id,
        'foci_id': foci_id,
        'overall_words': overall_words,
        'overall_embeddings': overall_embeddings,
        'q': q,
        'timespan': t,
    } for t in timespans]
    embeddings_by_timespan = _get_all_timespan_embeddings(jobs)
    return jsonify({'list': embeddings_by_timespan})
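The per-timespan embedding jobs built above are independent, so _get_all_timespan_embeddings can fan them out in parallel. A sketch using a process pool (the pool size and the worker function name are assumptions):

from multiprocessing import Pool

def _get_all_timespan_embeddings(jobs):
    # Hypothetical sketch: run one embedding job per timespan in parallel
    # and collect the results in the original order.
    with Pool(processes=4) as pool:
        return pool.map(_get_timespan_embeddings, jobs)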
Example #12
def media_outlinks_csv(topics_id, media_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    return stream_story_list_csv(user_mediacloud_key(),
                                 'media-' + media_id + '-outlinks',
                                 topics_id,
                                 link_from_media_id=media_id,
                                 timespans_id=timespans_id,
                                 q=q)
Example #13
def stream_media_list_csv(user_mc_key, topic, filename, **kwargs):
    filename = topic['name'] + '-' + filename
    # we have to make a separate call to the media info if the user wants to include the media metadata
    include_media_metadata = ('media_metadata' in kwargs) and (kwargs['media_metadata'] == '1')
    # if the focusId is a URL Sharing subtopic, then we have platform-specific post/author/channel share counts
    include_platform_url_shares = kwargs.get('include_platform_url_shares', False)
    # if this topic includes platforms, then we have URL sharing counts (post/author/channel) for each platform
    include_all_url_shares = kwargs.get('include_all_url_shares', False)
    params = kwargs.copy()
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'timespans_id': timespans_id,
        'snapshots_id': snapshots_id,
        'foci_id': foci_id,
        'q': q,
        'sort': request.args.get('sort'),
    }
    params.update(merged_args)
    # check whether the user has actually specified a real query
    if 'q' in params:
        params['q'] = params['q'] if params['q'] not in [None, '', 'null', 'undefined'] else None
    params['limit'] = 1000  # an arbitrary value to let us page through big topics (note: this is the page size)
    # set up the dict keys / column headers that the user cares about for this download
    props = TOPIC_MEDIA_CSV_PROPS
    if include_platform_url_shares:
        props += ['post_count', 'channel_count', 'author_count']
    if include_all_url_shares:
        # if the user requested to download all the url sharing counts by platform, we need to grab the config for that
        # which is held in the platform seed query objects
        topic_seed_queries = topic['topic_seed_queries']
        extra_columns = []
        for tsq in topic_seed_queries:
            prefix = platform_csv_column_header_prefix(tsq)
            extra_columns += [
                prefix + 'post_count', prefix + 'channel_count',
                prefix + 'author_count'
            ]
        props += extra_columns
        params['topic_seed_queries'] = topic_seed_queries
    if include_media_metadata:
        props += [
            'media_pub_country', 'media_pub_state', 'media_language',
            'media_about_country', 'media_media_type'
        ]
    timestamped_filename = csv.safe_filename(filename)
    headers = {
        "Content-Disposition": "attachment;filename=" + timestamped_filename
    }
    return Response(_stream_media_by_page(user_mc_key, topic['topics_id'],
                                          props, **params),
                    mimetype='text/csv; charset=utf-8',
                    headers=headers)
Example #14
def base_snapshot_timespan(topics_id):
    # find the timespan matching this one in the base snapshot (ie. with no foci_id)
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    base_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                  snapshots_id=snapshots_id, foci_id=None)
    timespan = apicache.topic_timespan(topics_id, snapshots_id, foci_id, timespans_id)  # the selected timespan
    for t in base_snapshot_timespans:
        if apicache.is_timespans_match(timespan, t):
            return t
    raise ValueError("Can't find a timespan in the base snapshot matching the one specified")
Example #15
def topic_focal_set_sentences_compare(topics_id, focal_sets_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    all_focal_sets = topic_focal_sets(user_mediacloud_key(), topics_id,
                                      snapshots_id)
    # need the timespan info, to find the appropriate timespan with each focus
    base_snapshot_timespans = cached_topic_timespan_list(
        user_mediacloud_key(), topics_id, snapshots_id=snapshots_id)
    # if they have a focus selected, we need to find the appropriate overall timespan
    base_timespan = None
    if foci_id is not None:
        timespan = topic_timespan(topics_id, snapshots_id, foci_id,
                                  timespans_id)
        for t in base_snapshot_timespans:
            if timespans_match(timespan, t):
                base_timespan = t
    else:
        for t in base_snapshot_timespans:
            if t['timespans_id'] == int(timespans_id):
                base_timespan = t
                logger.info('base timespan = %s', timespans_id)
    if base_timespan is None:
        return json_error_response("Couldn't find the timespan you specified")
    # iterate through to find the one of interest
    focal_set = None
    for fs in all_focal_sets:
        if int(fs['focal_sets_id']) == int(focal_sets_id):
            focal_set = fs
    if focal_set is None:
        return json_error_response('Invalid Focal Set Id')
    # collect the sentence counts for each foci
    for focus in focal_set['foci']:
        # find the matching timespan within this focus
        snapshot_timespans = cached_topic_timespan_list(
            user_mediacloud_key(),
            topics_id,
            snapshots_id=snapshots_id,
            foci_id=focus['foci_id'])
        timespan = None
        for t in snapshot_timespans:
            if timespans_match(t, base_timespan):
                timespan = t
                logger.info('matching in focus %s, timespan = %s',
                            focus['foci_id'], t['timespans_id'])
        if timespan is None:
            return json_error_response(
                "Couldn't find a matching timespan in the " + focus['name'] +
                " focus")
        data = topic_sentence_counts(user_mediacloud_key(),
                                     topics_id,
                                     snapshots_id=snapshots_id,
                                     timespans_id=timespan['timespans_id'],
                                     foci_id=focus['foci_id'])
        focus['sentence_counts'] = data
    return jsonify(focal_set)
Example #16
def topic_focal_set_list(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    include_story_counts = request.args.get('includeStoryCounts')
    focal_sets = apicache.topic_focal_sets_list(user_mediacloud_key(),
                                                topics_id, snapshots_id)
    # now mark the ones that are the magically added URL sharing platform ones
    for fs in focal_sets:
        fs['is_url_sharing'] = is_url_sharing_focal_set(fs)
    if include_story_counts and (include_story_counts == '1'):
        _add_story_counts_to_foci(topics_id, focal_sets)
    return jsonify(focal_sets)
Example #17
def topic_media_csv(topics_id):
    sort = validated_sort(request.args.get('sort'))
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    return _stream_media_list_csv(user_mediacloud_key(),
                                  'media-for-topic-' + topics_id,
                                  topics_id,
                                  sort=sort,
                                  snapshots_id=snapshots_id,
                                  timespans_id=timespans_id,
                                  foci_id=foci_id,
                                  q=q)
Example #18
def matching_timespans_in_foci(topics_id, timespan_to_match, foci):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    timespans = []
    for focus in foci:
        # find the matching timespan within this focus
        snapshot_timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                        snapshots_id=snapshots_id, foci_id=focus['foci_id'])
        timespan = _matching_timespan(timespan_to_match, snapshot_timespans)
        timespans.append(timespan)
#        if timespan is None:
#            return json_error_response('Couldn\'t find a matching timespan in the '+focus.name+' focus')
    return timespans
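_matching_timespan just scans a list with that same match test, returning None when nothing matches, which is why the commented-out guard above checks for it. A sketch:

def _matching_timespan(timespan_to_match, timespans):
    # Hypothetical sketch: return the first timespan with the same period
    # and date range, or None if the focus has no equivalent timespan.
    for t in timespans:
        if is_timespans_match(timespan_to_match, t):
            return t
    return None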
Example #19
def topic_sentence_sample(user_mc_key, topics_id, sample_size=1000, **kwargs):
    '''
    Return a sample of sentences based on the filters.
    '''
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q
    }
    merged_args.update(kwargs)    # passed-in args override anything pulled from the request.args
    return _cached_topic_sentence_sample(user_mc_key, topics_id, sample_size, **merged_args)
Example #20
def topic_sentence_sample(user_mc_key, topics_id, sample_size=1000, **kwargs):
    """
    Return a sample of sentences based on the filters.
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q
    }
    merged_args.update(kwargs)    # passed-in args override anything pulled from the request.args
    return _cached_topic_sentence_sample(user_mc_key, sample_size, **merged_args)
Example #21
def topic_tag_counts(user_mc_key, topics_id, tag_sets_id):
    """
    Get a breakdown of the most-used tags within a set within a single timespan.
     This supports just timespan_id and q from the request, because it has to use sentenceFieldCount,
     not a topicSentenceFieldCount method that takes filters (which doesn't exist)
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    timespan_query = "timespans_id:{}".format(timespans_id)
    if (q is None) or (len(q) == 0):
        query = timespan_query
    else:
        query = "({}) AND ({})".format(q, timespan_query)
    return _cached_topic_tag_counts(user_mc_key, topics_id, tag_sets_id, query)
Example #22
def topic_tag_counts(user_mc_key, topics_id, tag_sets_id, sample_size=None):
    '''
    Get a breakdown of the most-used tags within a set within a single timespan.
     This supports just timespan_id and q from the request, because it has to use sentenceFieldCount,
     not a topicSentenceFieldCount method that takes filters (which doesn't exist)
    '''
    # return [] # SUPER HACK!
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    timespan_query = "timespans_id:{}".format(timespans_id)
    if (q is None) or (len(q) == 0):
        query = timespan_query
    else:
        query = "({}) AND ({})".format(q, timespan_query)
    return _cached_topic_tag_counts(user_mc_key, topics_id, tag_sets_id, sample_size, query)
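As a worked example of the query composition in Examples #21 and #22: with q="climate" and timespans_id=456, the query sent to the backend is "(climate) AND (timespans_id:456)"; with no user query, it is just "timespans_id:456".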
Example #23
def topic_story_count(user_mc_key, topics_id, **kwargs):
    """
    Return filtered story count within topic.
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q
    }
    merged_args.update(kwargs)    # passed-in args override anything pulled from the request.args
    # logger.info("!!!!!"+str(merged_args['timespans_id']))
    return _cached_topic_story_count(user_mc_key, topics_id, **merged_args)
Example #24
def topic_story_count(user_mc_key, topics_id, **kwargs):
    '''
    Return filtered story count within topic.
    '''
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q
    }
    merged_args.update(kwargs)    # passed-in args override anything pulled from the request.args
    # logger.info("!!!!!"+str(merged_args['timespans_id']))
    return _cached_topic_story_count(user_mc_key, topics_id, **merged_args)
Example #25
def topic_focal_set_split_stories_compare(topics_id, focal_sets_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # need the timespan info, to find the appropriate timespan with each focus
    try:
        base_timespan = base_snapshot_timespan(topics_id)
        focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, focal_sets_id)
    except ValueError as e:
        return json_error_response(str(e))
    # collect the story split counts for each foci
    timespans = apicache.matching_timespans_in_foci(topics_id, base_timespan, focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_split_story_counts(user_mediacloud_key(), topics_id, snapshots_id=snapshots_id,
                                                 timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['split_story_counts'] = data
    return jsonify(focal_set)
Example #26
def topic_media_list(user_mc_key, topics_id, **kwargs):
    """
    Return sorted media list based on filters.
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q,
        'sort': request.args.get('sort'),
        'limit': request.args.get('limit'),
        'link_id': request.args.get('linkId'),
    }
    merged_args.update(kwargs)    # passed-in args override anything pulled from the request.args
    return _cached_topic_media(user_mc_key, topics_id, **merged_args)
Example #27
def topic_media_list(user_mc_key, topics_id, **kwargs):
    '''
    Return sorted media list based on filters.
    '''
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q,
        'sort': validated_sort(request.args.get('sort')),
        'limit': request.args.get('limit'),
        'link_id': request.args.get('linkId'),
    }
    merged_args.update(kwargs)    # passed-in args override anything pulled from the request.args
    return _cached_topic_media_list_with_metadata(user_mc_key, topics_id, **merged_args)
Example #28
def matching_timespans_in_foci(topics_id, timespan_to_match, foci):
    """
    For cross-subtopic analysis within a subtopic set, we need to identify the timespan that has the same date
    range in each subtopic within the set.  This helper does that annoying work for you.
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    timespans = []
    for focus in foci:
        # find the matching timespan within this focus
        snapshot_timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                        snapshots_id=snapshots_id, foci_id=focus['foci_id'])
        timespan = _matching_timespan(timespan_to_match, snapshot_timespans)
        timespans.append(timespan)
#        if timespan is None:
#            return json_error_response('Couldn\'t find a matching timespan in the '+focus.name+' focus')
    return timespans
Example #29
def topic_timespan_list(topics_id, snapshots_id):
    ignored_snapshots_id, _timespans_id, foci_id, _q = filters_from_args(
        request.args)
    timespans = apicache.cached_topic_timespan_list(topics_id, snapshots_id,
                                                    foci_id)
    # add the focal_set type to the timespan so we can use that in the client (ie. decide what to show or not
    # based on what type of focal_set this timespan is part of)
    focal_sets = apicache.topic_focal_sets_list(user_mediacloud_key(),
                                                topics_id, snapshots_id)
    for t in timespans:
        for fs in focal_sets:
            for f in fs['foci']:
                if f['foci_id'] == t['foci_id']:
                    t['focal_set'] = fs
                    t['focus'] = f
                    break
    return jsonify({'list': timespans})
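The triple-nested loop above scans every focus for every timespan. An equivalent variant (a sketch, not the project's code) that builds a foci_id lookup table first, so each timespan needs a single dict lookup:

def annotate_timespans_with_foci(timespans, focal_sets):
    # Hypothetical sketch: index (focal_set, focus) pairs by foci_id, then
    # annotate each timespan in one pass.
    by_foci_id = {f['foci_id']: (fs, f) for fs in focal_sets for f in fs['foci']}
    for t in timespans:
        if t['foci_id'] in by_foci_id:
            t['focal_set'], t['focus'] = by_foci_id[t['foci_id']]
    return timespans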
Example #30
def topic_sentence_sample(user_mc_key, sample_size=1000, **kwargs):
    """
    Return a sample of sentences based on the filters. A topic ID isn't needed because there is no topicSentenceList
    endpoint. Random sentence samples are pulled by using the timespans_id with a regular sentenceList call.
    """
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q
    }
    merged_args.update(
        kwargs
    )  # passed-in args override anything pulled from the request.args
    return _cached_topic_sentence_sample(user_mc_key, sample_size,
                                         **merged_args)
Example #31
def matching_timespans_in_foci(topics_id, timespan_to_match, foci):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    timespans = []
    for focus in foci:
        # find the matching timespan within this focus
        snapshot_timespans = cached_topic_timespan_list(
            user_mediacloud_key(),
            topics_id,
            snapshots_id=snapshots_id,
            foci_id=focus['foci_id'])
        timespan = _matching_timespan(timespan_to_match, snapshot_timespans)
        timespans.append(timespan)


#        if timespan is None:
#            return json_error_response('Couldn\'t find a matching timespan in the '+focus.name+' focus')
    return timespans
Example #32
def topic_words(topics_id):
    sample_size = request.args.get('sample_size', WORD_COUNT_SAMPLE_SIZE)

    if access_public_topic(topics_id):
        results = apicache.topic_word_counts(TOOL_API_KEY,
                                             topics_id,
                                             sample_size=sample_size,
                                             snapshots_id=None,
                                             timespans_id=None,
                                             foci_id=None,
                                             q=None)
    elif is_user_logged_in():
        # grab the top words, respecting all the filters
        results = apicache.topic_word_counts(user_mediacloud_key(),
                                             topics_id,
                                             sample_size=sample_size)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})

    totals = []  # important so that these get reset on the client when they aren't requested
    logger.debug(request.args)
    if (is_user_logged_in()) and ('withTotals' in request.args) and (
            request.args['withTotals'] == "true"):
        # return along with the results for the overall timespan, to facilitate comparison
        snapshots_id, timespans_id, foci_id, q = filters_from_args(
            request.args)
        overall_timespan = _find_overall_timespan(topics_id, snapshots_id)
        totals = apicache.topic_word_counts(
            user_mediacloud_key(),
            topics_id,
            sample_size=sample_size,
            timespans_id=overall_timespan['timespans_id'],
            foci_id=None,
            q=None)

    response = {
        'list': results[:WORD_COUNT_UI_NUM_WORDS],
        'totals': totals[:WORD_COUNT_UI_NUM_WORDS],
        'sample_size': str(sample_size)
    }
    return jsonify(response)
Example #33
def topic_story_list(user_mc_key, topics_id, **kwargs):
    '''
    Return sorted story list based on filters.
    '''
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q,
        'sort': validated_sort(request.args.get('sort')),
        'limit': request.args.get('limit'),
        'link_id': request.args.get('linkId'),
    }

    merged_args.update(kwargs)    # passed-in args override anything pulled from the request.args
    results = _cached_topic_story_list(user_mc_key, topics_id, **merged_args)
    if merged_args['limit']:    # TODO: remove this (force limit as workaround to back-end bug)
        results['stories'] = results['stories'][:int(merged_args['limit'])]
    return results
Example #34
def _add_story_counts_to_foci(topics_id, focal_sets):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # need the timespan info, to find the appropriate timespan with each focus
    try:
        base_timespan = base_snapshot_timespan(topics_id)
    except ValueError as e:
        return json_error_response(str(e))
    # now find the story count in each foci in this
    for fs in focal_sets:
        timespans = apicache.matching_timespans_in_foci(topics_id, base_timespan, fs['foci'])
        for idx in range(0, len(timespans)):
            timespan = timespans[idx]
            focus = fs['foci'][idx]
            foci_story_count = apicache.topic_story_count(user_mediacloud_key(), topics_id,
                                                          snapshots_id=snapshots_id,
                                                          timespans_id=timespan['timespans_id'],
                                                          q=q,
                                                          foci_id=focus['foci_id'])['count']
            focus['story_count'] = foci_story_count
    return jsonify(focal_sets)
Example #35
def topic_story_list(user_mc_key, topics_id, **kwargs):
    # Return sorted story list based on filters.
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # these are the arguments supported by the low-level API method
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q,
        'sort': request.args.get('sort'),
        'limit': request.args.get('limit'),
        'link_id': request.args.get('linkId'),
    }
    # make sure not to add in other parameters from kwargs that aren't supported by the API method
    for k in TOPIC_STORY_LIST_API_PARAMS:
        if (k in merged_args) and (k in kwargs):
            merged_args[k] = kwargs[k]
    results = _cached_topic_story_list(user_mc_key, topics_id, **merged_args)
    if merged_args['limit']:    # TODO: remove this (this enforces the limit as a workaround to a back-end bug)
        results['stories'] = results['stories'][:int(merged_args['limit'])]
    return results
Example #36
def topic_tag_coverage(topics_id, tags_id):
    '''
    Useful for seeing how many stories in the topic are tagged with a specific tag
    '''
    # respect any query filter the user has set
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    query_with_tag = add_to_user_query("tags_id_stories:{}".format(tags_id))
    # now get the counts
    if access_public_topic(topics_id):
        total = topic_story_count(TOOL_API_KEY, topics_id)
        tagged = topic_story_count(
            TOOL_API_KEY, topics_id,
            q=query_with_tag)  # force a count with just the query
    elif is_user_logged_in():
        total = topic_story_count(user_mediacloud_key(), topics_id)
        tagged = topic_story_count(
            user_mediacloud_key(), topics_id,
            q=query_with_tag)  # force a count with just the query
    else:
        return None
    return {'counts': {'count': tagged['count'], 'total': total['count']}}
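add_to_user_query presumably ANDs an extra clause onto whatever q filter the user set. A sketch mirroring the query composition used elsewhere in these examples (the request handling is an assumption):

def add_to_user_query(new_query_clause):
    # Hypothetical sketch: combine the user's current q filter with an
    # additional clause; with no user query, the clause stands alone.
    q = request.args.get('q')
    if (q is None) or (len(q) == 0):
        return new_query_clause
    return "({}) AND ({})".format(q, new_query_clause)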
Example #37
def topic_sentence_counts(user_mc_key, topics_id, **kwargs):
    '''
    Return sentence counts over time, based on filters.
    '''
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q
    }
    merged_args.update(
        kwargs
    )  # passed-in args override anything pulled from the request.args
    # and make sure to ignore undateable stories
    undateable_query_part = "NOT tags_id_stories:{}".format(
        STORY_UNDATEABLE_TAG)  # doesn't work if the query includes parens!!!
    if merged_args['q'] is not None:
        merged_args['q'] += " AND {}".format(undateable_query_part)
    else:
        merged_args['q'] = "* AND {}".format(undateable_query_part)
    return _cached_topic_sentence_counts(user_mc_key, topics_id, **merged_args)
Example #38
def topic_word_counts(user_mc_key, topics_id, **kwargs):
    '''
    Return sampled word counts based on filters.
    '''
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q,
        'sample_size': 1000
    }
    merged_args.update(
        kwargs
    )  # passed-in args override anything pulled from the request.args
    word_data = _cached_topic_word_counts(user_mc_key, topics_id,
                                          **merged_args)
    words = [w['term'] for w in word_data]
    word2vec_data = _cached_word2vec_google_2d_results(words)
    for i in range(len(word2vec_data)):
        word_data[i]['google_w2v_x'] = word2vec_data[i]['x']
        word_data[i]['google_w2v_y'] = word2vec_data[i]['y']
    return word_data
Example #39
def topic_story_list(user_mc_key, topics_id, **kwargs):
    '''
    Return sorted story list based on filters.
    '''
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q,
        'sort': validated_sort(request.args.get('sort')),
        'limit': request.args.get('limit'),
        'link_id': request.args.get('linkId'),
    }

    merged_args.update(
        kwargs
    )  # passed-in args override anything pulled from the request.args
    results = _cached_topic_story_list(user_mc_key, topics_id, **merged_args)
    if merged_args['limit']:  # TODO: remove this (force limit as workaround to back-end bug)
        results['stories'] = results['stories'][:int(merged_args['limit'])]
    return results
Example #40
def topic_focal_set_split_stories_compare(topics_id, focal_sets_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # need the timespan info, to find the appropriate timespan with each focus
    base_snapshot_timespans = apicache.cached_topic_timespan_list(
        user_mediacloud_key(), topics_id, snapshots_id=snapshots_id)
    # if they have a focus selected, we need to find the appropriate overall timespan
    base_timespan = None
    if foci_id is not None:
        timespan = apicache.topic_timespan(topics_id, snapshots_id, foci_id,
                                           timespans_id)
        for t in base_snapshot_timespans:
            if apicache.is_timespans_match(timespan, t):
                base_timespan = t
    else:
        for t in base_snapshot_timespans:
            if t['timespans_id'] == int(timespans_id):
                base_timespan = t
                logger.info('base timespan = %s', timespans_id)
    if base_timespan is None:
        return json_error_response("Couldn't find the timespan you specified")
    # iterate through to find the one of interest
    focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id,
                                         snapshots_id, focal_sets_id)
    if focal_set is None:
        return json_error_response('Invalid Focal Set Id')
    # collect the story split counts for each foci
    timespans = apicache.matching_timespans_in_foci(topics_id, base_timespan,
                                                    focal_set['foci'])
    for idx in range(0, len(timespans)):
        data = apicache.topic_split_story_counts(
            user_mediacloud_key(),
            topics_id,
            snapshots_id=snapshots_id,
            timespans_id=timespans[idx]['timespans_id'])
        focal_set['foci'][idx]['split_story_counts'] = data
    return jsonify(focal_set)
Example #41
def topic_word_counts(user_mc_key, topics_id, **kwargs):
    # Return sampled word counts based on filters.
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'snapshots_id': snapshots_id,
        'timespans_id': timespans_id,
        'foci_id': foci_id,
        'q': q,
        'sample_size': WORD_COUNT_SAMPLE_SIZE,
        'num_words': WORD_COUNT_UI_NUM_WORDS
    }
    merged_args.update(kwargs)    # passed-in args override anything pulled from the request.args
    word_data = cached_topic_word_counts(user_mc_key, topics_id, **merged_args)
    words = [w['term'] for w in word_data]
    # and now add in word2vec model position data
    google_word2vec_data = _cached_word2vec_google_2d_results(words)
    for i in range(len(google_word2vec_data)):
        word_data[i]['google_w2v_x'] = google_word2vec_data[i]['x']
        word_data[i]['google_w2v_y'] = google_word2vec_data[i]['y']
    topic_word2vec_data = _word2vec_topic_2d_results(topics_id, snapshots_id, words)
    for i in range(len(topic_word2vec_data)):
        word_data[i]['w2v_x'] = topic_word2vec_data[i]['x']
        word_data[i]['w2v_y'] = topic_word2vec_data[i]['y']
    return word_data
Example #42
def timespan_files_list(topics_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    results = apicache.topic_timespan_files_list(topics_id, timespans_id)
    return jsonify(results)
Example #43
def topic_media_csv(topics_id):
    sort = validated_sort(request.args.get('sort'))
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    return _stream_media_list_csv(user_mediacloud_key(), 'media-for-topic-' + topics_id, topics_id, sort=sort,
                                  snapshots_id=snapshots_id, timespans_id=timespans_id, foci_id=foci_id, q=q)
Example #44
def media_stories_csv(topics_id, media_id):
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    return stream_story_list_csv(user_mediacloud_key(), 'media-'+media_id+'-stories', topics_id,
                                 media_id=media_id, timespans_id=timespans_id, q=q)
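These CSV endpoints stream rather than buffer the whole file. A sketch of the generator pattern behind stream_story_list_csv-style helpers, matching the Response(mimetype='text/csv') usage in Example #13 (the helper name and row shape are assumptions, and real code would use the csv module for proper escaping):

def _stream_rows_as_csv(rows, props):
    # Hypothetical sketch: yield a header line, then one CSV line per row,
    # so Flask can send the response without holding it all in memory.
    yield ','.join(props) + '\n'
    for row in rows:
        yield ','.join('"{}"'.format(row.get(p, '')) for p in props) + '\n'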