Example #1
def topic_words(topics_id):
    sample_size = request.args.get('sample_size', WORD_COUNT_SAMPLE_SIZE)

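    # public topics can be viewed without logging in, so the shared tool API key is used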
    if access_public_topic(topics_id):
        results = apicache.topic_word_counts(TOOL_API_KEY, topics_id, sample_size=sample_size,
                                             snapshots_id=None, timespans_id=None, foci_id=None, q=None)
    elif is_user_logged_in():
        # grab the top words, respecting all the filters
        results = apicache.topic_word_counts(user_mediacloud_key(), topics_id, sample_size=sample_size)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})

    totals = []  # important so that these get reset on the client when they aren't requested
    logger.debug(request.args)
    if is_user_logged_in() and request.args.get('withTotals') == "true":
        # return along with the results for the overall timespan, to facilitate comparison
        snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
        overall_timespan = _find_overall_timespan(topics_id, snapshots_id)
        totals = apicache.topic_word_counts(user_mediacloud_key(), topics_id, sample_size=sample_size,
                                            timespans_id=overall_timespan['timespans_id'], foci_id=None, q=None)

    response = {
        'list': results[:WORD_COUNT_UI_NUM_WORDS],
        'totals': totals[:WORD_COUNT_UI_NUM_WORDS],
        'sample_size': str(sample_size)
    }
    return jsonify(response)
Example #2
def topic_words(topics_id):

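    # public topics can be viewed without logging in, so the shared tool API key is used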
    if access_public_topic(topics_id):
        results = topic_word_counts(TOOL_API_KEY,
                                    topics_id,
                                    snapshots_id=None,
                                    timespans_id=None,
                                    foci_id=None,
                                    q=None)
    elif is_user_logged_in():
        results = topic_word_counts(user_mediacloud_key(), topics_id)[:200]
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})

    totals = []  # important so that these get reset on the client when they aren't requested
    logger.debug(request.args)
    if is_user_logged_in() and request.args.get('withTotals') == "true":
        # also return word counts for the whole topic, so the client can compare against them
        totals = topic_word_counts(user_mediacloud_key(),
                                   topics_id,
                                   timespans_id=None,
                                   q=None)
    response = {'list': results, 'totals': totals}
    return jsonify(response)
Example #3
def topic_compare_subtopic_top_words(topics_id):
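    # compare the top words across every subtopic (focus) in one focal set, within the selected timespan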
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    selected_focal_sets_id = request.args['focal_sets_id']
    word_count = int(request.args['word_count']) if 'word_count' in request.args else 20  # args arrive as strings
    # first we need to figure out which timespan they are working on
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id, snapshots_id=snapshots_id)
    selected_timespan = None
    for t in selected_snapshot_timespans:
        if t['timespans_id'] == int(timespans_id):
            selected_timespan = t
    focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, selected_focal_sets_id)
    timespans = apicache.matching_timespans_in_foci(topics_id, selected_timespan, focal_set['foci'])
    for idx, timespan in enumerate(timespans):
        data = apicache.topic_word_counts(user_mediacloud_key(), topics_id,
                                          timespans_id=timespan['timespans_id'])
        focal_set['foci'][idx]['top_words'] = data
    # stitch together the counts to download now
    data = []
    headers = [f['name'] for f in focal_set['foci']]
    for idx in range(word_count):
        row = {f['name']: "{} ({})".format(f['top_words'][idx]['term'], f['top_words'][idx]['count'])
               for f in focal_set['foci']}
        data.append(row)
    return csv.stream_response(data, headers,
                               'topic-{}-subtopic-{}-{}-top-words-comparison'.format(
                                   topics_id, focal_set['name'], selected_focal_sets_id))
Example #4
def topic_w2v_timespan_embeddings(topics_id):
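    # place the topic's top words in 2-d word2vec space, overall and for each timespan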
    args = {
        'snapshots_id': request.args.get('snapshotId'),
        'foci_id': request.args.get('focusId'),
        'q': request.args.get('q'),
    }

    # Retrieve embeddings for overall topic
    overall_word_counts = topic_word_counts(user_mediacloud_key(),
                                            topics_id,
                                            num_words=50,
                                            **args)
    overall_words = [x['term'] for x in overall_word_counts]
    overall_embeddings = {
        x['term']: (x['google_w2v_x'], x['google_w2v_y'])
        for x in overall_word_counts
    }

    # Retrieve top words for each timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                           args['snapshots_id'],
                                           args['foci_id'])

    # Retrieve embeddings for each timespan
    func = partial(grab_timespan_embeddings, user_mediacloud_key(), topics_id,
                   args, overall_words, overall_embeddings)
    with Pool(processes=WORD2VEC_TIMESPAN_POOL_PROCESSES) as p:  # ensure worker processes are cleaned up
        ts_embeddings = p.map(func, timespans)

    return jsonify({'list': ts_embeddings})
Example #5
def topic_compare_subtopic_top_words(topics_id):
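    # compare the top words across every subtopic (focus) in one focal set, within the selected timespan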
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    selected_focal_sets_id = request.args['focal_sets_id']
    word_count = int(request.args['word_count']) if 'word_count' in request.args else 20  # args arrive as strings
    # first we need to figure out which timespan they are working on
    selected_snapshot_timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                                      snapshots_id=snapshots_id)
    selected_timespan = None
    for t in selected_snapshot_timespans:
        if t['timespans_id'] == int(timespans_id):
            selected_timespan = t
    try:
        focal_set = apicache.topic_focal_set(user_mediacloud_key(), topics_id, snapshots_id, selected_focal_sets_id)
    except ValueError:
        return json_error_response('Invalid Focal Set Id')
    timespans = apicache.matching_timespans_in_foci(topics_id, selected_timespan, focal_set['foci'])
    for idx, timespan in enumerate(timespans):
        data = apicache.topic_word_counts(user_mediacloud_key(), topics_id,
                                          timespans_id=timespan['timespans_id'])
        focal_set['foci'][idx]['top_words'] = data
    # stitch together the counts to download now
    data = []
    headers = [f['name'] for f in focal_set['foci']]
    for idx in range(word_count):
        row = {f['name']: "{} ({})".format(f['top_words'][idx]['term'], f['top_words'][idx]['count'])
               for f in focal_set['foci']}
        data.append(row)
    return csv.stream_response(data, headers,
                               'topic-{}-subtopic-{}-{}-top-words-comparison'.format(
                                   topics_id, focal_set['name'], selected_focal_sets_id))
Example #6
def topic_w2v_timespan_embeddings(topics_id):
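    # place the topic's top words in 2-d word2vec space, overall and for each timespan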
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    # Retrieve embeddings for overall topic
    overall_word_counts = apicache.topic_word_counts(user_mediacloud_key(),
                                                     topics_id,
                                                     num_words=50,
                                                     snapshots_id=snapshots_id,
                                                     timespans_id=None,
                                                     foci_id=foci_id,
                                                     q=q)
    overall_words = [x['term'] for x in overall_word_counts]
    overall_embeddings = {
        x['term']: (x['google_w2v_x'], x['google_w2v_y'])
        for x in overall_word_counts
    }

    # Retrieve top words for each timespan
    timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(),
                                                    topics_id, snapshots_id,
                                                    foci_id)

    # Retrieve embeddings for each timespan
    jobs = [{
        'api_key': user_mediacloud_key(),
        'topics_id': topics_id,
        'snapshots_id': snapshots_id,
        'foci_id': foci_id,
        'overall_words': overall_words,
        'overall_embeddings': overall_embeddings,
        'q': q,
        'timespan': t,
    } for t in timespans]
    embeddings_by_timespan = _get_all_timespan_embeddings(jobs)
    return jsonify({'list': embeddings_by_timespan})
Example #7
def topic_words(topics_id):
    sample_size = request.args.get('sample_size', WORD_COUNT_SAMPLE_SIZE)

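    # public topics can be viewed without logging in, so the shared tool API key is used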
    if access_public_topic(topics_id):
        results = apicache.topic_word_counts(TOOL_API_KEY,
                                             topics_id,
                                             sample_size=sample_size,
                                             snapshots_id=None,
                                             timespans_id=None,
                                             foci_id=None,
                                             q=None)
    elif is_user_logged_in():
        # grab the top words, respecting all the filters
        results = apicache.topic_word_counts(user_mediacloud_key(),
                                             topics_id,
                                             sample_size=sample_size)
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})

    totals = []  # important so that these get reset on the client when they aren't requested
    logger.debug(request.args)
    if is_user_logged_in() and request.args.get('withTotals') == "true":
        # return along with the results for the overall timespan, to facilitate comparison
        snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
        overall_timespan = _find_overall_timespan(topics_id, snapshots_id)
        totals = apicache.topic_word_counts(
            user_mediacloud_key(),
            topics_id,
            sample_size=sample_size,
            timespans_id=overall_timespan['timespans_id'],
            foci_id=None,
            q=None)

    response = {
        'list': results[:WORD_COUNT_UI_NUM_WORDS],
        'totals': totals[:WORD_COUNT_UI_NUM_WORDS],
        'sample_size': str(sample_size)
    }
    return jsonify(response)
Example #8
def topic_w2v_timespan_embeddings(topics_id):
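    # place the topic's top words in 2-d word2vec space, overall and for each timespan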
    args = {
        'snapshots_id': request.args.get('snapshotId'),
        'foci_id': request.args.get('focusId'),
        'q': request.args.get('q'),
    }

    # Retrieve embeddings for overall topic
    overall_word_counts = topic_word_counts(user_mediacloud_key(), topics_id, num_words=50, **args)
    overall_words = [x['term'] for x in overall_word_counts]
    overall_embeddings = {x['term']: (x['google_w2v_x'], x['google_w2v_y']) for x in overall_word_counts}

    # Retrieve top words for each timespan
    timespans = cached_topic_timespan_list(user_mediacloud_key(), topics_id, args['snapshots_id'], args['foci_id'])

    # Retrieve embeddings for each timespan
    func = partial(grab_timespan_embeddings, user_mediacloud_key(), topics_id, args, overall_words, overall_embeddings)
    with Pool(processes=WORD2VEC_TIMESPAN_POOL_PROCESSES) as p:  # ensure worker processes are cleaned up
        ts_embeddings = p.map(func, timespans)

    return jsonify({'list': ts_embeddings})
Example #9
def media_words(topics_id, media_id):
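    # top 100 words for one medium, merged with the user's current query filters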
    query = apicache.add_to_user_query('media_id:'+media_id)
    word_list = apicache.topic_word_counts(user_mediacloud_key(), topics_id, q=query)[:100]
    return jsonify(word_list)
Example #10
def story_words(topics_id, stories_id):
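    # top 100 words used within a single story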
    word_list = topic_word_counts(user_mediacloud_key(), topics_id, q='stories_id:'+stories_id)[:100]
    return jsonify(word_list)
Example #11
def topic_words_csv(topics_id):
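    # stream the sampled word counts as a CSV download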
    response = topic_word_counts(user_mediacloud_key(), topics_id)
    props = ['term', 'stem', 'count']
    return csv.stream_response(response, props, 'sampled-words')
Example #12
def media_words_csv(topics_id, media_id):
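    # stream the word counts for one medium as a CSV download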
    query = add_to_user_query('media_id:'+media_id)
    word_list = topic_word_counts(user_mediacloud_key(), topics_id, q=query)
    props = ['term', 'stem', 'count']
    return csv.stream_response(word_list, props, 'media-'+str(media_id)+'-words')
Example #13
def topic_word(topics_id, word):
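    # look up the count entry for a single word in the topic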
    response = topic_word_counts(user_mediacloud_key(), topics_id, q=word)[:1]
    logger.debug(response)
    return jsonify(response)
Example #14
def topic_word_associated_words_csv(topics_id, word):
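    # stream the words that co-occur with the given word as a CSV download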
    response = topic_word_counts(user_mediacloud_key(), topics_id, q=word)
    props = ['term', 'stem', 'count']
    return csv.stream_response(response, props,
                               'word-' + word + '-sampled-words')
Example #15
def topic_word_associated_words(topics_id, word):
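    # top 100 words from stories that match the given word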
    response = topic_word_counts(user_mediacloud_key(), topics_id,
                                 q=word)[:100]
    return jsonify(response)
Example #16
def topic_word_associated_words(topics_id, word):
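    # top 100 co-occurring words, merged with the user's current query filters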
    query = apicache.add_to_user_query(word)
    response = apicache.topic_word_counts(user_mediacloud_key(), topics_id, q=query)[:100]
    return jsonify(response)
Example #17
def media_words(topics_id, media_id):
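    # top 100 words for one medium, merged with the user's current query filters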
    query = apicache.add_to_user_query('media_id:' + media_id)
    word_list = apicache.topic_word_counts(user_mediacloud_key(),
                                           topics_id,
                                           q=query)[:100]
    return jsonify(word_list)
Example #18
def story_words_csv(topics_id, stories_id):
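    # stream the words for one story as a CSV download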
    word_list = topic_word_counts(user_mediacloud_key(), topics_id, q='stories_id:'+stories_id)
    props = ['term', 'stem', 'count']
    return csv.stream_response(word_list, props, 'story-'+str(stories_id)+'-words')
Example #19
def topic_word(topics_id, word):
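    # look up the count entry for a single word in the topic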
    response = apicache.topic_word_counts(user_mediacloud_key(), topics_id, q=word)[:1]
    return jsonify(response)
Example #20
def story_words(topics_id, stories_id):
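    # top 100 words used within a single story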
    word_list = apicache.topic_word_counts(user_mediacloud_key(), topics_id, q='stories_id:'+stories_id)[:100]
    return jsonify(word_list)
Example #21
def topic_word(topics_id, word):
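    # look up the count entry for a single word in the topic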
    response = apicache.topic_word_counts(user_mediacloud_key(),
                                          topics_id,
                                          q=word)[:1]
    return jsonify(response)