Example #1
def explorer_story_count_csv():
    filename = 'total-story-count'
    data = request.form
    queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        if (len(q['collections']) == 0) and only_queries_reddit(q['sources']):
            start_date, end_date = parse_query_dates(q)
            provider = RedditPushshiftProvider()
            story_counts = provider.normalized_count_over_time(query=q['q'],
                                                               start_date=start_date,
                                                               end_date=end_date,
                                                               subreddits=NEWS_SUBREDDITS)
        else:
            solr_q, solr_fq = parse_query_with_keywords(q)
            solr_open_query = concatenate_query_for_solr(solr_seed_query='*', media_ids=q['sources'],
                                                         tags_ids=q['collections'])
            story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        story_count_results.append({
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            'ratio': float(story_counts['total']) / float(story_counts['normalized_total'])
        })
    props = ['query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)
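
Every example on this page calls concatenate_query_for_solr without defining it. Judging only from the call sites, it appears to AND a seed query together with media-id and collection-tag clauses. The sketch below is a hypothetical reconstruction for orientation (the Solr field names media_id and tags_id_media are assumptions, and the custom_ids handling is omitted); the real implementation lives in the Media Cloud codebase:

def concatenate_query_for_solr_sketch(solr_seed_query, media_ids=None, tags_ids=None):
    # hypothetical stand-in: AND the seed query with OR-ed source/collection clauses
    clauses = []
    if media_ids:
        clauses.append('media_id:({})'.format(' OR '.join(str(m) for m in media_ids)))
    if tags_ids:
        clauses.append('tags_id_media:({})'.format(' OR '.join(str(t) for t in tags_ids)))
    if clauses:
        return '({}) AND ({})'.format(solr_seed_query, ' OR '.join(clauses))
    return solr_seed_query

# concatenate_query_for_solr_sketch('climate', media_ids=[1, 2], tags_ids=[9139487])
# -> '(climate) AND (media_id:(1 OR 2) OR tags_id_media:(9139487))'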
Example #2
def parse_query_with_keywords(args):
    solr_q = ''
    solr_fq = None
    # TODO: should this be broken out into a routine that adds the start/end dates
    # explicitly, instead of relying on the try statement to catch missing arguments?
    try:  # if user arguments are present and allowed by the client endpoint, use them, otherwise use defaults
        current_query = args['q']
        if current_query == '':
            current_query = "*"
        start_date, end_date = parse_query_dates(args)
        media_ids = _parse_media_ids(args)
        collections = _parse_collection_ids(args)
        searches = args['searches'] if 'searches' in args else []
        solr_q = concatenate_query_for_solr(solr_seed_query=current_query,
                                            media_ids=media_ids,
                                            tags_ids=collections,
                                            custom_ids=searches)
        solr_fq = dates_as_filter_query(start_date.strftime("%Y-%m-%d"),
                                        end_date.strftime("%Y-%m-%d"))
    # otherwise, default
    except Exception as e:
        logger.warning("user custom query failed, there's a problem with the arguments: %s", e)
    return solr_q, solr_fq
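
dates_as_filter_query is likewise not shown. Given the %Y-%m-%d-formatted strings it receives, it presumably renders a Solr date-range filter along these lines; this is a sketch, and the publish_day field name is an assumption:

def dates_as_filter_query_sketch(start_date, end_date):
    # hypothetical stand-in: Solr range filter over the story publication day
    return 'publish_day:[{}T00:00:00Z TO {}T00:00:00Z]'.format(start_date, end_date)

# dates_as_filter_query_sketch('2020-01-01', '2020-02-01')
# -> 'publish_day:[2020-01-01T00:00:00Z TO 2020-02-01T00:00:00Z]'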
Example #3
@classmethod
def _as_query_and_filter_query(cls, query: str, start_date: dt.datetime,
                               end_date: dt.datetime,
                               **kwargs) -> (str, str):
    """
    Take all the query params and return q and fq suitable for a media cloud solr-syntax query.
    :param query: solr-syntax seed query
    :param start_date: start of the date range to filter on
    :param end_date: end of the date range to filter on
    :param kwargs: optional `sources` (media ids) and `collections` (tag ids)
    :return: a (q, fq) pair of solr query strings
    """
    media_ids = kwargs['sources'] if 'sources' in kwargs else []
    tags_ids = kwargs['collections'] if 'collections' in kwargs else []
    q = concatenate_query_for_solr(query, media_ids, tags_ids)
    fq = MediaCloud.dates_as_query_clause(start_date, end_date)
    return q, fq
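
A call might look like the following; the enclosing class isn't shown in the snippet, so SomeProvider is a stand-in name and the id values are purely illustrative:

import datetime as dt

q, fq = SomeProvider._as_query_and_filter_query(
    'climate change',
    dt.datetime(2020, 1, 1),
    dt.datetime(2020, 2, 1),
    sources=[1, 2],         # media ids
    collections=[9139487])  # collection tag ids
# q is the concatenated solr query, fq the date-range filter clause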
Example #4
def _topic_seed_story_count(topic):
    try:
        seed_query_count = shared_apicache.story_count(
            q=concatenate_query_for_solr(
                solr_seed_query=topic['solr_seed_query'],
                media_ids=[
                    m['media_id'] for m in topic['media'] if 'media_id' in m
                ],
                tags_ids=[
                    t['tags_id'] for t in topic['media_tags'] if 'tags_id' in t
                ]),
            fq=concatenate_solr_dates(start_date=topic['start_date'],
                                      end_date=topic['end_date']))['count']
    except mediacloud.error.MCException:
        # the query syntax is wrong (perhaps pre-story-level search)
        seed_query_count = None
    return seed_query_count
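
Because the helper returns None when the API rejects the seed query, callers have to treat the count as optional: None is distinct from a query that legitimately matched zero stories. A minimal, hypothetical caller:

# hypothetical caller, showing how the None sentinel should be handled
count = _topic_seed_story_count(topic)
if count is None:
    status = 'seed query invalid'  # e.g. pre-story-level search syntax
else:
    status = '{} seed stories'.format(count)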
Example #5
def api_explorer_story_split_count():
    start_date, end_date = parse_query_dates(request.form)
    if only_queries_reddit(request.form):
        provider = RedditPushshiftProvider()
        results = provider.normalized_count_over_time(query=request.form['q'],
                                                      start_date=start_date, end_date=end_date,
                                                      subreddits=NEWS_SUBREDDITS)
    else:
        # get specific stories by keyword
        solr_q, _solr_fq = parse_query_with_keywords(request.form)
        # get all the stories (no keyword) so we can support normalization
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*',
                                                     media_ids=request.form['sources'],
                                                     tags_ids=request.form['collections'],
                                                     custom_ids=request.form['searches'])
        results = apicache.normalized_and_story_split_count(solr_q, solr_open_query, start_date, end_date)
    return jsonify({'results': results})
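
Whichever branch runs, downstream code (Examples #6 and #7 below) reads the result as a dict whose 'counts' list carries date, count, total_count and ratio keys, so both providers evidently normalize to a shape like this (values are illustrative):

results = {
    'counts': [
        {'date': '2020-01-01', 'count': 12, 'total_count': 480, 'ratio': 0.025},
        {'date': '2020-01-02', 'count': 9, 'total_count': 510, 'ratio': 0.0176},
    ]
}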
Example #6
def api_explorer_combined_story_split_count_csv():
    filename = 'stories-over-time'
    data = request.form
    queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        start_date, end_date = parse_query_dates(q)
        if (len(q['collections']) == 0) and only_queries_reddit(q['sources']):
            provider = RedditPushshiftProvider()
            story_counts = provider.normalized_count_over_time(
                query=q['q'],
                start_date=start_date,
                end_date=end_date,
                subreddits=NEWS_SUBREDDITS)
        else:
            solr_q, solr_fq = parse_query_with_keywords(q)
            solr_open_query = concatenate_query_for_solr(
                solr_seed_query='*',
                media_ids=q['sources'],
                tags_ids=q['collections'],
                custom_ids=q['searches'])
            story_counts = apicache.normalized_and_story_split_count(
                solr_q, solr_open_query, start_date, end_date)
        story_count_results.append({
            'label': q['label'],
            'by_date': story_counts['counts'],
        })
    # now combine them by date
    data = []
    dates = [d['date'] for d in story_count_results[0]['by_date']]
    for idx in range(len(dates)):
        row = {'date': dates[idx]}
        for q in story_count_results:
            row[q['label'] + '-count'] = q['by_date'][idx]['count']
            row[q['label'] + '-total_count'] = q['by_date'][idx]['total_count']
            row[q['label'] + '-ratio'] = q['by_date'][idx]['ratio']
        data.append(row)
    props = ['date'] + [q['label'] + '-count' for q in queries] + [
        q['label'] + '-total_count' for q in queries
    ] + [q['label'] + '-ratio' for q in queries]
    return csv.stream_response(data, props, filename)
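
Note that the combining loop above indexes each query's by_date list by position, which assumes every query came back with an identical, aligned set of dates. If that guarantee ever weakens, merging on the date value itself is more defensive; a sketch producing the same row layout:

# defensive variant: key rows by date instead of by list position
rows = {}
for q in story_count_results:
    for entry in q['by_date']:
        row = rows.setdefault(entry['date'], {'date': entry['date']})
        row[q['label'] + '-count'] = entry['count']
        row[q['label'] + '-total_count'] = entry['total_count']
        row[q['label'] + '-ratio'] = entry['ratio']
data = [rows[d] for d in sorted(rows)]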
Example #7
def api_explorer_story_split_count_csv():
    filename = 'stories-over-time'
    data = request.form
    q = json.loads(data['q'])
    filename = file_name_for_download(q['label'], filename)
    # now compute total attention for the query
    start_date, end_date = parse_query_dates(q)
    if (len(q['collections']) == 0) and only_queries_reddit(q['sources']):
        provider = RedditPushshiftProvider()
        story_counts = provider.normalized_count_over_time(query=q['q'],
                                                           start_date=start_date,
                                                           end_date=end_date,
                                                           subreddits=NEWS_SUBREDDITS)
    else:
        solr_q, _solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*',
                                                     media_ids=q['sources'],
                                                     tags_ids=q['collections'],
                                                     custom_ids=q['searches'])
        story_counts = apicache.normalized_and_story_split_count(solr_q, solr_open_query, start_date, end_date)
    props = ['date', 'count', 'total_count', 'ratio']
    return csv.stream_response(story_counts['counts'], props, filename)
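
The csv module used throughout these examples is a project helper rather than the standard library: it streams the list of dicts back as a CSV attachment, with columns ordered by props. A rough stand-in built on Flask and the stdlib csv module might look like this (buffered rather than truly streamed, and the name is hypothetical):

import csv as std_csv
import io

from flask import Response

def stream_response_sketch(rows, props, filename):
    # render the dicts to CSV with columns ordered by props, ignoring extra keys
    buf = io.StringIO()
    writer = std_csv.DictWriter(buf, fieldnames=props, extrasaction='ignore')
    writer.writeheader()
    writer.writerows(rows)
    # serve as a downloadable attachment, matching how the examples name files
    return Response(buf.getvalue(), mimetype='text/csv',
                    headers={'Content-Disposition': 'attachment; filename={}.csv'.format(filename)})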