def stream_story_count_csv(fn, search_id_or_query_list):
    '''
    Helper method to stream per-query story counts back to the client as a csv.

    `search_id_or_query_list` is either a value that int() accepts, which is
    used as an index into the sample searches, or a JSON-encoded list of custom
    queries. `fn` is the filename prefix used for custom queries.
    '''
    filename = ''
    story_count_results = []
    # Only the int() conversion decides which kind of input we got. Anything
    # that goes wrong afterwards is a real error and must not be silently
    # re-routed into the custom-query branch.
    try:
        search_id = int(search_id_or_query_list)
    except (TypeError, ValueError):
        search_id = None

    if search_id is None:
        custom_queries = json.loads(search_id_or_query_list)
        for query in custom_queries:
            solr_query = parse_query_with_keywords(query)
            # the last custom query determines the filename
            filename = fn + query['q']
            story_list = cached_story_count(solr_query)
            story_count_results.append({'query': query['label'], 'count': story_list['count']})
    elif search_id >= 0:
        # if we have a search id, we load the samples from our sample searches file
        sample_queries = load_sample_searches()[search_id]['queries']
        for query in sample_queries:
            solr_query = prep_simple_solr_query(query)
            story_list = cached_story_count(solr_query)
            story_count_results.append({'query': query['label'], 'count': story_list['count']})
    # NOTE(review): a negative search id yields an empty csv, and the sample
    # path keeps an empty filename - both are unchanged; confirm this is intended.

    props = ['query', 'count']
    return csv.stream_response(story_count_results, props, filename)
def explorer_story_count_csv():
    """Stream a csv of matching vs. total story counts for each query in the posted form.

    The form carries either `searchId` (an index into the sample searches) or
    `queries` (a JSON-encoded list of query dicts). One row is written per
    query with the columns query, matching_stories, total_stories and ratio.
    """
    filename = 'total-story-count'
    data = request.form
    if 'searchId' in data:
        # TODO: don't load this query twice because that is kind of dumb
        sample_searches = load_sample_searches()
        # form values arrive as strings; the sample searches are indexed by
        # int everywhere else in this file
        queries = sample_searches[int(data['searchId'])]['queries']
    else:
        queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        solr_q, solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*',
                                                     media_ids=q['sources'],
                                                     tags_ids=q['collections'])
        story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        matching = float(story_counts['total'])
        total = float(story_counts['normalized_total'])
        story_count_results.append({
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            # an empty source/collection set has no stories at all; report 0 instead of crashing
            'ratio': matching / total if total else 0.0,
        })
    props = ['query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)
def api_explorer_demo_compare_words():
    """Return word counts for every query being compared, as JSON `{"results": [...]}`.

    With a `search_id` other than -1 the queries come from the matching sample
    search. Otherwise `compared_queries[]` is split on commas and each piece is
    read as a `key=value&key=value` string whose first character is dropped.
    """
    search_id = int(request.args['search_id']) if 'search_id' in request.args else None
    if search_id not in [None, -1]:
        sample_searches = load_sample_searches()
        queries = sample_searches[search_id]['queries']
    else:
        queries = []
        for cq in request.args['compared_queries[]'].split(','):
            # partition splits on the first "=" only, so values that contain
            # "=" survive intact and a bare key maps to an empty string
            pairs = (pair.partition("=") for pair in cq[1:].split("&"))
            queries.append({key: value for key, _sep, value in pairs})

    results = []
    for query in queries:
        solr_q, solr_fq = parse_query_with_keywords(query)
        results.append(query_wordcount(solr_q, solr_fq))
    return jsonify({"results": results})
def api_explorer_demo_sentences_count():
    """Return cached sentence counts over time for the demo query as JSON.

    The date range defaults to the last 14 days. When a sample search is
    selected (`search_id` present and not -1) and `index` points at one of its
    queries, that query's `startDate`/`endDate` are used instead.
    """
    now = datetime.datetime.now()
    start_date = (now - datetime.timedelta(days=14)).strftime("%Y-%m-%d")
    end_date = now.strftime("%Y-%m-%d")
    search_id = int(request.args['search_id']) if 'search_id' in request.args else None
    index = int(request.args['index']) if 'index' in request.args else None
    if search_id not in [None, -1]:
        sample_searches = load_sample_searches()
        current_search = sample_searches[search_id]['queries']
        solr_query = parse_query_with_args_and_sample_search(request.args, current_search)
        # index is optional; without it we keep the default two-week window
        if index is not None and index < len(current_search):
            start_date = current_search[index]['startDate']
            end_date = current_search[index]['endDate']
    else:
        solr_query = parse_query_with_keywords(request.args)
    # The direct mc.sentenceCount call that used to precede this was dropped:
    # its result was never used, so it only added a second, uncached request.
    results = cached_by_query_sentence_counts(solr_query, start_date, end_date)
    return jsonify(results)
def explorer_story_count_csv():
    """Stream a csv of matching vs. total story counts for each query in the posted form.

    The form carries either `searchId` (an index into the sample searches) or
    `queries` (a JSON-encoded list of query dicts). One row is written per
    query with the columns query, matching_stories, total_stories and ratio.
    """
    filename = u'total-story-count'
    data = request.form
    if 'searchId' in data:
        # TODO: don't load this query twice because that is kind of dumb
        sample_searches = load_sample_searches()
        # form values arrive as strings; the sample searches are indexed by
        # int everywhere else in this file
        queries = sample_searches[int(data['searchId'])]['queries']
    else:
        queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        solr_q, solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*',
                                                     media_ids=q['sources'],
                                                     tags_ids=q['collections'])
        story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        matching = float(story_counts['total'])
        total = float(story_counts['normalized_total'])
        story_count_results.append({
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            # an empty source/collection set has no stories at all; report 0 instead of crashing
            'ratio': matching / total if total else 0.0,
        })
    props = ['query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)
def api_explorer_story_split_count_csv():
    """Stream a csv with one row of story counts per query in the posted form.

    The form carries either `searchId` (an index into the sample searches) or
    `queries` (a JSON-encoded list of query dicts). Each row holds the query's
    `startDate`, its label, the matching and total story counts, and their ratio.
    """
    filename = 'stories-over-time'
    data = request.form
    if 'searchId' in data:
        # The earlier parse_as_sample(...) call and the `filename = filename`
        # self-assignment were removed: neither result was used, since every
        # query is re-parsed in the loop below.
        sample_searches = load_sample_searches()
        # form values arrive as strings; the sample searches are indexed by
        # int everywhere else in this file
        queries = sample_searches[int(data['searchId'])]['queries']
    else:
        queries = json.loads(data['queries'])
    label = " ".join([q['label'] for q in queries])
    filename = file_name_for_download(label, filename)
    # now compute total attention for all results
    story_count_results = []
    for q in queries:
        solr_q, solr_fq = parse_query_with_keywords(q)
        solr_open_query = concatenate_query_for_solr(solr_seed_query='*',
                                                     media_ids=q['sources'],
                                                     tags_ids=q['collections'])
        story_counts = apicache.normalized_and_story_count(solr_q, solr_fq, solr_open_query)
        matching = float(story_counts['total'])
        total = float(story_counts['normalized_total'])
        story_count_results.append({
            'date': q['startDate'],
            'query': q['label'],
            'matching_stories': story_counts['total'],
            'total_stories': story_counts['normalized_total'],
            # avoid a ZeroDivisionError when nothing exists in the open query
            'ratio': matching / total if total else 0.0,
        })
    props = ['date', 'query', 'matching_stories', 'total_stories', 'ratio']
    return csv.stream_response(story_count_results, props, filename)
def stream_geo_csv(fn, search_id_or_query, index):
    """Stream country-level geotag counts for one query as a csv.

    `search_id_or_query` is either a value int() accepts (an index into the
    sample searches, with `index` selecting the query used for the filename)
    or a JSON-encoded list of custom queries, of which only the first is used.
    `fn` is the filename prefix.

    Raises ValueError for a negative sample search id.
    """
    filename = ''
    # TODO: there is duplicate code here...
    # Only the int() conversion tells the two input kinds apart; later failures
    # are real errors and are no longer swallowed by a blanket except.
    try:
        search_id = int(search_id_or_query)
    except (TypeError, ValueError):
        search_id = None

    if search_id is None:
        # so far, we will only be fielding one keyword csv query at a time, so we can use index of 0
        query = json.loads(search_id_or_query)
        current_query = query[0]
        solr_query = parse_query_with_keywords(current_query)
        filename = fn + current_query['q']
    elif search_id >= 0:
        current_search = load_sample_searches()[search_id]['queries']
        solr_query = parse_query_with_args_and_sample_search(search_id, current_search)
        query_index = int(index)
        if query_index < len(current_search):
            filename = fn + current_search[query_index]['q']
    else:
        # previously this surfaced later as an unbound `solr_query`
        raise ValueError("invalid sample search id: %r" % (search_id_or_query,))

    countries = []
    for r in cached_geotags(solr_query):
        geonames_id = int(r['tag'].split('_')[1])
        if geonames_id not in COUNTRY_GEONAMES_ID_TO_APLHA3:  # only include countries
            continue
        r['geonamesId'] = geonames_id
        r['alpha3'] = COUNTRY_GEONAMES_ID_TO_APLHA3[geonames_id]
        # WTF: why is the API returning this as a string and not a number?
        r['count'] = float(r['count']) / float(tag_utl.GEO_SAMPLE_SIZE)
        for hq in HIGHCHARTS_KEYS:
            if hq['properties']['iso-a3'] == r['alpha3']:
                r['iso-a2'] = hq['properties']['iso-a2']
                r['value'] = r['count']
        countries.append(r)
    props = ['label', 'count']
    return csv.stream_response(countries, props, filename)
def demo_top_tags_with_coverage(tag_sets_id,):
    """Return `apicache.top_tags_with_coverage` for the query described by the request args.

    A `search_id` other than -1 selects a sample search, whose queries are
    combined with the request args; otherwise the args are parsed as a keyword query.
    """
    # parses the query for you
    search_id = None
    if 'search_id' in request.args:
        search_id = int(request.args['search_id'])

    if search_id is None or search_id == -1:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    else:
        sample_queries = load_sample_searches()[search_id]['queries']
        solr_q, solr_fq = parse_query_with_args_and_sample_search(request.args, sample_queries)
    return apicache.top_tags_with_coverage(solr_q, solr_fq, tag_sets_id)
def demo_top_tags_with_coverage(tag_sets_id,):
    """Return `apicache.top_tags_with_coverage` for the query described by the request args.

    The sample-search path is taken only when `search_id` is present and not -1
    AND no `index` argument was supplied; otherwise the args are parsed as a
    keyword query.
    """
    # parses the query for you
    args = request.args
    search_id = None
    if 'search_id' in args:
        search_id = int(args['search_id'])
    query_index = None
    if 'index' in args:
        query_index = int(args['index'])

    # NOTE(review): other callers of parse_as_sample in this file pass
    # request.args['index'], while this branch runs only when no index was
    # given and passes the whole args object - confirm the intended condition.
    wants_sample = query_index is None and search_id is not None and search_id != -1
    if wants_sample:
        # the value is unused, but the lookup still fails fast on an unknown search_id
        sample_queries = load_sample_searches()[search_id]['queries']
        solr_q, solr_fq = parse_as_sample(search_id, request.args)
    else:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    return apicache.top_tags_with_coverage(solr_q, solr_fq, tag_sets_id)
def api_explorer_demo_geotag_count():
    """Return geo tag coverage for the demo query as JSON, with results filtered by `_filter_for_countries`.

    A `search_id` other than -1 selects a sample search; otherwise the request
    args are parsed as a keyword query.
    """
    search_id = None
    if 'search_id' in request.args:
        search_id = int(request.args['search_id'])

    if search_id is None or search_id == -1:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    else:
        sample_queries = load_sample_searches()[search_id]['queries']
        solr_q, solr_fq = parse_query_with_args_and_sample_search(request.args, sample_queries)

    coverage = apicache.top_tags_with_coverage(solr_q, solr_fq, tags.GEO_TAG_SET)
    coverage['results'] = _filter_for_countries(coverage['results'])
    return jsonify(coverage)
def api_explorer_demo_story_sample():
    """Return `cached_story_samples` for the demo query as JSON.

    A `search_id` other than -1 selects a sample search; otherwise the request
    args are parsed as a keyword query.
    """
    search_id = None
    if 'search_id' in request.args:
        search_id = int(request.args['search_id'])

    if search_id is None or search_id == -1:
        solr_query = parse_query_with_keywords(request.args)
    else:
        sample_queries = load_sample_searches()[search_id]['queries']
        solr_query = parse_query_with_args_and_sample_search(request.args, sample_queries)
    return jsonify(cached_story_samples(solr_query))
def api_explorer_demo_story_count():
    """Return `cached_story_count` for the demo query as JSON.

    A `search_id` other than -1 selects a sample search; otherwise the request
    args are parsed as a keyword query.
    """
    search_id = None
    if 'search_id' in request.args:
        search_id = int(request.args['search_id'])

    if search_id is None or search_id == -1:
        solr_query = parse_query_with_keywords(request.args)
    else:
        sample_queries = load_sample_searches()[search_id]['queries']
        solr_query = parse_query_with_args_and_sample_search(request.args, sample_queries)

    # maybe check admin role before we run this?
    count_result = cached_story_count(solr_query)
    # give them back new data, so they can update the client
    return jsonify(count_result)
def stream_sentence_count_csv(fn, search_id_or_query, index):
    """Stream sentence counts over time for one query as a csv with date and sentences columns.

    `search_id_or_query` is either a value int() accepts (an index into the
    sample searches, with `index` selecting the query that supplies the date
    range and filename) or a JSON-encoded list of custom queries, of which only
    the first is used. The date range defaults to the last 14 days. `fn` is the
    filename prefix, and also the whole filename when `index` is out of range.

    Raises ValueError for a negative sample search id.
    """
    now = datetime.datetime.now()
    start_date = (now - datetime.timedelta(days=14)).strftime("%Y-%m-%d")
    end_date = now.strftime("%Y-%m-%d")
    # fallback so an out-of-range index no longer leaves filename unbound
    filename = fn
    # TODO: some duplicate code here
    # Only the int() conversion tells the two input kinds apart; later failures
    # are real errors and are no longer swallowed by a blanket except.
    try:
        search_id = int(search_id_or_query)
    except (TypeError, ValueError):
        search_id = None

    if search_id is None:
        # so far, we will only be fielding one keyword csv query at a time, so we can use index of 0
        query = json.loads(search_id_or_query)
        current_query = query[0]
        # TODO don't mod the start and end date unless permissions
        solr_query = parse_query_with_keywords(current_query)
        filename = fn + current_query['q']
    elif search_id >= 0:
        current_search = load_sample_searches()[search_id]['queries']
        solr_query = parse_query_with_args_and_sample_search(search_id, current_search)
        query_index = int(index)
        if query_index < len(current_search):
            start_date = current_search[query_index]['startDate']
            end_date = current_search[query_index]['endDate']
            filename = fn + current_search[query_index]['q']
    else:
        # previously this surfaced later as an unbound `solr_query`
        raise ValueError("invalid sample search id: %r" % (search_id_or_query,))

    # get dates out of query?
    results = cached_by_query_sentence_counts(solr_query, start_date, end_date)
    # 'gap', 'start' and 'end' sit alongside the per-date entries and are skipped.
    # .items() works on both Python 2 and 3, unlike .iteritems()
    clean_results = [{'date': date, 'sentences': count}
                     for date, count in results['split'].items()
                     if date not in ['gap', 'start', 'end']]
    clean_results = sorted(clean_results, key=itemgetter('date'))
    props = ['date', 'sentences']
    return csv.stream_response(clean_results, props, filename)
def explorer_wordcount_csv():
    """Stream word counts for the query described by the request args via `stream_wordcount_csv`.

    A `search_id` other than -1 selects a sample search; otherwise the request
    args are parsed as a keyword query.
    """
    search_id = None
    if 'search_id' in request.args:
        search_id = int(request.args['search_id'])

    if search_id is None or search_id == -1:
        solr_query = parse_query_with_keywords(request.args)
    else:
        sample_queries = load_sample_searches()[search_id]['queries']
        solr_query = parse_query_with_args_and_sample_search(request.args, sample_queries)
    # TODO what about other params: date etc for demo..
    return stream_wordcount_csv(mc, 'wordcounts-Explorer', solr_query)
def api_explorer_demo_story_sample():
    """Return a random sample of stories for the demo query as JSON, each with its media attached.

    A `search_id` other than -1 selects a sample search; otherwise the request
    args are parsed as a keyword query.
    """
    search_id = None
    if 'search_id' in request.args:
        search_id = int(request.args['search_id'])

    if search_id is None or search_id == -1:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    else:
        sample_queries = load_sample_searches()[search_id]['queries']
        solr_q, solr_fq = parse_query_with_args_and_sample_search(request.args, sample_queries)

    stories = apicache.random_story_list(solr_q, solr_fq, SAMPLE_STORY_COUNT)
    # attach the media record to each story in place
    for item in stories:
        item["media"] = apicache.media(item["media_id"])
    return jsonify(stories)
def api_explorer_demo_story_sample():
    """Return a random sample of stories for the demo query as JSON `{"results": [...]}`.

    A `search_id` other than -1 selects a sample search, parsed with the
    `index` request arg (which is required on that path); otherwise the request
    args are parsed as a keyword query. Each story gets its media attached.
    """
    search_id = None
    if 'search_id' in request.args:
        search_id = int(request.args['search_id'])

    if search_id is None or search_id == -1:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    else:
        # the value is unused, but the lookup still fails fast on an unknown search_id
        sample_queries = load_sample_searches()[search_id]['queries']
        solr_q, solr_fq = parse_as_sample(search_id, request.args['index'])

    stories = apicache.random_story_list(solr_q, solr_fq, SAMPLE_STORY_COUNT)
    # attach the media record to each story in place
    for item in stories:
        item["media"] = server.views.apicache.media(item["media_id"])
    return jsonify({"results": stories})
def get_word_count():
    """Return word counts for the query described by the request args as JSON `{"list": [...]}`.

    A `search_id` other than -1 selects a sample search; otherwise the request
    args are parsed as a keyword query.
    """
    search_id = None
    if 'search_id' in request.args:
        search_id = int(request.args['search_id'])

    if search_id is None or search_id == -1:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    else:
        sample_queries = load_sample_searches()[search_id]['queries']
        solr_q, solr_fq = parse_query_with_args_and_sample_search(request.args, sample_queries)

    # return combined data
    return jsonify({"list": query_wordcount(solr_q, solr_fq)})
def explorer_stories_csv(search_id_or_query, index):
    """Stream a sample of stories for one query as a csv.

    `search_id_or_query` is either a value int() accepts (an index into the
    sample searches, with `index` selecting the query used for the filename)
    or a JSON-encoded list of custom queries, of which only the first is used.

    Raises ValueError for a negative sample search id.
    """
    filename = ''
    # Only the int() conversion tells the two input kinds apart; later failures
    # are real errors and are no longer swallowed by a blanket except.
    try:
        search_id = int(search_id_or_query)
    except (TypeError, ValueError):
        search_id = None

    if search_id is None:
        # so far, we will only be fielding one keyword csv query at a time, so we can use index of 0
        query = json.loads(search_id_or_query)
        current_query = query[0]
        solr_query = parse_query_with_keywords(current_query)
        filename = 'explorer-stories-' + current_query['q']
    elif search_id >= 0:
        current_search = load_sample_searches()[search_id]['queries']
        solr_query = parse_query_with_args_and_sample_search(search_id, current_search)
        query_index = int(index)
        if query_index < len(current_search):
            filename = 'explorer-stories-' + current_search[query_index]['q']
    else:
        # previously this surfaced later as an unbound `solr_query`
        raise ValueError("invalid sample search id: %r" % (search_id_or_query,))

    story_count_result = cached_story_samples(solr_query)
    return stream_story_samples_csv(filename, story_count_result)
def _get_word_count():
    """Return word counts for the request's query as JSON, along with the sample size used.

    `sample_size` comes from the request args and falls back to
    WORD_COUNT_SAMPLE_SIZE. A `search_id` other than -1 selects a sample
    search; otherwise the request args are parsed as a keyword query.
    """
    search_id = None
    if 'search_id' in request.args:
        search_id = int(request.args['search_id'])
    sample_size = WORD_COUNT_SAMPLE_SIZE
    if 'sample_size' in request.args:
        sample_size = int(request.args['sample_size'])

    if search_id is None or search_id == -1:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    else:
        sample_queries = load_sample_searches()[search_id]['queries']
        solr_q, solr_fq = parse_query_with_args_and_sample_search(request.args, sample_queries)

    words = query_wordcount(solr_q, solr_fq, sample_size=sample_size)
    # return combined data
    return jsonify({"list": words, "sample_size": str(sample_size)})
def api_explorer_demo_story_split_count():
    """Return normalized story split counts for the demo query as JSON `{'results': ...}`.

    A `search_id` other than -1 selects a sample search, parsed with the
    `index` request arg (which is required on that path); otherwise the request
    args are parsed as a keyword query. The open query used for normalization
    covers DEFAULT_COLLECTION_IDS.
    """
    search_id = None
    if 'search_id' in request.args:
        search_id = int(request.args['search_id'])

    if search_id is None or search_id == -1:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    else:
        # the value is unused, but the lookup still fails fast on an unknown search_id
        sample_queries = load_sample_searches()[search_id]['queries']
        solr_q, solr_fq = parse_as_sample(search_id, request.args['index'])

    # why is this call fundamentally different than the cache call???
    open_query = concatenate_query_for_solr(solr_seed_query='*',
                                            media_ids=[],
                                            tags_ids=DEFAULT_COLLECTION_IDS)
    split_counts = apicache.normalized_and_story_split_count(solr_q, solr_fq, open_query)
    return jsonify({'results': split_counts})
def geotag_count():
    """Return country-level geotag counts for the request's query as a JSON list.

    A `search_id` other than -1 selects a sample search; otherwise the request
    args are parsed as a keyword query. Each kept result gains `geonamesId`,
    `alpha3` and a `count` divided by GEO_SAMPLE_SIZE, plus `iso-a2` and
    `value` when a HIGHCHARTS_KEYS entry matches its alpha3 code.
    """
    # The unused start_date/end_date/index locals were removed; nothing below read them.
    search_id = int(request.args['search_id']) if 'search_id' in request.args else None
    if search_id not in [None, -1]:
        sample_searches = load_sample_searches()
        current_search = sample_searches[search_id]['queries']
        solr_query = parse_query_with_args_and_sample_search(request.args, current_search)
    else:
        solr_query = parse_query_with_keywords(request.args)
    # TODO coverage here
    # total_stories = mc.storyCount(solr_query)
    # geotagged_stories = mc.storyCount("({}) AND (tags_id_stories:{})".format(solr_query, CLIFF_CLAVIN_2_3_0_TAG_ID))
    # coverage_pct = float(geotagged_stories) / float(total_stories)
    countries = []
    for r in cached_geotags(solr_query):
        geonames_id = int(r['tag'].split('_')[1])
        if geonames_id not in COUNTRY_GEONAMES_ID_TO_APLHA3:  # only include countries
            continue
        r['geonamesId'] = geonames_id  # TODO: move this to JS?
        r['alpha3'] = COUNTRY_GEONAMES_ID_TO_APLHA3[geonames_id]
        # WTF: why is the API returning this as a string and not a number?
        r['count'] = float(r['count']) / float(tag_utl.GEO_SAMPLE_SIZE)
        for hq in HIGHCHARTS_KEYS:
            if hq['properties']['iso-a3'] == r['alpha3']:
                r['iso-a2'] = hq['properties']['iso-a2']
                r['value'] = r['count']
        countries.append(r)
    # results = {'coverage': coverage_pct, 'list': res }
    return jsonify(countries)
def api_explorer_demo_story_split_count():
    """Return normalized story split counts for the demo query as JSON `{'results': ...}`.

    A `search_id` other than -1 selects a sample search, combined with the
    request args; otherwise the args are parsed as a keyword query. The open
    query used for normalization covers DEFAULT_COLLECTION_IDS.
    """
    search_id = None
    if 'search_id' in request.args:
        search_id = int(request.args['search_id'])

    if search_id is None or search_id == -1:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    else:
        sample_queries = load_sample_searches()[search_id]['queries']
        solr_q, solr_fq = parse_query_with_args_and_sample_search(request.args, sample_queries)

    # why is this call fundamentally different than the cache call???
    open_query = concatenate_query_for_solr(solr_seed_query='*',
                                            media_ids=[],
                                            tags_ids=DEFAULT_COLLECTION_IDS)
    split_counts = apicache.normalized_and_story_split_count(solr_q, solr_fq, open_query)
    return jsonify({'results': split_counts})
def api_explorer_story_split_count():
    """Return normalized story split counts for the request's query as JSON `{'results': ...}`.

    A `search_id` other than -1 selects a sample search, parsed with the
    `index` request arg; otherwise the args are parsed as a keyword query. The
    open query used for normalization is built from the `sources` and
    `collections` request args.
    """
    search_id = None
    if 'search_id' in request.args:
        search_id = int(request.args['search_id'])
    # parsed but not read below; a non-numeric value still raises here
    query_index = None
    if 'index' in request.args:
        query_index = int(request.args['index'])

    # get specific stories by keyword
    if search_id is None or search_id == -1:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    else:
        # the value is unused, but the lookup still fails fast on an unknown search_id
        sample_queries = load_sample_searches()[search_id]['queries']
        solr_q, solr_fq = parse_as_sample(search_id, request.args['index'])

    # get all the stories (no keyword)
    open_query = concatenate_query_for_solr(solr_seed_query='*',
                                            media_ids=request.args['sources'],
                                            tags_ids=request.args['collections'])
    split_counts = apicache.normalized_and_story_split_count(solr_q, solr_fq, open_query)
    return jsonify({'results': split_counts})
def api_explorer_story_split_count():
    """Return normalized story split counts for the request's query as JSON `{'results': ...}`.

    A `search_id` other than -1 selects a sample search, combined with the
    request args; otherwise the args are parsed as a keyword query. The open
    query used for normalization is built from the `sources` and `collections`
    request args.
    """
    search_id = None
    if 'search_id' in request.args:
        search_id = int(request.args['search_id'])
    # parsed but not read below; a non-numeric value still raises here
    query_index = None
    if 'index' in request.args:
        query_index = int(request.args['index'])

    if search_id is None or search_id == -1:
        solr_q, solr_fq = parse_query_with_keywords(request.args)
    else:
        sample_queries = load_sample_searches()[search_id]['queries']
        solr_q, solr_fq = parse_query_with_args_and_sample_search(request.args, sample_queries)

    open_query = concatenate_query_for_solr(solr_seed_query='*',
                                            media_ids=request.args['sources'],
                                            tags_ids=request.args['collections'])
    split_counts = apicache.normalized_and_story_split_count(solr_q, solr_fq, open_query)
    return jsonify({'results': split_counts})
def api_explorer_demo_compare_words():
    """Return word counts for every query being compared, as JSON `{"results": [...]}`.

    With a `search_id` other than -1 the queries come from the matching sample
    search. Otherwise `compared_queries[]` is split on commas and each piece is
    read as a `key=value&key=value` string whose first character is dropped.
    """
    search_id = int(request.args['search_id']) if 'search_id' in request.args else None
    if search_id not in [None, -1]:
        sample_searches = load_sample_searches()
        queries = sample_searches[search_id]['queries']
    else:
        queries = []
        for cq in request.args['compared_queries[]'].split(','):
            # partition splits on the first "=" only, so values that contain
            # "=" survive intact and a bare key maps to an empty string
            pairs = (pair.partition("=") for pair in cq[1:].split("&"))
            queries.append({key: value for key, _sep, value in pairs})

    results = []
    for query in queries:
        solr_q, solr_fq = parse_query_with_keywords(query)
        results.append(query_wordcount(solr_q, solr_fq))
    return jsonify({"results": results})
import logging from flask import jsonify, request import flask_login import json from server import app import server.util.csv as csv import server.util.pushshift as pushshift from server.util.request import api_error_handler from server.views.explorer import parse_as_sample,\ parse_query_with_keywords, load_sample_searches, file_name_for_download, concatenate_query_for_solr,\ DEFAULT_COLLECTION_IDS, only_queries_reddit, parse_query_dates import server.views.explorer.apicache as apicache SAMPLE_SEARCHES = load_sample_searches() logger = logging.getLogger(__name__) @app.route('/api/explorer/stories/count.csv', methods=['POST']) def explorer_story_count_csv(): filename = 'total-story-count' data = request.form if 'searchId' in data: queries = SAMPLE_SEARCHES[data['searchId']]['queries'] else: queries = json.loads(data['queries']) label = " ".join([q['label'] for q in queries]) filename = file_name_for_download(label, filename) # now compute total attention for all results story_count_results = [] for q in queries: