def analyze():
    """Render the dashboard page ('analyze.html').

    Reads the `last_update` value stored in document 1 of the 'appdata'
    index, then runs five term queries (bookmarked items and objectives
    1-4) against the 'projects' and 'publications' indices, keeping at most
    1000 hits per query.
    """
    last_update = client.get(index='appdata', doc_type='doc', id=1)['_source']['last_update']

    def first_thousand(**term):
        # One term query across both document indices, capped at 1000 hits.
        search = query.run_query(Q("term", **term), index=['projects', 'publications'])
        return search[:1000].execute()

    content = {'bookmarked': first_thousand(bookmarked=True)}
    objective_slots = (
        ('obj1', "objective1"),
        ('obj2', "objective2"),
        ('obj3', "objective3"),
        ('obj4', "objective4"),
    )
    for slot, objective in objective_slots:
        content[slot] = first_thousand(objectives=objective)

    # Every form field is set to -1 for this page.
    form_fields = ('type', 'query', 'index', 'topic', 'element', 'status', 'date_range', 'sort_by')
    formdata = dict.fromkeys(form_fields, -1)

    return render_template(
        'analyze.html',
        title='Analyze',
        heading='Dashboard',
        last_update=last_update,
        content=content,
        formdata=formdata,
    )
def explore():
    """Render the explore page with up to 100 hits for every known topic.

    Filter settings are read from the submitted form (with defaults), one
    query is run per entry of the module-level `topics`, and the responses
    are passed to 'explore.html' keyed by topic.
    """
    form = request.form

    # Submitted form values, falling back to the page defaults.
    index = form.get('index', 'projects')
    doc_type = index[:-1]  # drop the last character of the index name
    element = form.get('element', 'all')
    status = form.get('status', 'all')
    date_range = form.get('dateRange', '50')
    sort_by = form.get('sortBy', 'date')

    filters = {
        'element': element,
        'status': status,
        'date_range': date_range,
        'sort_by': sort_by,
        'doc_type': doc_type,
    }
    # Same values as the filters, plus a placeholder topic entry.
    buttonStates = {'topic': "None", **filters}

    content = {}
    for topic in topics:
        topic_query = query.Query(**query.get_query_arguments(topic))
        search = query.run_query(topic_query.query, index=index, filters=filters)
        content[topic] = search[:100].execute()  # first 100 hits only

    formdata = {
        'type': form.get('type', 'search'),
        'query': form.get('query'),
        'index': index,
        'topic': form.get('topic', 'all'),
        'element': element,
        'status': status,
        'date_range': date_range,
        'sort_by': sort_by,
    }

    last_update = client.get(index='appdata', doc_type='doc', id=1)['_source']['last_update']
    return render_template(
        'explore.html',
        content=content,
        buttonStates=buttonStates,
        formdata=formdata,
        heading='Explore',
        title='Explore',
        last_update=last_update,
    )
def more_like_this():
    """Return, as JSON, up to five documents similar to a given document.

    The form supplies `index` (default 'projects') and `doc_id`; similarity
    is evaluated on the "title" and "abstract" fields through a
    `more_like_this` query.
    """
    index = request.form.get('index', 'projects')
    doc_id = request.form.get('doc_id')

    # The reference document the results should resemble.
    reference_doc = {"_index": index, "_type": "doc", "_id": doc_id}
    similar = Q({
        "more_like_this": {
            "fields": ["title", "abstract"],
            "like": [reference_doc],
        }
    })

    response = query.run_query(similar, index=index)[:5].execute()
    return jsonify(response.hits.hits)
def project_count_by_topic(**kwargs):
    """Count the documents matching a topic query and collect their ids.

    Keyword arguments read: `topic` (the topic whose query is run),
    `topic_selection` and `element` (passed on as filters).

    Returns a `(count, doc_ids)` tuple; `doc_ids` holds the keys of a terms
    aggregation on `_id` limited to 5000 buckets.
    """
    filters = dict(element=kwargs.get("element"), topic=kwargs.get("topic_selection"))

    # Build and run the query for the requested topic.
    topic_args = query.get_query_arguments(kwargs.get("topic"))
    search = query.run_query(query.Query(**topic_args).query, index=index, filters=filters)
    count = search.count()

    # Bucket by document id so the ids come back with the response.
    search.aggs.bucket('doc_ids', A("terms", field="_id", size=5000))
    response = search.execute()

    doc_ids = [bucket['key'] for bucket in response.aggregations.doc_ids.buckets]
    return count, doc_ids
def publication_count(queries=None):
    """Return the number of publications, optionally narrowed by tags.

    Without `queries`, counts documents in the 'publications' index whose
    `doc_type` matches the phrase "publication". With `queries`, its "tag"
    and "element_tag" entries are passed as filters to a match-all query.
    """
    s = Search(using=client, index='publications')

    if not queries:
        everything = Q({"match_phrase": {"doc_type": {"query": "publication"}}})
        return s.query(everything).count()

    tag = queries.get("tag")
    element_tag = queries.get("element_tag")
    # Match every document; narrowing is left to the filters.
    filtered = query.run_query(
        Q({"match_all": {}}),
        index='publications',
        filters=dict(topic=tag, element=element_tag),
    )
    return filtered.count()
def project_count_by_state(queries=None):
    """Group documents by funding-agency state.

    When `queries` is given, its "tag" and "element_tag" entries are applied
    as filters; otherwise the whole module-level `index` is searched.

    Returns a dict mapping each state key to
    `{'doc_count': ..., 'doc_ids': [...]}`.
    """
    search = Search(using=client, index=index)
    if queries:
        filters = dict(topic=queries.get("tag"), element=queries.get("element_tag"))
        search = query.run_query(Q({"match_all": {}}), index=index, filters=filters)

    # nested funding agencies -> state terms (top 50 by count) -> document ids
    per_agency = search.aggs.bucket('agencies', A("nested", path="funding_agencies"))
    per_state = per_agency.bucket(
        'states',
        A("terms", field="funding_agencies.state.keyword", size=50, order={"_count": "desc"}),
    )
    per_state.bucket('doc_ids', A("terms", field="_id", size=5000))

    response = search.execute()

    return {
        bucket['key']: dict(
            doc_count=bucket['doc_count'],
            doc_ids=[doc['key'] for doc in bucket.doc_ids.buckets],
        )
        for bucket in response.aggregations.agencies.states.buckets
    }
def project_count(queries=None):
    """Count projects in total and per status.

    With `queries`, its "tag" and "element_tag" entries are applied as
    filters to a match-all query and each status is counted with a `match`
    filter. Without it, documents whose `doc_type` matches the phrase
    "project" are counted and each status is matched by phrase on
    `status.keyword`.

    Returns a dict with the keys 'total', 'active', 'completed',
    'programmed' and 'proposed'.
    """
    statuses = ('Active', 'Completed', 'Programmed', 'Proposed')
    s = Search(using=client, index=index)

    if queries:
        tag = queries.get("tag")
        element_tag = queries.get("element_tag")
        # Match every document; narrowing is left to the filters.
        s = query.run_query(
            Q({"match_all": {}}),
            index=index,
            filters=dict(element=element_tag, topic=tag),
        )

        def count_with_status(status):
            return s.filter("match", status=status).count()
    else:
        s = s.query(Q({"match_phrase": {"doc_type": {"query": "project"}}}))

        def count_with_status(status):
            by_status = Q({"match_phrase": {"status.keyword": {"query": status}}})
            return s.query(by_status).count()

    res = {'total': s.count()}
    for status in statuses:
        res[status.lower()] = count_with_status(status)
    return res
def tag_documents(index_name, topic_tags, element_tags):
    """Clear all tags in an index, then re-tag documents from tag queries.

    First every document returned by a match-all scroll over `index_name`
    has its `tags` and `element_tags` reset to empty lists. Then, for each
    tag, the query built from `query.get_query_arguments(tag)` is run and
    the tag is added to the matching documents: `topic_tags` go into the
    `tags` field and `element_tags` into `element_tags`.

    Args:
        index_name: Index to process; documents can only be loaded from
            'projects' and 'publications'.
        topic_tags: Iterable of tags stored in each document's `tags`.
        element_tags: Iterable of tags stored in each document's
            `element_tags`.

    Raises:
        ValueError: If a tag query returns hits but `index_name` has no
            document model (this used to surface as an UnboundLocalError).
    """
    # Document class used to load documents from each supported index.
    models_by_index = {
        'projects': models.Project,
        'publications': models.Publication,
    }

    def process_hits(hits):
        """Reset both tag fields on every raw search hit in `hits`."""
        for item in hits:
            doc_id = item['_id']
            hit_index = item["_index"]
            model = models_by_index.get(hit_index)
            if model is None:
                # Without this guard the previous iteration's document would
                # be updated again (or `doc` would be unbound on the first hit).
                print(f'{hit_index} - doc ({doc_id}): skipped, unsupported index')
                continue
            doc = model.get(using=client, index=hit_index, id=doc_id)
            doc.update(using=client, index=hit_index, request_timeout=20, tags=list(), element_tags=list())
            print(f'{hit_index} - doc ({doc_id}): tags removed')

    def remove_tags(target_index):
        """Scroll through every document of `target_index`, clearing its tags."""
        # Init scroll by search
        data = client.search(
            index=target_index,
            doc_type='doc',
            scroll='2m',
            size=1000,
            body={"query": {"match_all": {}}},
        )
        # Before scrolling, process the first batch of hits
        process_hits(data['hits']['hits'])
        sid = data['_scroll_id']
        scroll_size = len(data['hits']['hits'])
        # Keep scrolling until a batch comes back empty
        while scroll_size > 0:
            data = client.scroll(scroll_id=sid, scroll='2m')
            process_hits(data['hits']['hits'])
            sid = data['_scroll_id']
            scroll_size = len(data['hits']['hits'])

    def apply_tags(tags, field):
        """Add each tag to `field` on every document its query matches."""
        model = models_by_index.get(index_name)
        for tag in tags:
            kwargs = query.get_query_arguments(tag)
            q = query.Query(**kwargs)
            s = query.run_query(q.query, index=index_name)
            hits, _ = query.process_search_response(s, last=s.count())
            for doc_id in hits:
                if model is None:
                    raise ValueError(f'no document model for index {index_name!r}')
                doc = model.get(using=client, index=index_name, id=doc_id)
                existing = getattr(doc, field)
                current_tags = list(existing) if existing else []
                current_tags.append(tag)
                # De-duplicate while keeping a stable order, so the stored
                # list does not get reshuffled between runs.
                unique_tags = list(dict.fromkeys(current_tags))
                doc.update(using=client, index=index_name, **{field: unique_tags})
                print(f'{index_name} - doc ({doc_id}): updated with {tag}')

    # first remove all tags
    remove_tags(index_name)
    # topic tags
    apply_tags(topic_tags, 'tags')
    # element tags
    apply_tags(element_tags, 'element_tags')
def _format_tag(tag):
    """Format a tag for display by replacing underscores with spaces."""
    return tag.replace("_", " ")


def _describe_selection(search_type, search_query, index, topic, element):
    """Build the 'for ...' phrase describing what the results were searched for.

    `topic` and `element` count as active when they are not 'all'. Only the
    search types handled by `results` are expected here.
    """
    has_topic = topic != 'all'
    has_element = element != 'all'

    if search_type == 'click_count':
        # user clicked the project or publication count in the dashboard
        if has_topic and has_element:
            return f'for "{_format_tag(topic)}" and "{_format_tag(element)}"'
        if has_topic:
            return f'for "{_format_tag(topic)}"'
        if has_element:
            return f'for "{_format_tag(element)}"'
        return f"for all {index}"

    if search_type == 'click_bar':
        # user clicked a bar of the bar chart; the query is the bar's tag
        bar = _format_tag(search_query)
        if search_query == topic:
            if has_element:
                return f'for "{bar}" and "{_format_tag(element)}"'
            return f'for "{bar}"'
        if has_topic and has_element:
            return f'for "{bar}", "{_format_tag(topic)}", and "{_format_tag(element)}"'
        if has_topic:
            return f'for "{bar}", and "{_format_tag(topic)}"'
        if has_element:
            return f'for "{bar}", and "{_format_tag(element)}"'
        return f'for "{bar}"'

    # 'click_map' and 'search': the raw query text is shown unformatted
    if has_topic and has_element:
        if search_type == 'click_map':
            return f'for "{search_query}", "{_format_tag(topic)}", and "{_format_tag(element)}"'
        return f'for "{search_query}", "{_format_tag(topic)}" and "{_format_tag(element)}"'
    if has_topic:
        return f'for "{search_query}", and "{_format_tag(topic)}"'
    if has_element:
        return f'for "{search_query}", and "{_format_tag(element)}"'
    return f'for "{search_query}"'


def results():
    """Run the requested search and render 'results.html'.

    Parameters come from the query string on GET, or from the form on a
    POST whose `form` field is 'filters'. The `type` parameter selects the
    query:

    * 'click_count' - the topic query, else the element query, else match-all
    * 'click_bar'   - the query registered for the clicked tag (`query`)
    * 'click_map'   - nested match on `funding_agencies.state`
    * 'search'      - multi-match of `query` on "title" and "abstract"

    At most 1000 hits are fetched. Requests arriving from a page whose URL
    ends in 'update' are redirected back to this view. Requests that cannot
    be handled (unknown type, a search whose query is the string 'None', or
    a POST from another form) are redirected to the explore view; before,
    those only redirected when the referrer was the explore page and
    otherwise failed on unbound variables.
    """
    # The Referer header is optional, so `request.referrer` may be None.
    referrer_page = (request.referrer or '').split('/')[-1]
    if referrer_page == 'update':
        return redirect(url_for('results'))

    if request.method == 'GET':
        params = request.args
    elif request.method == 'POST' and request.form['form'] == 'filters':
        params = request.form
    else:
        return redirect(url_for('explore'))

    search_type = params.get('type', 'search')
    search_query = params.get('query')
    index = params.get('index', 'projects')
    filter_topic = params.get('topic', 'all')
    filter_element = params.get('element', 'all')
    filter_status = params.get('status', 'all')
    date_range = params.get('dateRange', '50')
    sort_by = params.get('sortBy', 'date')
    doc_type = index[:-1]

    # pick the query for the kind of request
    if search_type == 'click_count':
        if filter_topic != 'all':
            tag = filter_topic
        elif filter_element != 'all':
            tag = filter_element
        else:
            tag = None
        if tag is None:
            # note: sorting does not apply to match all
            q = Q({"match_all": {}})
        else:
            q = query.Query(**query.get_query_arguments(tag)).query
    elif search_type == 'click_bar':
        q = query.Query(**query.get_query_arguments(search_query)).query
    elif search_type == 'click_map':
        q = Q({
            "nested": {
                "path": "funding_agencies",
                "query": {
                    "bool": {
                        "must": [
                            {"match": {"funding_agencies.state": search_query}}
                        ]
                    }
                }
            }
        })
    elif search_type == 'search' and search_query != 'None':
        q = Q({
            "multi_match": {
                "query": search_query,
                "fields": ["title", "abstract"]
            }
        })
    else:
        return redirect(url_for('explore'))

    clicked = _describe_selection(search_type, search_query, index, filter_topic, filter_element)

    filters = dict(
        topic=filter_topic,
        element=filter_element,
        doc_type=doc_type,
        status=filter_status,
        date_range=date_range,
        sort_by=sort_by,
    )
    s = query.run_query(q, index=index, filters=filters)
    s = s[:1000]  # pagination
    r = s.execute()

    buttonStates = dict(
        type=search_type,
        topic=filter_topic,
        element=filter_element,
        status=filter_status,
        date_range=date_range,
        sort_by=sort_by,
        doc_type=doc_type,
    )
    formdata = dict(
        type=search_type,
        query=search_query,
        index=index,
        topic=filter_topic,
        element=filter_element,
        status=filter_status,
        date_range=date_range,
        sort_by=sort_by,
    )
    last_update = client.get(index='appdata', doc_type='doc', id=1)['_source']['last_update']
    return render_template(
        'results.html',
        title='Results',
        heading='Search Results',
        content=r,
        clicked=clicked,
        buttonStates=buttonStates,
        formdata=formdata,
        last_update=last_update,
    )
def funding_by_state(**kwargs):
    """Break down funding amounts per funding-agency state.

    Keyword arguments read: `topic` and `element`, both passed as filters to
    a match-all query on the module-level `index`.

    Returns a dict mapping each state key of at most two characters to the
    keyed buckets of a range aggregation over the `funding` field.
    """
    filters = dict(element=kwargs.get("element"), topic=kwargs.get("topic"))
    search = query.run_query(Q({"match_all": {}}), index=index, filters=filters)

    funding_ranges = [
        {"from": 0, "to": 100000},
        {"from": 100000, "to": 250000},
        {"from": 250000, "to": 500000},
        {"from": 500000, "to": 750000},
        {"from": 750000, "to": 1000000},
        {"from": 1000000},
    ]

    # nested agencies -> state terms -> reverse_nested -> funding ranges
    level = search.aggs.bucket('agencies', A("nested", path="funding_agencies"))
    level = level.bucket(
        'states',
        A("terms", field="funding_agencies.state.keyword", size=50, order={"_count": "desc"}),
    )
    level = level.bucket('reverse', A("reverse_nested"))
    level.bucket('fund_amt', A("range", field="funding", ranges=funding_ranges, keyed=True))

    response = search.execute()

    res = {}
    for bucket in response.aggregations.agencies.states.buckets:
        state = bucket.key
        # Keep only keys of at most two characters, first occurrence wins.
        if len(state) <= 2 and state not in res:
            res[state] = bucket.reverse.fund_amt.buckets.to_dict()
    return res