def execute_entities_query(args, query, doc_counts=False):
    """Run an entity search; optionally attach per-entity document counts."""
    result, hits, output = execute_basic(TYPE_ENTITY, query)
    convert_entity_aggregations(result, output, args)
    sub_queries = []
    for hit in hits.get('hits', []):
        entity = hit.get('_source')
        entity['id'] = hit.get('_id')
        entity['score'] = hit.get('_score')
        entity['api_url'] = url_for('entities_api.view', id=hit.get('_id'))
        output['results'].append(entity)
        # Size-0 count query for documents mentioning this entity,
        # restricted by source-level authorization. msearch takes a
        # header line followed by a body line for each search.
        count_q = authz_sources_filter({'term': {'entities.uuid': entity['id']}})
        sub_queries.append(json.dumps({}))
        sub_queries.append(json.dumps({'size': 0, 'query': count_q}))
    if doc_counts and sub_queries:
        responses = get_es().msearch(index=get_es_index(),
                                     doc_type=TYPE_DOCUMENT,
                                     body='\n'.join(sub_queries))
        # Responses come back in request order, one per result entity.
        for entity, resp in zip(output['results'], responses.get('responses')):
            entity['doc_count'] = resp.get('hits', {}).get('total')
    return output
def alert_query(alert):
    """Execute the alert's saved query and return matching documents."""
    q = authz_filter(text_query(alert.query_text))
    if alert.entity_id:
        q = filter_query(q, [('entities.id', alert.entity_id)], OR_FIELDS)
    if alert.notified_at:
        # Only surface documents created since the last notification.
        q = add_filter(q, {"range": {"created_at": {"gt": alert.notified_at}}})
    result, hits, output = execute_basic(TYPE_DOCUMENT,
                                         {'query': q, 'size': 150})
    collections = {}
    for hit in hits.get('hits', []):
        document = hit.get('_source')
        document['id'] = int(hit.get('_id'))
        document['collections'] = []
        for coll_id in document['collection_id']:
            # Only attach collections the current user may read; cache
            # lookups so each collection is fetched at most once.
            if coll_id not in authz.collections(authz.READ):
                continue
            if coll_id not in collections:
                collections[coll_id] = Collection.by_id(coll_id)
            if collections[coll_id] is None:
                continue
            document['collections'].append(collections[coll_id])
        document['records'] = {'results': [], 'total': 0}
        output['results'].append(document)
    return output
def execute_documents_query(args, query):
    """Execute the query and return a set of results, logging timings."""
    begin_time = time.time()
    result, hits, output = execute_basic(TYPE_DOCUMENT, query)
    elapsed_ms = (time.time() - begin_time) * 1000
    log.debug('Query ES time: %.5fms', elapsed_ms)
    convert_document_aggregations(result, output, args)
    elapsed_ms = (time.time() - begin_time) * 1000
    log.debug('Post-facet accumulated: %.5fms', elapsed_ms)
    sub_shoulds = records_query_shoulds(args)
    sub_queries = []
    for hit in hits.get('hits', []):
        document = hit.get('_source')
        document['id'] = int(hit.get('_id'))
        document['score'] = hit.get('_score')
        document['records'] = {'results': [], 'total': 0}
        # TODO: restore entity highlighting somehow.
        sq = records_query_internal(document['id'], sub_shoulds)
        if sq is not None:
            # msearch pair: empty header line, then the query body.
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        output['results'].append(document)
    run_sub_queries(output, sub_queries)
    elapsed_ms = (time.time() - begin_time) * 1000
    log.debug('Post-subquery accumulated: %.5fms', elapsed_ms)
    return output
def alert_query(alert):
    """Execute the alert's saved query and return matching documents."""
    q = authz_sources_filter(text_query(alert.query_text))
    if alert.entity_id:
        q = filter_query(q, [('entities.uuid', alert.entity_id)], OR_FIELDS)
    if alert.notified_at:
        # Restrict to documents created since the last notification.
        q = add_filter(q, {"range": {"created_at": {"gt": alert.notified_at}}})
    result, hits, output = execute_basic(TYPE_DOCUMENT,
                                         {'query': q, 'size': 150})
    sub_queries = []
    sources = {}
    for hit in hits.get('hits', []):
        document = hit.get('_source')
        document['id'] = int(hit.get('_id'))
        source_id = document['source_id']
        # Cache source lookups; skip documents whose source is gone.
        if source_id not in sources:
            sources[source_id] = Source.by_id(source_id)
        if sources[source_id] is None:
            continue
        document['source'] = sources[source_id]
        document['records'] = {'results': [], 'total': 0}
        sq = records_query(document['id'], alert.to_query(), size=1)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        output['results'].append(document)
    run_sub_queries(output, sub_queries)
    return output
def execute_tabular_query(query):
    """Execute a query against records and return a set of results."""
    result, hits, output = execute_basic(TYPE_RECORD, query)
    for hit in hits.get('hits', []):
        # The raw row data is stored under '_source.raw'; expose the
        # row identifier alongside it as '_id'.
        record = hit.get('_source').get('raw')
        record['_id'] = hit.get('_source', {}).get('row_id')
        output['results'].append(record)
    return output
def leads_query(collection_id, state):
    """Query leads for a collection and attach both linked entities."""
    q = {'term': {'entity_collection_id': collection_id}}
    q = authz_filter(q, state.authz, roles=True)
    aggs = {'scoped': {'global': {}, 'aggs': {}}}
    facets = list(state.facet_names)
    if 'collections' in facets:
        aggs = facet_collections(state, q, aggs)
        facets.remove('collections')
    aggs = aggregate(state, q, aggs, facets)
    q = {
        'sort': [{'judgement': 'asc'},
                 {'score': 'desc'},
                 {'match_id': 'asc'}],
        'query': filter_query(q, state.filters),
        'aggregations': aggs,
        'size': state.limit,
        'from': state.offset
    }
    result, hits, output = execute_basic(TYPE_LEAD, q)
    output['facets'] = parse_facet_result(state, result)
    # Collect every entity referenced by the leads on this page.
    entity_ids = set()
    for hit in hits.get('hits', []):
        lead = hit.get('_source')
        lead['id'] = hit.get('_id')
        entity_ids.add(lead.get('entity_id'))
        entity_ids.add(lead.get('match_id'))
        output['results'].append(lead)
    # Fetch all referenced entities in one query and stitch each one
    # into the leads that point at it.
    eq = {'query': {'terms': {'_id': list(entity_ids)}},
          'size': len(entity_ids) + 2}
    _, hits, _ = execute_basic(TYPE_ENTITY, eq)
    for hit in hits.get('hits', []):
        entity = hit.get('_source')
        entity['id'] = hit.get('_id')
        for lead in output['results']:
            if lead.get('match_id') == entity['id']:
                lead['match'] = entity
            if lead.get('entity_id') == entity['id']:
                lead['entity'] = entity
    return output
def execute_records_query(query):
    """Execute a query against records and return a set of results."""
    result, hits, output = execute_basic(TYPE_RECORD, query)
    for hit in hits.get('hits', []):
        record = hit.get('_source')
        record['score'] = hit.get('_score')
        # Highlighted snippet for the record text, if any was produced.
        record['text'] = hit.get('highlight', {}).get('text')
        output['results'].append(record)
    return output
def execute_records_query(query):
    """Execute a query against records and return a set of results."""
    result, hits, output = execute_basic(TYPE_RECORD, query)
    for entry in hits.get("hits", []):
        rec = entry.get("_source")
        rec["score"] = entry.get("_score")
        # Attach the highlighter output, when present.
        rec["text"] = entry.get("highlight", {}).get("text")
        output["results"].append(rec)
    return output
def links_query(origin, state):
    """Parse a user query string, compose and execute a link query."""
    if state.has_text:
        q = {
            "query_string": {
                "query": state.text,
                "fields": ['name^5', 'names^2', 'text'],
                "default_operator": "AND",
                "use_dis_max": True
            }
        }
    else:
        q = match_all()
    # Restrict to links originating from the given entity (or entities).
    ids = origin.get('ids') or [origin.get('id')]
    q = add_filter(q, {'terms': {'origin.id': ids}})
    q = authz_filter(q, state.authz, roles=True)
    aggs = {'scoped': {'global': {}, 'aggs': {}}}
    aggs = aggregate(state, q, aggs, state.facet_names)
    if state.sort == 'score':
        sort = ['_score']
    else:
        sort = [{'properties.start_date': 'desc'},
                {'properties.end_date': 'desc'}]
    q = {
        'sort': sort,
        'query': filter_query(q, state.filters),
        'aggregations': aggs,
        'size': state.limit,
        'from': state.offset,
        '_source': DEFAULT_FIELDS
    }
    result, hits, output = execute_basic(TYPE_LINK, q)
    output['facets'] = parse_facet_result(state, result)
    for hit in hits.get('hits', []):
        link = hit.get('_source')
        link['id'] = hit.get('_id')
        link['score'] = hit.get('_score')
        output['results'].append(link)
    return output
def alert_query(alert):
    """Execute the alert's saved query and return matching documents."""
    q = authz_filter(text_query(alert.query_text))
    if alert.entity_id:
        q = filter_query(q, [('entities.id', alert.entity_id)], OR_FIELDS)
    if alert.notified_at:
        # Only surface documents created since the last notification.
        q = add_filter(q, {"range": {"created_at": {"gt": alert.notified_at}}})
    result, hits, output = execute_basic(TYPE_DOCUMENT,
                                         {'query': q, 'size': 150})
    sub_queries = []
    collections = {}
    for hit in hits.get('hits', []):
        document = hit.get('_source')
        document['id'] = int(hit.get('_id'))
        document['collections'] = []
        for coll_id in document['collection_id']:
            # Only attach readable collections; cache DB lookups.
            if coll_id not in authz.collections(authz.READ):
                continue
            if coll_id not in collections:
                collections[coll_id] = Collection.by_id(coll_id)
            if collections[coll_id] is None:
                continue
            document['collections'].append(collections[coll_id])
        document['records'] = {'results': [], 'total': 0}
        sq = records_query(document['id'], alert.to_query(), size=1)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        output['results'].append(document)
    run_sub_queries(output, sub_queries)
    return output
def execute_records_query(document_id, state, query):
    """Execute a records query and merge row data from the database."""
    result, hits, output = execute_basic(TYPE_RECORD, query)
    rows = []
    for hit in hits.get('hits', []):
        record = hit.get('_source')
        record['score'] = hit.get('_score')
        if record.get('row_id'):
            rows.append((record.get('sheet'), record.get('row_id')))
        # Keep the last highlight snippet (matches prior behaviour).
        for snippet in hit.get('highlight', {}).get('text', []):
            record['text'] = snippet
        output['results'].append(record)
    # Pull the raw row data for each (sheet, row_id) pair and attach it
    # to the matching result.
    for row in DocumentRecord.find_rows(document_id, rows):
        for res in output['results']:
            if res['sheet'] == row.sheet and res['row_id'] == row.row_id:
                res['data'] = row.data
    return output
def execute_records_query(document_id, state, query):
    """Execute a records query and merge record data from the database."""
    result, hits, output = execute_basic(TYPE_RECORD, query)
    ids = []
    for hit in hits.get('hits', []):
        record = hit.get('_source')
        record['score'] = hit.get('_score')
        record['id'] = int(hit.get('_id'))
        ids.append(hit.get('_id'))
        # Keep the last highlight snippet (matches prior behaviour).
        for snippet in hit.get('highlight', {}).get('text', []):
            record['text'] = snippet
        output['results'].append(record)
    # Load the authoritative record rows and overlay data/text onto the
    # matching search results.
    for row in DocumentRecord.find_records(document_id, ids):
        for res in output['results']:
            if res['id'] == row.id:
                res['data'] = row.data
                res['text'] = row.text
    return output
def alert_query(alert):
    """Execute the alert's saved query and return matching documents."""
    q = text_query(alert.query_text)
    q = authz_sources_filter(q)
    if alert.entity_id:
        q = filter_query(q, [('entities.uuid', alert.entity_id)], OR_FIELDS)
    if alert.notified_at:
        # Restrict to documents created since the last notification.
        since_filter = {"range": {"created_at": {"gt": alert.notified_at}}}
        q = add_filter(q, since_filter)
    wrapped = {'query': q, 'size': 150}
    result, hits, output = execute_basic(TYPE_DOCUMENT, wrapped)
    sub_queries = []
    sources = {}
    for hit in hits.get('hits', []):
        document = hit.get('_source')
        document['id'] = int(hit.get('_id'))
        source_id = document['source_id']
        # Cache source lookups; documents with a missing source are
        # dropped from the results.
        if source_id not in sources:
            sources[source_id] = Source.by_id(source_id)
        if sources[source_id] is None:
            continue
        document['source'] = sources[source_id]
        document['records'] = {'results': [], 'total': 0}
        sq = records_query(document['id'], alert.to_query(), size=1)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        output['results'].append(document)
    run_sub_queries(output, sub_queries)
    return output
def execute_documents_query(args, query):
    """Execute the query and return a set of results."""
    result, hits, output = execute_basic(TYPE_DOCUMENT, query)
    convert_document_aggregations(result, output, args)
    sub_queries = []
    for hit in hits.get('hits', []):
        doc_id = hit.get('_id')
        document = hit.get('_source')
        document['id'] = int(doc_id)
        document['score'] = hit.get('_score')
        document['records'] = {'results': [], 'total': 0}
        sq = records_query(document['id'], args)
        if sq is not None:
            # msearch pair: empty header line, then the query body.
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        document['api_url'] = url_for('documents_api.view',
                                      document_id=doc_id)
        document['data_url'] = url_for('documents_api.file',
                                       document_id=doc_id)
        output['results'].append(document)
    run_sub_queries(output, sub_queries)
    return output
def execute_documents_alert_query(args, query):
    """Execute an alert query (fixed page of 50) and return results."""
    if not isinstance(args, MultiDict):
        args = MultiDict(args)
    query['size'] = 50
    result, hits, output = execute_basic(TYPE_DOCUMENT, query)
    convert_aggregations(result, output, args)
    sub_queries = []
    for hit in hits.get('hits', []):
        document = hit.get('_source')
        document['id'] = int(hit.get('_id'))
        # Attach the matching source facet entry, when present.
        for source in output['sources']['values']:
            if source['id'] == document['source_id']:
                document['source'] = source
        document['records'] = {'results': [], 'total': 0}
        sq = records_query(document['id'], args, size=1)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        output['results'].append(document)
    run_sub_queries(output, sub_queries)
    return output
def execute_documents_query(args, query):
    """Execute the query and return a set of results."""
    result, hits, output = execute_basic(TYPE_DOCUMENT, query)
    convert_aggregations(result, output, args)
    sub_queries = []
    for hit in hits.get('hits', []):
        document = hit.get('_source')
        document['id'] = int(hit.get('_id'))
        document['score'] = hit.get('_score')
        document['records'] = {'results': [], 'total': 0}
        sq = records_query(document['id'], args)
        if sq is not None:
            # msearch pair: empty header line, then the query body.
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        document['api_url'] = url_for('documents_api.view',
                                      document_id=hit.get('_id'))
        document['data_url'] = url_for('documents_api.file',
                                       document_id=hit.get('_id'))
        output['results'].append(document)
    run_sub_queries(output, sub_queries)
    return output
def new_doc_count():
    """Return the number of documents created in the last seven days."""
    # XXX this needs to be cached
    query = {'query': {'range': {'created_at': {'gte': 'now-7d/d'}}}}
    _, hits, _ = execute_basic('document', query)
    return jsonify({'week': hits.get('total')})
def entities_query(state, fields=None, facets=True, doc_counts=False):
    """Parse a user query string, compose and execute an entity query."""
    if state.has_text:
        q = {
            "query_string": {
                "query": state.text,
                "fields": ['name^5', 'names^2', 'text'],
                "default_operator": "AND",
                "use_dis_max": True
            }
        }
    else:
        q = match_all()
    if state.raw_query:
        q = {"bool": {"must": [q, state.raw_query]}}
    q = authz_filter(q, state.authz, roles=True)
    aggs = {'scoped': {'global': {}, 'aggs': {}}}
    if facets:
        facet_names = list(state.facet_names)
        # Collections are a "special" facet computed separately.
        if 'collections' in facet_names:
            aggs = facet_collections(state, q, aggs)
            facet_names.remove('collections')
        aggs = aggregate(state, q, aggs, facet_names)
    if state.sort == 'doc_count':
        sort = [{'doc_count': 'desc'}, '_score']
    elif state.sort == 'score':
        sort = ['_score', {'name_sort': 'asc'}]
    else:
        sort = [{'name_sort': 'asc'}]
    q = {
        'sort': sort,
        'query': filter_query(q, state.filters),
        'aggregations': aggs,
        'size': state.limit,
        'from': state.offset,
        '_source': fields or DEFAULT_FIELDS
    }
    result, hits, output = execute_basic(TYPE_ENTITY, q)
    output['facets'] = parse_facet_result(state, result)
    sub_queries = []
    for hit in hits.get('hits', []):
        entity = hit.get('_source')
        entity['id'] = hit.get('_id')
        entity['score'] = hit.get('_score')
        entity['api_url'] = url_for('entities_api.view', id=hit.get('_id'))
        output['results'].append(entity)
        # Size-0 count query for documents mentioning this entity,
        # restricted to collections the user may read.
        sq = {'term': {'entities.id': entity['id']}}
        sq = add_filter(sq, {'terms': {
            'collection_id': state.authz.collections_read}})
        sub_queries.append(json.dumps({}))
        sub_queries.append(json.dumps({'size': 0, 'query': sq}))
    if doc_counts and len(sub_queries):
        # Get the number of matching documents for each entity; the
        # msearch responses come back in the same order as the results.
        res = es.msearch(index=es_index, doc_type=TYPE_DOCUMENT,
                         body='\n'.join(sub_queries))
        for entity, resp in zip(output['results'], res.get('responses')):
            entity['doc_count'] = resp.get('hits', {}).get('total')
    return output
def documents_query(state, fields=None, facets=True, since=None):
    """Parse a user query string, compose and execute a query.

    :param state: search state (text, filters, facets, paging, authz).
    :param fields: optional list of ``_source`` fields to return.
    :param facets: whether to compute facet aggregations.
    :param since: if set, only return documents created after this time
        (used by alerting to find only updated results).
    """
    # This used to be several functions, but it's actually incredibly
    # procedural and so it's been linearised into one function. To really
    # clean this up, I think it should be based around an object model of
    # some sort.
    q = text_query(state.text)
    if state.raw_query:
        q = {"bool": {"must": [q, state.raw_query]}}
    q = authz_filter(q, state.authz, roles=False)
    # Used by alerting to find only updated results:
    if since is not None:
        q = add_filter(q, {"range": {"created_at": {"gt": since}}})

    # Sorting. BUG FIX: the 'oldest' branch used `if` instead of `elif`,
    # so a 'newest' sort was immediately clobbered by the final `else`
    # and silently degraded to relevance order.
    if state.sort == 'newest':
        sort = [{'dates': 'desc'}, {'created_at': 'desc'}, '_score']
    elif state.sort == 'oldest':
        sort = [{'dates': 'asc'}, {'created_at': 'asc'}, '_score']
    else:
        sort = ['_score']

    # TODO: find a better way to handle "slightly special" aggregations
    # like entities and collections.
    aggs = {'scoped': {'global': {}, 'aggs': {}}}
    if facets:
        facets = list(state.facet_names)
        if 'collections' in facets:
            aggs = facet_collections(state, q, aggs)
            facets.remove('collections')
        if 'entities' in facets:
            aggs = facet_entities(state, aggs)
            facets.remove('entities')
        aggs = aggregate(state, q, aggs, facets)

    # Allow plug-ins to post-process the query.
    signals.document_query_process.send(q=q, state=state)

    q = {
        'sort': sort,
        'size': state.limit,
        'from': state.offset,
        'query': filter_query(q, state.filters),
        'aggregations': aggs,
        '_source': fields or DEFAULT_FIELDS
    }
    result, hits, output = execute_basic(TYPE_DOCUMENT, q)
    # This will add labels and other contextual information.
    output['facets'] = parse_facet_result(state, result)

    # After the main query has run, a sub-query is run for each returned
    # result in order to find relevant records for result highlighting.
    sub_shoulds = records_query_shoulds(state)
    sub_queries = []
    queried = []  # documents, in the same order as their sub-queries
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['score'] = doc.get('_score')
        document['records'] = {'results': [], 'total': 0}
        collection_id = document.get('collection_id')
        try:
            # FIXME: there's got to be a nicer way of doing this....
            document['public'] = state.authz.collection_public(collection_id)
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # SystemExit / KeyboardInterrupt.
            document['public'] = None
        sq = records_query_internal(document['id'], sub_shoulds)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
            queried.append(document)
        output['results'].append(document)

    if not sub_queries:
        return output

    res = es.msearch(index=es_index, doc_type=TYPE_RECORD,
                     body='\n'.join(sub_queries))
    # BUG FIX: previously every msearch response was applied to every
    # document, so each document's `records.total` was overwritten by
    # whichever response happened to come last. Responses are returned
    # in request order, so pair each one with the document that
    # generated it; the document_id check is kept as a safety net.
    for document, resp in zip(queried, res.get('responses', [])):
        sqhits = resp.get('hits', {})
        document['records']['total'] = sqhits.get('total', 0)
        for hit in sqhits.get('hits', {}):
            record = hit.get('_source')
            if document['id'] != record['document_id']:
                continue
            hlt = hit.get('highlight', {})
            texts = hlt.get('text', []) or hlt.get('text_latin', [])
            texts = [clean_highlight(t) for t in texts]
            texts = [t for t in texts if len(t)]
            if len(texts):
                record['text'] = texts[0]
            document['records']['results'].append(record)
    return output