def filter_query(q, filters): """Apply a list of filters to the given query.""" for field, values in filters.items(): if field == 'collection_id' and len(values): q = add_filter(q, {'terms': {field: list(values)}}) elif field == 'dataset' and len(values): q = add_filter(q, {'terms': {field: list(values)}}) elif field == 'publication_date': # XXX handle multiple values date_from = list(values)[0] # YYYY-MM date_from = date_from + "-01" date_from = datetime.datetime.strptime(date_from, "%Y-%m-%d") date_to = date_from + datetime.timedelta(days=32) date_to = date_to.replace(day=1) q = add_filter( q, { 'range': { field: { 'gte': date_from.strftime("%Y-%m-%d"), 'lt': date_to.strftime("%Y-%m-%d") } } }) else: for value in values: if value is not None: q = add_filter(q, {'term': {field: value}}) return q
def filter_query(q, filters):
    """Apply a list of filters to the given query."""
    for field, values in filters.items():
        # Collection and dataset ids are applied as a single 'terms'
        # clause; everything else becomes one 'term' clause per value.
        is_multi = field in ('collection_id', 'dataset')
        if is_multi:
            if len(values):
                q = add_filter(q, {'terms': {field: list(values)}})
        else:
            for value in values:
                if value is not None:
                    q = add_filter(q, {'term': {field: value}})
    return q
def filter_query(q, filters, skip=None):
    """Apply a list of (field, value) filter tuples to the given query.

    Fields named in ``OR_FIELDS`` are collected and applied as a single
    'terms' clause; any field equal to ``skip`` is ignored entirely.
    """
    or_values = defaultdict(list)
    relevant = ((f, v) for f, v in filters if f != skip)
    for field, value in relevant:
        if field not in OR_FIELDS:
            q = add_filter(q, {'term': {field: value}})
        else:
            or_values[field].append(value)
    for field, collected in or_values.items():
        q = add_filter(q, {'terms': {field: collected}})
    return q
def alert_query(alert):
    """Execute the query and return a set of results."""
    q = text_query(alert.query_text)
    q = authz_sources_filter(q)
    if alert.entity_id:
        # Restrict results to documents mentioning the alert's entity.
        q = filter_query(q, [('entities.uuid', alert.entity_id)], OR_FIELDS)
    if alert.notified_at:
        # Only return documents created since the last notification.
        q = add_filter(q, {"range": {"created_at": {"gt": alert.notified_at}}})
    q = {'query': q, 'size': 150}
    result, hits, output = execute_basic(TYPE_DOCUMENT, q)
    sub_queries = []
    sources = {}  # cache of Source lookups, keyed by source id
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        source_id = document['source_id']
        if source_id not in sources:
            sources[source_id] = Source.by_id(source_id)
        if sources[source_id] is None:
            # The document's source no longer exists; skip it entirely.
            continue
        document['source'] = sources[source_id]
        document['records'] = {'results': [], 'total': 0}
        sq = records_query(document['id'], alert.to_query(), size=1)
        if sq is not None:
            # msearch body lines: an (empty) header, then the query itself.
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        output['results'].append(document)
    run_sub_queries(output, sub_queries)
    return output
def peek_query(args):
    """Run the query against collections the current user can NOT read and
    aggregate the creators of the matching, non-private collections."""
    if not isinstance(args, MultiDict):
        args = MultiDict(args)
    text = args.get('q', '').strip()
    q = text_query(text)
    filters = parse_filters(args)
    for entity in args.getlist('entity'):
        filters.append(('entities.id', entity))
    q = filter_query(q, filters, [])
    # Invert authorisation: keep only collections the user cannot read.
    q = add_filter(
        q, {'not': {
            'terms': {
                'collection_id': authz.collections(authz.READ)
            }
        }})
    q = {
        'query': q,
        'size': 0,  # no hits needed, only the aggregation below
        'aggregations': {
            'collections': {
                'terms': {
                    'field': 'collection_id',
                    'size': 30
                }
            }
        },
        '_source': False
    }
    result = get_es().search(index=get_es_index(), body=q,
                             doc_type=TYPE_DOCUMENT)
    aggs = result.get('aggregations', {}).get('collections', {})
    buckets = aggs.get('buckets', [])
    q = Collection.all_by_ids([b['key'] for b in buckets])
    # SQLAlchemy requires '!= None' rather than 'is not None' here.
    q = q.filter(Collection.creator_id != None)  # noqa
    objs = {o.id: o for o in q.all()}
    roles = {}
    for bucket in buckets:
        collection = objs.get(bucket.get('key'))
        if collection is None or collection.private:
            continue
        # Sum matching document counts per collection creator.
        if collection.creator_id in roles:
            roles[collection.creator_id]['total'] += bucket.get('doc_count')
        else:
            roles[collection.creator_id] = {
                'name': collection.creator.name,
                'email': collection.creator.email,
                'total': bucket.get('doc_count')
            }
    roles = sorted(roles.values(), key=lambda r: r['total'], reverse=True)
    roles = [format_total(r) for r in roles]
    total = result.get('hits', {}).get('total')
    return format_total({'roles': roles, 'active': total > 0, 'total': total})
def alert_query(alert):
    """Execute the query and return a set of results."""
    query = text_query(alert.query_text)
    query = authz_filter(query)
    if alert.entity_id:
        query = filter_query(
            query, [('entities.id', alert.entity_id)], OR_FIELDS)
    if alert.notified_at:
        query = add_filter(
            query, {"range": {"created_at": {"gt": alert.notified_at}}})
    body = {'query': query, 'size': 150}
    result, hits, output = execute_basic(TYPE_DOCUMENT, body)
    cache = {}  # collection lookups, memoised per call
    for hit in hits.get('hits', []):
        doc = hit.get('_source')
        doc['id'] = int(hit.get('_id'))
        doc['collections'] = []
        for coll_id in doc['collection_id']:
            if coll_id not in authz.collections(authz.READ):
                continue
            if coll_id not in cache:
                cache[coll_id] = Collection.by_id(coll_id)
            collection = cache[coll_id]
            if collection is not None:
                doc['collections'].append(collection)
        doc['records'] = {'results': [], 'total': 0}
        output['results'].append(doc)
    return output
def peek_query(args):
    """Query collections hidden from the current user and rank the
    creators of the matching, non-private ones."""
    if not isinstance(args, MultiDict):
        args = MultiDict(args)
    query = text_query(args.get('q', '').strip())
    filters = parse_filters(args)
    for entity in args.getlist('entity'):
        filters.append(('entities.id', entity))
    query = filter_query(query, filters, [])
    # Only look at collections the current user cannot already read.
    readable = authz.collections(authz.READ)
    query = add_filter(query, {
        'not': {'terms': {'collection_id': readable}}
    })
    body = {
        'query': query,
        'size': 0,
        'aggregations': {
            'collections': {
                'terms': {'field': 'collection_id', 'size': 30}
            }
        },
        '_source': False
    }
    result = get_es().search(index=get_es_index(), body=body,
                             doc_type=TYPE_DOCUMENT)
    buckets = result.get('aggregations', {}) \
                    .get('collections', {}) \
                    .get('buckets', [])
    rows = Collection.all_by_ids([bucket['key'] for bucket in buckets])
    rows = rows.filter(Collection.creator_id != None)  # noqa
    by_id = {row.id: row for row in rows.all()}
    creators = {}
    for bucket in buckets:
        collection = by_id.get(bucket.get('key'))
        if collection is None or collection.private:
            continue
        count = bucket.get('doc_count')
        entry = creators.get(collection.creator_id)
        if entry is not None:
            entry['total'] += count
        else:
            creators[collection.creator_id] = {
                'name': collection.creator.name,
                'email': collection.creator.email,
                'total': count
            }
    ranked = sorted(creators.values(), key=lambda r: r['total'], reverse=True)
    ranked = [format_total(r) for r in ranked]
    total = result.get('hits', {}).get('total')
    return format_total({
        'roles': ranked,
        'active': total > 0,
        'total': total
    })
def tabular_query(document_id, sheet, args):
    """Build a query for rows of one sheet in a tabular document.

    Matches the user's search text (if any), boosts explicitly requested
    row ids, and always restricts to the given document and sheet.
    """
    scored = False
    q = match_all()
    text = args.get('q', '').strip()
    if len(text):
        scored = True
        q = text_query_string(text)

    try:
        rows = [int(r) for r in args.getlist('row')]
    except (ValueError, TypeError):
        # Malformed row ids are ignored rather than failing the request.
        # (Was a blanket 'except Exception', which also hid real bugs.)
        rows = []

    if len(rows):
        # Requested rows float to the top via a large constant boost, but
        # all other rows still match.
        scored = True
        q = {
            "bool": {
                "must": [q],
                "should": [{
                    "constant_score": {
                        "filter": {'terms': {'row_id': rows}},
                        "boost": 1000
                    }
                }]
            }
        }

    q = add_filter(q, {'term': {'document_id': document_id}})
    q = add_filter(q, {'term': {'sheet': sheet}})

    # Stable ordering by row id; relevance first when the query is scored.
    sort = [{'row_id': 'asc'}]
    if scored:
        sort.insert(0, '_score')
    return {
        'from': 0,
        'size': 100,
        'query': q,
        'sort': sort,
        '_source': ['document_id', 'sheet', 'row_id', 'raw']
    }
def authz_filter(q, authz, roles=False):
    """Restrict ``q`` to collections readable under ``authz``.

    Admins bypass filtering entirely. With ``roles=True``, documents
    matching one of the user's roles are also allowed.
    """
    if authz.is_admin:
        return q
    collections_q = {'terms': {'collection_id': list(authz.collections_read)}}
    if not roles:
        return add_filter(q, collections_q)
    roles_q = {'terms': {'roles': list(authz.roles)}}
    return add_filter(q, {"or": [roles_q, collections_q]})
def authz_filter(q, authz, roles=False):
    """Limit ``q`` to what ``authz`` may read; admins see everything.

    With ``roles=True``, a document is visible if it is in a readable
    collection OR tagged with one of the user's roles.
    """
    if authz.is_admin:
        return q
    clause = {'terms': {'collection_id': list(authz.collections_read)}}
    if roles:
        role_clause = {'terms': {'roles': list(authz.roles)}}
        clause = {
            'bool': {
                'should': [role_clause, clause],
                'minimum_should_match': 1
            }
        }
    return add_filter(q, clause)
def tabular_query(document_id, sheet, args):
    """Compose a row query for one sheet of a tabular document."""
    q = match_all()
    scored = False
    text = args.get('q', '').strip()
    if len(text):
        q = text_query_string(text)
        scored = True
    try:
        rows = [int(r) for r in args.getlist('row')]
    except Exception:
        rows = []
    if len(rows):
        # Boost explicitly requested rows above everything else.
        boost = {
            "constant_score": {
                "filter": {'terms': {'row_id': rows}},
                "boost": 1000
            }
        }
        q = {"bool": {"must": q, "should": boost}}
        scored = True
    q = add_filter(q, {'term': {'document_id': document_id}})
    q = add_filter(q, {'term': {'sheet': sheet}})
    sort = ['_score', {'row_id': 'asc'}] if scored else [{'row_id': 'asc'}]
    return {
        'from': 0,
        'size': 100,
        'query': q,
        'sort': sort,
        '_source': ['document_id', 'sheet', 'row_id', 'raw']
    }
def documents_query(args, fields=None, facets=True, newer_than=None):
    """Parse a user query string, compose and execute a query.

    :param args: request arguments (coerced to a ``MultiDict``).
    :param fields: document fields to return (defaults to DEFAULT_FIELDS).
    :param facets: whether to compute facet aggregations.
    :param newer_than: only match documents created after this time;
        used by alerting to find only updated results.
    """
    if not isinstance(args, MultiDict):
        args = MultiDict(args)
    text = args.get('q', '').strip()
    q = text_query(text)
    q = authz_filter(q)
    if newer_than is not None:
        q = add_filter(q, {
            "range": {
                "created_at": {
                    "gt": newer_than
                }
            }
        })

    # Sorting -- should this be passed into search directly, instead of
    # these aliases?
    sort_mode = args.get('sort', '').strip().lower()
    if text or sort_mode == 'score':
        sort = ['_score']
    elif sort_mode == 'newest':
        sort = [{'dates': 'desc'}, {'created_at': 'desc'}, '_score']
    elif sort_mode == 'oldest':
        sort = [{'dates': 'asc'}, {'created_at': 'asc'}, '_score']
    else:
        sort = [{'updated_at': 'desc'}, {'created_at': 'desc'}, '_score']

    # Extract filters, given in the form: &filter:foo_field=bla_value
    filters = []
    for key in args.keys():
        # Test the key once, rather than re-testing it for every value
        # as the old code did inside the inner loop.
        if not key.startswith('filter:'):
            continue
        _, field = key.split(':', 1)
        for value in args.getlist(key):
            filters.append((field, value))
    for entity in args.getlist('entity'):
        filters.append(('entities.uuid', entity))

    aggs = {}
    if facets:
        aggs = aggregate(q, args, filters)
        aggs = facet_source(q, aggs, filters)
        q = entity_collections(q, aggs, args, filters)
    return {
        'sort': sort,
        'query': filter_query(q, filters),
        'aggregations': aggs,
        '_source': fields or DEFAULT_FIELDS
    }
def alert_query(alert):
    """Execute the query and return a set of results."""
    q = text_query(alert.query_text)
    q = authz_filter(q)
    if alert.entity_id:
        # Restrict to documents mentioning the alert's entity.
        q = filter_query(q, [('entities.id', alert.entity_id)], OR_FIELDS)
    if alert.notified_at:
        # Only return documents created since the last notification.
        q = add_filter(q, {
            "range": {
                "created_at": {
                    "gt": alert.notified_at
                }
            }
        })
    q = {
        'query': q,
        'size': 150
    }
    result, hits, output = execute_basic(TYPE_DOCUMENT, q)
    # Hoisted out of the loops: the set of readable collections does not
    # change while post-processing the hits, and the old code re-fetched
    # it once per collection id per document.
    readable = authz.collections(authz.READ)
    sub_queries = []
    collections = {}  # cache of Collection lookups, keyed by id
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['collections'] = []
        for coll in document['collection_id']:
            if coll not in readable:
                continue
            if coll not in collections:
                collections[coll] = Collection.by_id(coll)
            if collections[coll] is None:
                continue
            document['collections'].append(collections[coll])
        document['records'] = {'results': [], 'total': 0}
        sq = records_query(document['id'], alert.to_query(), size=1)
        if sq is not None:
            # msearch body lines: an (empty) header, then the query itself.
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        output['results'].append(document)
    run_sub_queries(output, sub_queries)
    return output
def alert_query(alert):
    """Execute the query and return a set of results."""
    query = text_query(alert.query_text)
    query = authz_sources_filter(query)
    if alert.entity_id:
        query = filter_query(
            query, [('entities.uuid', alert.entity_id)], OR_FIELDS)
    if alert.notified_at:
        query = add_filter(query, {
            "range": {"created_at": {"gt": alert.notified_at}}
        })
    body = {'query': query, 'size': 150}
    result, hits, output = execute_basic(TYPE_DOCUMENT, body)
    sub_queries = []
    source_cache = {}  # memoised Source lookups, keyed by source id
    for hit in hits.get('hits', []):
        document = hit.get('_source')
        document['id'] = int(hit.get('_id'))
        source_id = document['source_id']
        if source_id not in source_cache:
            source_cache[source_id] = Source.by_id(source_id)
        source = source_cache[source_id]
        if source is None:
            # The document's source has been deleted; skip it entirely.
            continue
        document['source'] = source
        document['records'] = {'results': [], 'total': 0}
        sq = records_query(document['id'], alert.to_query(), size=1)
        if sq is not None:
            # msearch body lines: empty header, then the query itself.
            sub_queries.extend((json.dumps({}), json.dumps(sq)))
        output['results'].append(document)
    run_sub_queries(output, sub_queries)
    return output
def documents_query(state, fields=None, facets=True, since=None):
    """Parse a user query string, compose and execute a query.

    :param state: search state object carrying text, filters, sorting,
        pagination and authz for the current request.
    :param fields: document fields to include (defaults to DEFAULT_FIELDS).
    :param facets: whether to compute facet aggregations.
    :param since: only match documents created after this time; used by
        alerting to find only updated results.
    """
    # This used to be several functions, but it's actually incredibly
    # procedural and so it's been linearised into one function. To really
    # clean this up, I think it should be based around an object model of
    # some sort.
    q = text_query(state.text)
    if state.raw_query:
        q = {"bool": {"must": [q, state.raw_query]}}
    q = authz_filter(q, state.authz, roles=False)
    if since is not None:
        q = add_filter(q, {"range": {"created_at": {"gt": since}}})

    # Sorting. NOTE: the 'newest' branch used to be a separate 'if', so
    # the trailing 'else' always clobbered it with ['_score'].
    if state.sort == 'newest':
        sort = [{'dates': 'desc'}, {'created_at': 'desc'}, '_score']
    elif state.sort == 'oldest':
        sort = [{'dates': 'asc'}, {'created_at': 'asc'}, '_score']
    else:
        sort = ['_score']

    # TODO: find a better way to handle "slightly special" aggregations like
    # entities and collections.
    aggs = {'scoped': {'global': {}, 'aggs': {}}}
    if facets:
        facets = list(state.facet_names)
        if 'collections' in facets:
            aggs = facet_collections(state, q, aggs)
            facets.remove('collections')
        if 'entities' in facets:
            aggs = facet_entities(state, aggs)
            facets.remove('entities')
        aggs = aggregate(state, q, aggs, facets)

    # Allow plug-ins to post-process the query.
    signals.document_query_process.send(q=q, state=state)

    q = {
        'sort': sort,
        'size': state.limit,
        'from': state.offset,
        'query': filter_query(q, state.filters),
        'aggregations': aggs,
        '_source': fields or DEFAULT_FIELDS
    }
    result, hits, output = execute_basic(TYPE_DOCUMENT, q)
    # This will add labels and other contextual information.
    output['facets'] = parse_facet_result(state, result)

    # After the main query has run, a sub-query will be run for each returned
    # result in order to find relevant records for result highlighting.
    sub_shoulds = records_query_shoulds(state)
    sub_queries = []
    queried_docs = []
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['score'] = doc.get('_score')
        document['records'] = {'results': [], 'total': 0}
        collection_id = document.get('collection_id')
        try:
            # FIXME: there's got to be a nicer way of doing this....
            document['public'] = state.authz.collection_public(collection_id)
        except Exception:
            # Best-effort only; was a bare 'except:' which also caught
            # KeyboardInterrupt/SystemExit.
            document['public'] = None
        sq = records_query_internal(document['id'], sub_shoulds)
        if sq is not None:
            # msearch body lines: an (empty) header, then the query.
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
            # Remember which document each response will belong to.
            queried_docs.append(document)
        output['results'].append(document)

    if not len(sub_queries):
        return output
    body = '\n'.join(sub_queries)
    res = es.msearch(index=es_index, doc_type=TYPE_RECORD, body=body)
    # Responses come back in the same order the queries were sent, so pair
    # each one with the document that generated it. (The old code matched
    # every response against every document, leaving each document with
    # the record total of the *last* response.)
    for doc, sq in zip(queried_docs, res.get('responses', [])):
        sqhits = sq.get('hits', {})
        doc['records']['total'] = sqhits.get('total', 0)
        for hit in sqhits.get('hits', {}):
            record = hit.get('_source')
            if doc['id'] != record['document_id']:
                # Defensive: skip stray hits for other documents.
                continue
            hlt = hit.get('highlight', {})
            texts = hlt.get('text', []) or hlt.get('text_latin', [])
            texts = [clean_highlight(t) for t in texts]
            texts = [t for t in texts if len(t)]
            if len(texts):
                record['text'] = texts[0]
            doc['records']['results'].append(record)
    return output