Example #1
def filter_query(q, filters):
    """Apply a list of filters to the given query."""
    for field, values in filters.items():
        if field == 'collection_id' and len(values):
            q = add_filter(q, {'terms': {field: list(values)}})
        elif field == 'dataset' and len(values):
            q = add_filter(q, {'terms': {field: list(values)}})
        elif field == 'publication_date':
            # XXX handle multiple values
            date_from = list(values)[0]

            # the value is a YYYY-MM month; append a day so it can be parsed
            date_from = date_from + "-01"
            date_from = datetime.datetime.strptime(date_from, "%Y-%m-%d")
            # first day of the following month, used as the exclusive bound
            date_to = date_from + datetime.timedelta(days=32)
            date_to = date_to.replace(day=1)

            q = add_filter(
                q, {
                    'range': {
                        field: {
                            'gte': date_from.strftime("%Y-%m-%d"),
                            'lt': date_to.strftime("%Y-%m-%d")
                        }
                    }
                })
        else:
            for value in values:
                if value is not None:
                    q = add_filter(q, {'term': {field: value}})
    return q
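Every example on this page delegates to an add_filter helper that is not part of the listings. A minimal sketch of what such a helper could look like, assuming an Elasticsearch-style bool query; this is an illustration only, not the project's actual implementation:

def add_filter(q, flt):
    """Attach one more filter clause to an Elasticsearch query body."""
    # assumption: wrap the existing query in a bool query the first time,
    # then keep appending clauses to its 'filter' list
    if 'bool' not in q:
        q = {'bool': {'must': [q]}}
    q['bool'].setdefault('filter', []).append(flt)
    return q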
Example #2
def filter_query(q, filters):
    """Apply a list of filters to the given query."""
    for field, values in filters.items():
        if field == 'collection_id' and len(values):
            q = add_filter(q, {'terms': {field: list(values)}})
        elif field == 'dataset' and len(values):
            q = add_filter(q, {'terms': {field: list(values)}})
        else:
            for value in values:
                if value is not None:
                    q = add_filter(q, {'term': {field: value}})
    return q
Example #3
def filter_query(q, filters, skip=None):
    """Apply a list of filters to the given query."""
    or_filters = defaultdict(list)
    for field, value in filters:
        if field == skip:
            continue
        if field in OR_FIELDS:
            or_filters[field].append(value)
        else:
            q = add_filter(q, {'term': {field: value}})
    for field, value in or_filters.items():
        q = add_filter(q, {'terms': {field: value}})
    return q
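A hypothetical call of the variant above, assuming OR_FIELDS contains 'collection_id' and that an add_filter helper like the sketch after Example #1 is in scope:

filters = [
    ('collection_id', 3),
    ('collection_id', 7),   # repeated OR_FIELDS values collapse into one 'terms' filter
    ('countries', 'de'),    # any other field becomes an individual 'term' filter
    ('source_id', 9),
]
q = filter_query({'match_all': {}}, filters, skip='source_id')  # 'source_id' is dropped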
Example #4
def alert_query(alert):
    """Execute the query and return a set of results."""
    q = text_query(alert.query_text)
    q = authz_sources_filter(q)
    if alert.entity_id:
        q = filter_query(q, [('entities.uuid', alert.entity_id)], OR_FIELDS)
    if alert.notified_at:
        q = add_filter(q, {"range": {"created_at": {"gt": alert.notified_at}}})
    q = {'query': q, 'size': 150}

    result, hits, output = execute_basic(TYPE_DOCUMENT, q)
    sub_queries = []
    sources = {}
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        source_id = document['source_id']
        if source_id not in sources:
            sources[source_id] = Source.by_id(source_id)
        if sources[source_id] is None:
            continue
        document['source'] = sources[source_id]
        document['records'] = {'results': [], 'total': 0}

        sq = records_query(document['id'], alert.to_query(), size=1)
        if sq is not None:
            # msearch expects a header line (empty here) before each query body
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        output['results'].append(document)

    run_sub_queries(output, sub_queries)
    return output
Example #5
def peek_query(args):
    if not isinstance(args, MultiDict):
        args = MultiDict(args)
    text = args.get('q', '').strip()
    q = text_query(text)

    filters = parse_filters(args)
    for entity in args.getlist('entity'):
        filters.append(('entities.id', entity))

    q = filter_query(q, filters, [])
    q = add_filter(
        q,
        {'not': {
            'terms': {
                'collection_id': authz.collections(authz.READ)
            }
        }})
    q = {
        'query': q,
        'size': 0,
        'aggregations': {
            'collections': {
                'terms': {
                    'field': 'collection_id',
                    'size': 30
                }
            }
        },
        '_source': False
    }
    # import json
    # print json.dumps(q, indent=2)
    result = get_es().search(index=get_es_index(),
                             body=q,
                             doc_type=TYPE_DOCUMENT)

    aggs = result.get('aggregations', {}).get('collections', {})
    buckets = aggs.get('buckets', [])
    q = Collection.all_by_ids([b['key'] for b in buckets])
    q = q.filter(Collection.creator_id != None)  # noqa
    objs = {o.id: o for o in q.all()}
    roles = {}
    for bucket in buckets:
        collection = objs.get(bucket.get('key'))
        if collection is None or collection.private:
            continue
        if collection.creator_id in roles:
            roles[collection.creator_id]['total'] += bucket.get('doc_count')
        else:
            roles[collection.creator_id] = {
                'name': collection.creator.name,
                'email': collection.creator.email,
                'total': bucket.get('doc_count')
            }

    roles = sorted(roles.values(), key=lambda r: r['total'], reverse=True)
    roles = [format_total(r) for r in roles]
    total = result.get('hits', {}).get('total')
    return format_total({'roles': roles, 'active': total > 0, 'total': total})
Example #6
def alert_query(alert):
    """Execute the query and return a set of results."""
    q = text_query(alert.query_text)
    q = authz_filter(q)
    if alert.entity_id:
        q = filter_query(q, [('entities.id', alert.entity_id)], OR_FIELDS)
    if alert.notified_at:
        q = add_filter(q, {"range": {"created_at": {"gt": alert.notified_at}}})
    q = {'query': q, 'size': 150}

    result, hits, output = execute_basic(TYPE_DOCUMENT, q)
    collections = {}
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['collections'] = []
        for coll in document['collection_id']:
            if coll not in authz.collections(authz.READ):
                continue
            if coll not in collections:
                collections[coll] = Collection.by_id(coll)
            if collections[coll] is None:
                continue
            document['collections'].append(collections[coll])
        document['records'] = {'results': [], 'total': 0}
        output['results'].append(document)
    return output
Example #7
def peek_query(args):
    if not isinstance(args, MultiDict):
        args = MultiDict(args)
    text = args.get('q', '').strip()
    q = text_query(text)

    filters = parse_filters(args)
    for entity in args.getlist('entity'):
        filters.append(('entities.id', entity))

    q = filter_query(q, filters, [])
    q = add_filter(q, {
        'not': {
            'terms': {
                'collection_id': authz.collections(authz.READ)
            }
        }
    })
    q = {
        'query': q,
        'size': 0,
        'aggregations': {
            'collections': {
                'terms': {'field': 'collection_id', 'size': 30}
            }
        },
        '_source': False
    }
    # import json
    # print json.dumps(q, indent=2)
    result = get_es().search(index=get_es_index(), body=q,
                             doc_type=TYPE_DOCUMENT)

    aggs = result.get('aggregations', {}).get('collections', {})
    buckets = aggs.get('buckets', [])
    q = Collection.all_by_ids([b['key'] for b in buckets])
    q = q.filter(Collection.creator_id != None)  # noqa
    objs = {o.id: o for o in q.all()}
    roles = {}
    for bucket in buckets:
        collection = objs.get(bucket.get('key'))
        if collection is None or collection.private:
            continue
        if collection.creator_id in roles:
            roles[collection.creator_id]['total'] += bucket.get('doc_count')
        else:
            roles[collection.creator_id] = {
                'name': collection.creator.name,
                'email': collection.creator.email,
                'total': bucket.get('doc_count')
            }

    roles = sorted(roles.values(), key=lambda r: r['total'], reverse=True)
    roles = [format_total(r) for r in roles]
    total = result.get('hits', {}).get('total')
    return format_total({
        'roles': roles,
        'active': total > 0,
        'total': total
    })
Example #8
def tabular_query(document_id, sheet, args):
    scored = False
    q = match_all()
    text = args.get('q', '').strip()
    if len(text):
        scored = True
        q = text_query_string(text)

    try:
        rows = [int(r) for r in args.getlist('row')]
    except Exception:
        rows = []

    if len(rows):
        scored = True
        q = {
            "bool": {
                "must": [q],
                "should": [{
                    "constant_score": {
                        "filter": {
                            'terms': {
                                'row_id': rows
                            }
                        },
                        "boost": 1000
                    }
                }]
            }
        }

    q = add_filter(q, {'term': {'document_id': document_id}})
    q = add_filter(q, {'term': {'sheet': sheet}})

    # pprint(q)

    sort = [{'row_id': 'asc'}]
    if scored:
        sort.insert(0, '_score')
    return {
        'from': 0,
        'size': 100,
        'query': q,
        'sort': sort,
        '_source': ['document_id', 'sheet', 'row_id', 'raw']
    }
Example #9
def authz_filter(q, authz, roles=False):
    if authz.is_admin:
        return q

    fq = {'terms': {'collection_id': list(authz.collections_read)}}
    if roles:
        fq = {"or": [{'terms': {'roles': list(authz.roles)}}, fq]}

    return add_filter(q, fq)
Example #10
def authz_filter(q, authz, roles=False):
    if authz.is_admin:
        return q

    fq = {'terms': {'collection_id': list(authz.collections_read)}}

    if roles:
        iq = {'terms': {'roles': list(authz.roles)}}
        fq = {'bool': {'should': [iq, fq], 'minimum_should_match': 1}}
    return add_filter(q, fq)
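Examples #9 and #10 express the same access check: #9 uses the legacy Elasticsearch 1.x 'or' filter, while #10 rewrites it as a bool/should clause with minimum_should_match, the form expected by later Elasticsearch versions. A hypothetical call with a stand-in authorization object, assuming an add_filter helper like the sketch after Example #1:

class StubAuthz:                      # stand-in for the project's Authz object
    is_admin = False
    collections_read = [1, 2]
    roles = [5]

q = authz_filter({'match_all': {}}, StubAuthz(), roles=True)
# the query is now limited to readable collections OR the caller's roles,
# since 'minimum_should_match': 1 makes the should clauses behave as an OR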
Example #11
def tabular_query(document_id, sheet, args):
    scored = False
    q = match_all()
    text = args.get('q', '').strip()
    if len(text):
        scored = True
        q = text_query_string(text)

    try:
        rows = [int(r) for r in args.getlist('row')]
    except Exception:
        rows = []

    if len(rows):
        scored = True
        q = {
            "bool": {
                "must": q,
                "should": {
                    "constant_score": {
                        "filter": {'terms': {'row_id': rows}},
                        "boost": 1000
                    }
                }
            }
        }

    q = add_filter(q, {'term': {'document_id': document_id}})
    q = add_filter(q, {'term': {'sheet': sheet}})

    # pprint(q)

    sort = [{'row_id': 'asc'}]
    if scored:
        sort.insert(0, '_score')
    return {
        'from': 0,
        'size': 100,
        'query': q,
        'sort': sort,
        '_source': ['document_id', 'sheet', 'row_id', 'raw']
    }
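A hypothetical call of the function above, assuming a werkzeug-style MultiDict for the request arguments and that match_all, text_query_string and add_filter are in scope:

from werkzeug.datastructures import MultiDict

args = MultiDict([('q', 'acme'), ('row', '4'), ('row', '9')])
body = tabular_query(document_id=42, sheet=0, args=args)
# both a text query and row filters are present, so the matching rows are
# boosted and body['sort'] starts with '_score' before the 'row_id' ordering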
Example #12
def documents_query(args, fields=None, facets=True, newer_than=None):
    """Parse a user query string, compose and execute a query."""
    if not isinstance(args, MultiDict):
        args = MultiDict(args)
    text = args.get('q', '').strip()
    q = text_query(text)
    q = authz_filter(q)

    if newer_than is not None:
        q = add_filter(q, {
            "range": {
                "created_at": {
                    "gt": newer_than
                }
            }
        })

    # Sorting -- should this be passed into search directly, instead of
    # these aliases?
    sort_mode = args.get('sort', '').strip().lower()
    if text or sort_mode == 'score':
        sort = ['_score']
    elif sort_mode == 'newest':
        sort = [{'dates': 'desc'}, {'created_at': 'desc'}, '_score']
    elif sort_mode == 'oldest':
        sort = [{'dates': 'asc'}, {'created_at': 'asc'}, '_score']
    else:
        sort = [{'updated_at': 'desc'}, {'created_at': 'desc'}, '_score']

    # Extract filters, given in the form: &filter:foo_field=bla_value
    filters = []
    for key in args.keys():
        for value in args.getlist(key):
            if not key.startswith('filter:'):
                continue
            _, field = key.split(':', 1)
            filters.append((field, value))

    for entity in args.getlist('entity'):
        filters.append(('entities.uuid', entity))

    aggs = {}
    if facets:
        aggs = aggregate(q, args, filters)
        aggs = facet_source(q, aggs, filters)
        q = entity_collections(q, aggs, args, filters)

    return {
        'sort': sort,
        'query': filter_query(q, filters),
        'aggregations': aggs,
        '_source': fields or DEFAULT_FIELDS
    }
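A hypothetical request for the function above, again assuming a werkzeug-style MultiDict and that the module's other helpers (text_query, authz_filter, filter_query, the aggregation functions and DEFAULT_FIELDS) are in scope; keys of the form filter:<field> become filters and 'entity' values map to 'entities.uuid':

from werkzeug.datastructures import MultiDict

args = MultiDict([
    ('q', 'offshore'),
    ('filter:countries', 'pa'),
    ('entity', 'a1b2c3'),            # hypothetical entity id
])
body = documents_query(args, facets=False)
# a free-text 'q' is present, so the result body is sorted by '_score'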
Example #13
def alert_query(alert):
    """Execute the query and return a set of results."""
    q = text_query(alert.query_text)
    q = authz_filter(q)
    if alert.entity_id:
        q = filter_query(q, [('entities.id', alert.entity_id)], OR_FIELDS)
    if alert.notified_at:
        q = add_filter(q, {
            "range": {
                "created_at": {
                    "gt": alert.notified_at
                }
            }
        })
    q = {
        'query': q,
        'size': 150
    }

    result, hits, output = execute_basic(TYPE_DOCUMENT, q)
    sub_queries = []
    collections = {}
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['collections'] = []
        for coll in document['collection_id']:
            if coll not in authz.collections(authz.READ):
                continue
            if coll not in collections:
                collections[coll] = Collection.by_id(coll)
            if collections[coll] is None:
                continue
            document['collections'].append(collections[coll])
        document['records'] = {'results': [], 'total': 0}

        sq = records_query(document['id'], alert.to_query(), size=1)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        output['results'].append(document)

    run_sub_queries(output, sub_queries)
    return output
Example #14
def alert_query(alert):
    """Execute the query and return a set of results."""
    q = text_query(alert.query_text)
    q = authz_sources_filter(q)
    if alert.entity_id:
        q = filter_query(q, [('entities.uuid', alert.entity_id)], OR_FIELDS)
    if alert.notified_at:
        q = add_filter(q, {
            "range": {
                "created_at": {
                    "gt": alert.notified_at
                }
            }
        })
    q = {
        'query': q,
        'size': 150
    }

    result, hits, output = execute_basic(TYPE_DOCUMENT, q)
    sub_queries = []
    sources = {}
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        source_id = document['source_id']
        if source_id not in sources:
            sources[source_id] = Source.by_id(source_id)
        if sources[source_id] is None:
            continue
        document['source'] = sources[source_id]
        document['records'] = {'results': [], 'total': 0}

        sq = records_query(document['id'], alert.to_query(), size=1)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        output['results'].append(document)

    run_sub_queries(output, sub_queries)
    return output
Example #15
def documents_query(state, fields=None, facets=True, since=None):
    """Parse a user query string, compose and execute a query."""
    # This used to be several functions, but it's actually incredibly
    # procedural and so it's been linearised into one function. To really
    # clean this up, I think it should be based around an object model of
    # some sort.
    q = text_query(state.text)

    if state.raw_query:
        q = {"bool": {"must": [q, state.raw_query]}}

    q = authz_filter(q, state.authz, roles=False)

    # Used by alerting to find only updated results:
    if since is not None:
        q = add_filter(q, {"range": {"created_at": {"gt": since}}})

    # Sorting
    if state.sort == 'newest':
        sort = [{'dates': 'desc'}, {'created_at': 'desc'}, '_score']
    elif state.sort == 'oldest':
        sort = [{'dates': 'asc'}, {'created_at': 'asc'}, '_score']
    else:
        sort = ['_score']

    # TODO: find a better way to handle "slightly special" aggregations like
    # entities and collections.
    aggs = {'scoped': {'global': {}, 'aggs': {}}}
    if facets:
        facets = list(state.facet_names)
        if 'collections' in facets:
            aggs = facet_collections(state, q, aggs)
            facets.remove('collections')
        if 'entities' in facets:
            aggs = facet_entities(state, aggs)
            facets.remove('entities')
        aggs = aggregate(state, q, aggs, facets)

    # allow plug-ins to post-process the query.
    signals.document_query_process.send(q=q, state=state)

    q = {
        'sort': sort,
        'size': state.limit,
        'from': state.offset,
        'query': filter_query(q, state.filters),
        'aggregations': aggs,
        '_source': fields or DEFAULT_FIELDS
    }
    result, hits, output = execute_basic(TYPE_DOCUMENT, q)

    # This will add labels and other contextual information.
    output['facets'] = parse_facet_result(state, result)

    # After the main query has run, a sub-query will be run for each returned
    # result in order to find relevant records for result highlighting.
    sub_shoulds = records_query_shoulds(state)
    sub_queries = []
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['score'] = doc.get('_score')
        document['records'] = {'results': [], 'total': 0}
        collection_id = document.get('collection_id')
        try:
            # FIXME: there's got to be a nicer way of doing this....
            document['public'] = state.authz.collection_public(collection_id)
        except:
            document['public'] = None

        sq = records_query_internal(document['id'], sub_shoulds)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))

        output['results'].append(document)

    if not len(sub_queries):
        return output

    body = '\n'.join(sub_queries)
    res = es.msearch(index=es_index, doc_type=TYPE_RECORD, body=body)
    for doc in output['results']:
        for sq in res.get('responses', []):
            sqhits = sq.get('hits', {})
            doc['records']['total'] = sqhits.get('total', 0)
            for hit in sqhits.get('hits', []):
                record = hit.get('_source')
                if doc['id'] != record['document_id']:
                    continue
                hlt = hit.get('highlight', {})
                texts = hlt.get('text', []) or hlt.get('text_latin', [])
                texts = [clean_highlight(t) for t in texts]
                texts = [t for t in texts if len(t)]
                if len(texts):
                    record['text'] = texts[0]
                    doc['records']['results'].append(record)

    return output