Exemplo n.º 1
0
def get_funders_for_datasets(datasets):
    es = get_es()

    for dataset in datasets:
        query = {
            "query": {
                "bool": {
                    "filter": [{
                        "term": {
                            "filename": dataset['identifier'] + '.json'
                        }
                    }]
                }
            },
            "aggs": {
                "funders": {
                    "terms": {
                        "field": "fundingOrganization.id_and_name",
                        "size": 10
                    }
                }
            }
        }
        results = es.search(body=query, index=settings.ES_INDEX, size=0)
        dataset['funders'] = [
            json.loads(bucket['key'])
            for bucket in results['aggregations']['funders']['buckets']
        ]
Exemplo n.º 2
0
def get_results(json_query, size=10, from_=0):
    es = get_es()
    extra_context = json_query.pop('extra_context', None)
    results = es.search(body=json_query, size=size, from_=from_, index=settings.ES_INDEX)
    if extra_context is not None:
        json_query['extra_context'] = extra_context
    return results
Exemplo n.º 3
0
def get_results(json_query, size=10, from_=0):
    es = get_es()
    extra_context = json_query.pop('extra_context', None)
    results = es.search(body=json_query, size=size, from_=from_, index=settings.ES_INDEX)
    if extra_context is not None:
        json_query['extra_context'] = extra_context
    return results
Exemplo n.º 4
0
def grants_json_generator(query):
    yield '''{
    "license": "See dataset/license within each grant. This file also contains OS data © Crown copyright and database right 2016, Royal Mail data © Royal Mail copyright and Database right 2016, National Statistics data © Crown copyright and database right 2016, see http://grantnav.org/datasets/ for more information.",
    "grants": [\n'''
    es = get_es()
    for num, result in enumerate(scan(es, query, index=settings.ES_INDEX)):
        result["_source"]["dataset"] = provenance.by_identifier.get(provenance.identifier_from_filename(result['_source']['filename']), {})
        if num == 0:
            yield json.dumps(result["_source"]) + "\n"
        else:
            yield ", " + json.dumps(result["_source"]) + "\n"
    yield ']}'
Exemplo n.º 5
0
def grants_csv_generator(query):
    yield csv_layout.grant_csv_titles
    es = get_es()
    for result in scan(es, query, index=settings.ES_INDEX):
        result_with_provenance = {
            "result": result["_source"],
            "dataset": provenance.by_identifier.get(provenance.identifier_from_filename(result['_source']['filename']), {})
        }
        line = []
        for path in csv_layout.grant_csv_paths:
            line.append(get_data_from_path(path, result_with_provenance))
        yield line
Exemplo n.º 6
0
def get_funders_for_datasets(datasets):
    es = get_es()

    for dataset in datasets:
        query = {"query": {"bool": {
            "filter": [{"term": {"filename": dataset['identifier'] + '.json'}}]}},
            "aggs": {
                "funders": {"terms": {"field": "fundingOrganization.id_and_name", "size": 10}}
            }
        }
        results = es.search(body=query, index=settings.ES_INDEX, size=0)
        dataset['funders'] = [json.loads(bucket['key']) for bucket in results['aggregations']['funders']['buckets']]
Exemplo n.º 7
0
def grants_json_generator(query):
    yield '''{
    "license": "See dataset/license within each grant. This file also contains OS data © Crown copyright and database right 2016, Royal Mail data © Royal Mail copyright and Database right 2016, National Statistics data © Crown copyright and database right 2016, see http://grantnav.org/datasets/ for more information.",
    "grants": [\n'''
    es = get_es()
    for num, result in enumerate(scan(es, query, index=settings.ES_INDEX)):
        result["_source"]["dataset"] = provenance.by_identifier.get(provenance.identifier_from_filename(result['_source']['filename']), {})
        if num == 0:
            yield json.dumps(result["_source"]) + "\n"
        else:
            yield ", " + json.dumps(result["_source"]) + "\n"
    yield ']}'
Exemplo n.º 8
0
def grants_csv_generator(query):
    yield csv_layout.grant_csv_titles
    es = get_es()
    for result in scan(es, query, index=settings.ES_INDEX):
        result_with_provenance = {
            "result": result["_source"],
            "dataset": provenance.by_identifier.get(provenance.identifier_from_filename(result['_source']['filename']), {})
        }
        line = []
        for path in csv_layout.grant_csv_paths:
            line.append(get_data_from_path(path, result_with_provenance))
        yield line
Exemplo n.º 9
0
def stats(request):
    text_query = request.GET.get('text_query')
    if not text_query:
        text_query = '*'
    context = {'text_query': text_query or ''}

    es = get_es()
    mapping = es.indices.get_mapping(index=settings.ES_INDEX)
    all_fields = list(flatten_mapping(mapping[settings.ES_INDEX]['mappings']['grant']['properties']))

    query = {"query": {"bool": {"must":
                {"query_string": {"query": text_query}}, "filter": {}}},
             "aggs": {}}

    schema = jsonref.load_uri(settings.GRANT_SCHEMA)
    schema_fields = set(flatten_schema(schema))

    for field in all_fields:
        query["aggs"][field + ":terms"] = {"terms": {"field": field, "size": 5}}
        query["aggs"][field + ":missing"] = {"missing": {"field": field}}
        query["aggs"][field + ":cardinality"] = {"cardinality": {"field": field}}

    if context['text_query'] == '*':
        context['text_query'] = ''

    field_info = collections.defaultdict(dict)
    results = es.search(body=query, index=settings.ES_INDEX, size=0)
    for field, aggregation in results['aggregations'].items():
        field_name, agg_type = field.split(':')
        field_info[field_name]["in_schema"] = field_name in schema_fields
        if agg_type == 'terms':
            field_info[field_name]["terms"] = aggregation["buckets"]
        if agg_type == 'missing':
            field_info[field_name]["found"] = results['hits']['total'] - aggregation["doc_count"]
        if agg_type == 'cardinality':
            field_info[field_name]["distinct"] = aggregation["value"]

    context['field_info'] = sorted(field_info.items(), key=lambda val: -val[1]["found"])
    context['results'] = results
    return render(request, "stats.html", context=context)
Exemplo n.º 10
0
def stats(request):
    text_query = request.GET.get('text_query')
    if not text_query:
        text_query = '*'
    context = {'text_query': text_query or ''}

    es = get_es()
    mapping = es.indices.get_mapping(index=settings.ES_INDEX)
    all_fields = list(flatten_mapping(mapping[settings.ES_INDEX]['mappings']['grant']['properties']))

    query = {"query": {"bool": {"must":
                {"query_string": {"query": text_query}}, "filter": {}}},
             "aggs": {}}

    schema = jsonref.load_uri(settings.GRANT_SCHEMA)
    schema_fields = set(flatten_schema(schema))

    for field in all_fields:
        query["aggs"][field + ":terms"] = {"terms": {"field": field, "size": 5}}
        query["aggs"][field + ":missing"] = {"missing": {"field": field}}
        query["aggs"][field + ":cardinality"] = {"cardinality": {"field": field}}

    if context['text_query'] == '*':
        context['text_query'] = ''

    field_info = collections.defaultdict(dict)
    results = es.search(body=query, index=settings.ES_INDEX, size=0)
    for field, aggregation in results['aggregations'].items():
        field_name, agg_type = field.split(':')
        field_info[field_name]["in_schema"] = field_name in schema_fields
        if agg_type == 'terms':
            field_info[field_name]["terms"] = aggregation["buckets"]
        if agg_type == 'missing':
            field_info[field_name]["found"] = results['hits']['total'] - aggregation["doc_count"]
        if agg_type == 'cardinality':
            field_info[field_name]["distinct"] = aggregation["value"]

    context['field_info'] = sorted(field_info.items(), key=lambda val: -val[1]["found"])
    context['results'] = results
    return render(request, "stats.html", context=context)