Exemplo n.º 1
0
def qa_matrix(index, sources_config):
    """Print, per source, how many documents populate each combined-index field.

    A terms aggregation on ``meta.source_id`` first counts all documents per
    source. The same aggregation is then repeated once for every field in
    ``BaseItem.combined_index_fields`` with an ``exists`` filter on that
    field. The collected counts are pretty-printed and then rendered as a
    table with 24-character, left-aligned columns.

    Args:
        index: Index name handed to ``es.search``.
        sources_config: Passed to ``load_sources_config``; the number of
            sources it returns is used as the ``size`` of the aggregation.

    Returns:
        None. All results are written to standard output.
    """
    # Sorted once so the header and every data row share one column order.
    field_names = sorted(BaseItem.combined_index_fields)
    sources = load_sources_config(sources_config)

    # Request fragments shared by the unfiltered and the per-field searches.
    match_all = {'constant_score': {'query': {'match_all': {}}}}
    per_source_agg = {
        'source_id': {
            'terms': {'field': 'meta.source_id', 'size': len(sources)},
        }
    }

    all_result = es.search(index=index, body={
        'query': match_all,
        'aggs': per_source_agg,
        'size': 0,
    })
    all_counts = {
        bucket['key']: {'all': bucket['doc_count']}
        for bucket in all_result['aggregations']['source_id']['buckets']
    }

    for field in field_names:
        result = es.search(index=index, body={
            'query': {
                'filtered': {
                    'query': match_all,
                    'filter': {'exists': {'field': field}},
                }
            },
            'aggs': per_source_agg,
            'size': 0,
        })
        for bucket in result['aggregations']['source_id']['buckets']:
            all_counts[bucket['key']][field] = bucket['doc_count']

    pprint(all_counts)

    row_format = "{:<24}" * (len(field_names) + 2)
    # The two fixed labels stay in front of the sorted field names, matching
    # the data rows below. Wrapping the whole formatted string in the print
    # parentheses keeps the line valid whether ``print`` is the statement or
    # the function.
    print(row_format.format('Source', 'all', *field_names))
    for source_id, counts in all_counts.items():
        # Sources without a bucket for a field have no entry for it: show 0.
        print(row_format.format(
            source_id, counts['all'],
            *[counts.get(name, 0) for name in field_names]))
Exemplo n.º 2
0
    def api_request(self, index_name, doc_type, query=None, *args, **kwargs):
        """Assemble a search request body from keyword arguments and run it.

        A keyword argument whose name matches one of the top-level request
        settings ("filters", "from", "size", "sort", "order") replaces that
        setting. Every other keyword argument is stored under "filters":
        strings and lists are wrapped as ``{"terms": [...]}``, anything else
        is stored unchanged. ``doc_type`` is always applied as the "@type"
        keyword, overriding any "@type" supplied by the caller. Extra
        positional arguments are accepted but not used.

        Returns:
            Whatever ``es.search`` returns for ``index_name`` and the
            assembled body.
        """
        request_body = {
            "filters": {},
            "from": 0,
            "size": 10,
            "sort": "_score",
            "order": "asc"
        }

        kwargs['@type'] = doc_type

        if query is not None:
            request_body["query"] = query

        for name, value in kwargs.iteritems():
            # Known top-level settings are overridden directly.
            if name in request_body:
                request_body[name] = value
                continue

            # Everything else is a filter; normalise scalars and lists to
            # the "terms" form and pass other values through untouched.
            if isinstance(value, basestring):
                value = {"terms": [value]}
            elif isinstance(value, list):
                value = {"terms": value}
            request_body["filters"][name] = value

        return es.search(index=index_name, body=request_body)
Exemplo n.º 3
0
    def enrich_item(self, enrichments, object_id, combined_index_doc, doc):
        """Percolate the document's sub-items and record the matching queries.

        Each entry of ``combined_index_doc['item']['items']`` whose "@type"
        (defaulting to "Note") is listed in
        ``settings.ENRICHER_PERCOLATOR_AS2_TYPES`` is sent as a percolate
        query to ``settings.COMBINED_INDEX``. When the response reports a
        positive hit total, a list of
        ``<settings.ENRICHER_PERCOLATOR_BASE_HREF>/<hit _id>`` strings is
        stored in ``enrichments['percolations']`` under the entry's "@id".

        ``object_id`` and ``doc`` are not used here.

        Returns:
            The ``enrichments`` mapping that was passed in, with its
            'percolations' key reset and repopulated.
        """
        enrichments['percolations'] = {}

        sub_items = combined_index_doc.get('item', {}).get('items', [])
        for sub_item in sub_items:
            as2_type = sub_item.get('@type', 'Note')
            if as2_type not in settings.ENRICHER_PERCOLATOR_AS2_TYPES:
                continue

            percolate_clause = {
                "field": "query",
                "document_type": as2_type,
                "document": {'item': sub_item}
            }
            response = es.search(
                index=settings.COMBINED_INDEX,
                body={"query": {"percolate": percolate_clause}})

            total_hits = response.get('hits', {}).get('total', 0)
            if total_hits > 0:
                # One link per stored query that matched this sub-item.
                links = []
                for hit in response['hits']['hits']:
                    links.append('%s/%s' % (
                        settings.ENRICHER_PERCOLATOR_BASE_HREF, hit['_id']))
                enrichments['percolations'][sub_item['@id']] = links

        log.info('Percolation final results:')
        log.info(enrichments['percolations'])
        return enrichments
Exemplo n.º 4
0
def qa_matrix(index, sources_config):
    """Print a source-by-field matrix of document counts for an index.

    The function first runs a terms aggregation on ``meta.source_id`` over
    all documents to get the total per source. It then runs the same
    aggregation once per field in ``BaseItem.combined_index_fields``,
    restricted by an ``exists`` filter on that field. The resulting counts
    are pretty-printed and finally printed as a table whose columns are
    left-aligned and 24 characters wide.

    Args:
        index: Index name handed to ``es.search``.
        sources_config: Passed to ``load_sources_config``; the number of
            sources it returns is used as the ``size`` of the aggregation.

    Returns:
        None. All results are written to standard output.
    """
    sources = load_sources_config(sources_config)
    # A single sorted list of field names drives both the header and the
    # data rows, so the columns cannot drift apart.
    columns = sorted(BaseItem.combined_index_fields)

    # Pieces of the request body common to every search issued below.
    everything = {'constant_score': {'query': {'match_all': {}}}}
    by_source = {
        'source_id': {
            'terms': {
                'field': 'meta.source_id',
                'size': len(sources)
            },
        }
    }

    totals = es.search(index=index, body={
        'query': everything,
        'aggs': by_source,
        'size': 0
    })
    all_counts = {}
    for bucket in totals['aggregations']['source_id']['buckets']:
        all_counts[bucket['key']] = {'all': bucket['doc_count']}

    for column in columns:
        body = {
            'query': {
                'filtered': {
                    'query': everything,
                    'filter': {
                        'exists': {
                            'field': column
                        }
                    }
                }
            },
            'aggs': by_source,
            'size': 0
        }
        result = es.search(index=index, body=body)
        for bucket in result['aggregations']['source_id']['buckets']:
            all_counts[bucket['key']][column] = bucket['doc_count']

    pprint(all_counts)

    line_template = "{:<24}" * (len(columns) + 2)
    # 'Source' and 'all' are kept ahead of the sorted field names instead of
    # being sorted in with them, mirroring the layout of the data rows. The
    # formatted string is built inside the print parentheses so the call
    # behaves the same with the print statement and the print function.
    print(line_template.format(*(['Source', 'all'] + columns)))
    for source_id, counts in all_counts.items():
        # A field with no bucket for this source is reported as 0.
        cells = [source_id, counts['all']]
        cells.extend(counts.get(column, 0) for column in columns)
        print(line_template.format(*cells))