def doctype_generator(doctype):
    '''Lazily yield every document whose doctype field equals `doctype`.

    Input
    ---
    doctype: string
        The document type to iterate over

    Yields
    ---
    The documents produced by `_scroll_query` for a term query on the
    doctype field, in the order `_scroll_query` returns them.

    Nothing is yielded (and a warning is logged) when no database instance
    is available or when `check_mapping` does not report a known mapping.
    '''
    # Same up-front guard the sibling doctype_* helpers use, so no mapping
    # lookup or scroll is attempted without a database.
    if not _DATABASE_AVAILABLE:
        _logger.warning(
            "Could not get documents: No database instance available")
        return

    # Look the mapping up once; the result selects the field to filter on.
    mapping_kind = check_mapping(doctype)
    if mapping_kind == "mixed_mapping":
        field = "doctype.keyword"
    elif mapping_kind == "new_mapping":
        field = "doctype"
    else:
        # Covers None as well as any unrecognised result, which would
        # otherwise leave `field` unbound further down.
        _logger.warning(
            "Could not find mapping of doctype, please check whether you are using the correct doctype"
        )
        return

    query = {'query': {'term': {field: doctype}}}
    for num, doc in enumerate(_scroll_query(query)):
        # Re-checked per document so iteration stops if the flag is cleared
        # while the scroll is in progress.
        if not _DATABASE_AVAILABLE:
            _logger.warning(
                "Could not get documents: No database instance available")
            break
        _logger.info("returning %s", num)
        yield doc
def doctype_examples(doctype, field=None, seed=42, num=10):
    '''Return a randomly scored selection of documents of a given doctype.

    Input
    ---
    doctype: string
        The document type to draw examples from
    field: string, iterable of strings or None (default None)
        None returns the raw search hits. A string returns, per hit, the
        result of `_dotkeys(hit, field)`. Any other iterable returns, per
        hit, a dict mapping each listed key to `_dotkeys(hit, key)`.
    seed: int
        Seed handed to the `random_score` function of the search
    num: int
        The maximum number of documents to request

    Returns
    ---
    A list as described under `field`, or an empty list (after logging a
    warning) when no database instance is available or the doctype mapping
    cannot be determined.
    '''
    if not _DATABASE_AVAILABLE:
        _logger.warning(
            "Could not get example documents: No database instance available")
        return []

    # Look the mapping up once; the result selects the field to filter on.
    mapping_kind = check_mapping(doctype)
    if mapping_kind == "mixed_mapping":
        doctype_field = "doctype.keyword"
    elif mapping_kind == "new_mapping":
        doctype_field = "doctype"
    else:
        # Return an empty list like every other failure path, instead of the
        # None that `return _logger.warning(...)` used to produce.
        _logger.warning(
            "Could not find mapping of doctype, please check whether you are using the correct doctype"
        )
        return []

    docs = _client.search(index=_elastic_index,
                          body={
                              'size': num,
                              "query": {
                                  "function_score": {
                                      "query": {
                                          "term": {
                                              doctype_field: doctype
                                          }
                                      },
                                      "functions": [{
                                          "random_score": {
                                              "seed": seed
                                          }
                                      }]
                                  }
                              }
                          })
    hits = docs['hits']['hits']

    if not field:
        return hits
    if isinstance(field, str):
        return [_dotkeys(doc, field) for doc in hits]
    return [{fi: _dotkeys(doc, fi) for fi in field} for doc in hits]
def doctype_fields(doctype):
    '''Summarise the mapped fields of documents of `doctype`.

    The field names and types are read from the index mapping of the
    doctype. For every top-level mapped field except "META", one `exists`
    search counts the documents of this doctype that contain the field.

    Input
    ---
    doctype: string
        The document type to summarise

    Returns
    ---
    A dict of the form

        {fieldname: {'coverage': <share of documents having the field>,
                     'type': <mapped type, or 'unknown'>}}

    Coverage is reported as 0.0 when the doctype has no documents. An empty
    list is returned (after logging a warning) when no database instance is
    available or the doctype mapping cannot be determined.
    '''
    if not _DATABASE_AVAILABLE:
        _logger.warning(
            "Could not get document information: No database instance available"
        )
        return []

    # Look the mapping up once; the result selects the field to filter on.
    mapping_kind = check_mapping(doctype)
    if mapping_kind == "mixed_mapping":
        field = "doctype.keyword"
    elif mapping_kind == "new_mapping":
        field = "doctype"
    else:
        # Covers None as well as any unrecognised result, which would
        # otherwise leave `field` unbound further down.
        _logger.warning(
            "Could not find mapping of doctype, please check whether you are using the correct doctype"
        )
        return []

    doctype_filter = {'term': {field: doctype}}

    # NOTE(review): `hits.total` is used as a plain number here; confirm the
    # Elasticsearch version in use reports it that way.
    doc_num = _client.search(index=_elastic_index,
                             body={'query': doctype_filter})['hits']['total']
    total = float(doc_num)

    mappings = _client.indices.get_mapping(_elastic_index).get(
        _elastic_index, {}).get('mappings', {}).get(doctype,
                                                    {}).get('properties', {})

    summary = {}
    for key in mappings:
        if key == "META":
            continue
        present = _client.search(index=_elastic_index,
                                 body={
                                     'query': {
                                         'bool': {
                                             'filter': [{
                                                 'exists': {
                                                     'field': key
                                                 }
                                             }, doctype_filter]
                                         }
                                     }
                                 }).get('hits', {}).get('total', 0)
        summary[key] = {
            # Guard against a mapped doctype that holds no documents, which
            # used to raise ZeroDivisionError.
            'coverage': present / total if total else 0.0,
            'type': mappings[key].get('type', 'unknown')
        }
    return summary
def doctype_last(doctype, num=1, by_field="META.ADDED", query=None):
    '''Returns the last document(s) of a given doctype

    Input
    ---
    doctype: string
        The document type you wish to retrieve
    num: int
        The number of documents to retrieve
    by_field: string
        The datetime field by which to determine the last document
    query : string (default None)
        An Elasticsearch string query to filter results. The filter is
        applied on top of the doctype restriction.
        Example: query="user.screen_name:google"

    Returns
    ---
    A list of search hits sorted descending on `by_field`. An empty list is
    returned when no database instance is available, when `by_field` is not
    present in the mapping of the doctype, or when the doctype mapping
    cannot be determined.
    '''
    if not _DATABASE_AVAILABLE:
        _logger.warning(
            "Could not get last documents: No database instance available")
        return []

    # Nested fields sit under a 'properties' level in the index mapping, so
    # "META.ADDED" is looked up as "META.properties.ADDED".
    exotic_by_field = by_field.replace('.', '.properties.')
    _logger.debug("looking for %s", exotic_by_field)
    mapping = _client.indices.get_mapping()
    # Lazy %-formatting: the mapping is only rendered when debug is enabled.
    _logger.debug("Got mapping %s", mapping)
    target_key = "{index}.mappings.{doctype}.properties.{path}".format(
        index=_elastic_index, doctype=doctype, path=exotic_by_field)
    _logger.debug("Target key: %s", target_key)
    found_mapping = _dotkeys(mapping, target_key)
    _logger.debug("found mapping: %s", found_mapping)
    if not found_mapping:
        _logger.debug("Mapping not seen yet")
        return []

    # Look the mapping up once; the result selects the field to filter on.
    mapping_kind = check_mapping(doctype)
    if mapping_kind == "mixed_mapping":
        field = "doctype.keyword"
    elif mapping_kind == "new_mapping":
        field = "doctype"
    else:
        # Covers None as well as any unrecognised result, which would
        # otherwise leave `field` unbound further down.
        _logger.warning(
            "Could not find mapping of doctype, please check whether you are using the correct doctype"
        )
        return []

    doctype_filter = {"term": {field: doctype}}
    if query:
        _logger.debug("adding string query: %s", query)
        # Keep the doctype restriction and add the string query to it, so
        # the result is still "the last documents of this doctype".
        es_query = {
            "bool": {
                "must": [{
                    "query_string": {
                        "query": query
                    }
                }],
                "filter": [doctype_filter]
            }
        }
    else:
        es_query = doctype_filter

    body = {
        "sort": [{
            by_field: {
                "order": "desc"
            }
        }],
        "size": num,
        "query": es_query
    }

    # Send the body built above; previously a second, query-less body was
    # sent and the `query` argument was silently ignored.
    # NOTE(review): the [""] fallback for a response without hits is kept
    # from the original; confirm whether callers rely on it.
    docs = _client.search(index=_elastic_index,
                          body=body).get('hits', {}).get('hits', [""])
    return docs