def suggest_entities(args):
    """Auto-complete API.

    Reads ``prefix`` and ``min_count`` from the request-args mapping and
    returns up to five matching entities, most-referenced first.
    """
    prefix = args.get('prefix')
    threshold = int(args.get('min_count', 0))
    matches = []
    if prefix is not None and prefix.strip():
        inner = {
            'bool': {
                'must': [
                    {'match_phrase_prefix': {'terms': prefix.strip()}},
                    {'range': {'doc_count': {'gte': threshold}}}
                ]
            }
        }
        body = {
            'size': 5,
            'sort': [{'doc_count': 'desc'}, '_score'],
            'query': authz_collections_filter(inner),
            '_source': ['name', '$schema', 'terms', 'doc_count']
        }
        # Latinise the raw prefix so we can flag exact term matches below.
        ref = latinize_text(prefix)
        response = get_es().search(index=get_es_index(), doc_type=TYPE_ENTITY,
                                   body=body)
        for hit in response.get('hits', {}).get('hits', []):
            ent = hit.get('_source')
            latinized = [latinize_text(t) for t in ent.pop('terms', [])]
            ent['match'] = ref in latinized
            ent['id'] = hit.get('_id')
            matches.append(ent)
    return {'text': prefix, 'results': matches}
def suggest_entities(prefix, min_count=0, schemas=None, size=5):
    """Auto-complete API.

    Suggest entities whose terms start with ``prefix``, optionally filtered
    by minimum reference count and by schema.
    """
    results = []
    if prefix is not None and prefix.strip():
        query = {'match_phrase_prefix': {'terms': prefix.strip()}}
        if min_count > 0:
            query = add_filter(query,
                               {'range': {'doc_count': {'gte': min_count}}})
        if schemas is not None and len(schemas):
            query = add_filter(query, {'terms': {'$schema': schemas}})
        body = {
            'size': size,
            'sort': [{'doc_count': 'desc'}, '_score'],
            'query': authz_filter(query),
            '_source': ['name', '$schema', 'terms', 'doc_count']
        }
        # Latinised prefix is compared against latinised terms to mark
        # exact matches in the result set.
        ref = latinize_text(prefix)
        response = get_es().search(index=get_es_index(), doc_type=TYPE_ENTITY,
                                   body=body)
        for hit in response.get('hits', {}).get('hits', []):
            ent = hit.get('_source')
            latinized = [latinize_text(t) for t in ent.pop('terms', [])]
            ent['match'] = ref in latinized
            ent['score'] = hit.get('_score')
            ent['id'] = hit.get('_id')
            results.append(ent)
    return {'prefix': prefix, 'results': results}
def suggest_entities(prefix, authz, min_count=0, schemas=None, size=5):
    """Auto-complete API.

    Suggest entities whose name starts with ``prefix``, restricted to the
    collections the given ``authz`` may read.
    """
    results = []
    if prefix is not None and prefix.strip():
        query = {'match_phrase_prefix': {'name': prefix.strip()}}
        if min_count > 0:
            query = add_filter(query,
                               {'range': {'doc_count': {'gte': min_count}}})
        if schemas is not None and len(schemas):
            query = add_filter(query, {'terms': {'$schema': schemas}})
        # TODO: is this correct? should we allow filter by dataset entities?
        query = add_filter(query,
                           {'terms': {'collection_id': authz.collections_read}})
        body = {
            'size': size,
            'sort': [{'doc_count': 'desc'}, '_score'],
            'query': query,
            '_source': ['name', 'schema', 'fingerprints', 'doc_count']
        }
        ref = latinize_text(prefix)
        response = es.search(index=es_index, doc_type=TYPE_ENTITY, body=body)
        for hit in response.get('hits', {}).get('hits', []):
            ent = hit.get('_source')
            # Exact-match flag: latinised prefix found among fingerprints.
            prints = [latinize_text(t) for t in ent.pop('fingerprints', [])]
            ent['match'] = ref in prints
            ent['score'] = hit.get('_score')
            ent['id'] = hit.get('_id')
            results.append(ent)
    return {'prefix': prefix, 'results': results}
def index_document(document, index_records=True):
    """Index a document's metadata, full text and entity mentions.

    When ``index_records`` is true, the per-page/per-row records are
    cleared and regenerated as well.
    """
    log.info("Index document: %r", document)
    payload = document.to_index_dict()
    payload['text'] = get_text(document)
    payload['entities'] = generate_entities(document)
    payload['title_latin'] = latinize_text(payload.get('title'))
    payload['summary_latin'] = latinize_text(payload.get('summary'))
    es.index(index=es_index, doc_type=TYPE_DOCUMENT, body=payload,
             id=document.id)
    if index_records:
        clear_records(document.id)
        bulk_op(generate_records(document))
def index_document(document, index_records=True):
    """Index a document's metadata and entity mentions.

    When ``index_records`` is true, the document's page/row records are
    cleared and re-generated via a bulk operation.
    """
    log.info("Index document: %r", document)
    payload = document.to_index_dict()
    payload['entities'] = generate_entities(document)
    payload['title_latin'] = latinize_text(payload.get('title'))
    payload['summary_latin'] = latinize_text(payload.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                   body=payload, id=document.id)
    if index_records:
        clear_records(document.id)
        bulk_op(generate_records(document))
def index_entity(entity):
    """Index an entity, including latinised variants of its text fields."""
    body = entity.to_dict()
    # The id is passed separately to ES; do not store it in the source.
    body.pop('id', None)
    body['doc_count'] = get_count(entity)
    body['terms'] = entity.terms
    body['terms_latin'] = [latinize_text(term) for term in entity.terms]
    body['name_latin'] = latinize_text(body.get('name'))
    body['summary_latin'] = latinize_text(body.get('summary'))
    body['description_latin'] = latinize_text(body.get('description'))
    body = expand_json(body)
    get_es().index(index=get_es_index(), doc_type=TYPE_ENTITY,
                   id=entity.id, body=body)
def generate_records(document):
    """Generate index records, based on document rows or pages."""
    if document.type == Document.TYPE_TEXT:
        for page in document.pages:
            # Stable record id derived from document and page ids.
            # NOTE(review): sha1(str(...)) assumes py2-style str/bytes —
            # confirm against the runtime this module targets.
            tid = sha1(str(document.id))
            tid.update(str(page.id))
            tid = tid.hexdigest()
            text = string_value(page.text)
            latin = latinize_text(text)
            source = {
                'type': 'page',
                'content_hash': document.content_hash,
                'document_id': document.id,
                'collection_id': document.collection_id,
                'page': page.number,
                'text': text,
                'text_latin': latin
            }
            yield {
                '_id': tid,
                '_type': TYPE_RECORD,
                '_index': six.text_type(es_index),
                '_parent': document.id,
                '_source': source
            }
    elif document.type == Document.TYPE_TABULAR:
        for record in document.records:
            data = {k: string_value(v) for (k, v) in record.data.items()}
            text = [v for v in data.values() if v is not None]
            # Only keep latinised forms that add something new.
            latin = [latinize_text(t) for t in text]
            latin = [t for t in latin if t not in text]
            source = {
                'type': 'row',
                'content_hash': document.content_hash,
                'document_id': document.id,
                'collection_id': document.collection_id,
                'row_id': record.row_id,
                'sheet': record.sheet,
                'text': text,
                'text_latin': latin,
                'raw': data
            }
            yield {
                '_id': record.tid,
                '_type': TYPE_RECORD,
                '_index': six.text_type(es_index),
                '_parent': document.id,
                '_source': source
            }
def index_document(document_id):
    """Load a document by id and (re-)index it along with its records."""
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    payload = document.to_index_dict()
    payload['entities'] = generate_entities(document)
    payload['title_latin'] = latinize_text(payload.get('title'))
    payload['summary_latin'] = latinize_text(payload.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                   body=payload, id=document.id)
    clear_records(document)
    bulk_op(generate_records(document))
def get_text(document):
    """Generate an array with the full text of the given document.

    This will limit document length to TEXT_MAX_LEN in order to avoid
    uploading extremely long documents.
    """
    texts = []
    for text in document.text_parts():
        text = string_value(text)
        # string_value can yield None (other callers in this module filter
        # it out explicitly); skip such parts so the length accounting
        # below never sees a non-string.
        if text is None:
            continue
        texts.append(text)
        latin = latinize_text(text)
        # Only store the latinised variant when it differs from the raw
        # text, to avoid pure duplicates.
        if latin != text:
            texts.append(latin)
    text_len = sum(len(t) for t in texts)
    # First, try getting rid of duplicate entries, which are more likely
    # in tabular documents. If that does not help, the (possibly still
    # over-long) text is returned as-is, matching the original behaviour.
    if text_len >= TEXT_MAX_LEN:
        texts = list(set(texts))
    return texts
def generate_records(document):
    """Yield ES bulk actions for a document's pages or tabular rows."""
    if document.type == Document.TYPE_TEXT:
        for page in document.pages:
            # Stable record id derived from document and page ids.
            tid = sha1(str(document.id))
            tid.update(str(page.id))
            tid = tid.hexdigest()
            yield {
                '_id': tid,
                '_type': TYPE_RECORD,
                '_index': get_es_index(),
                '_parent': document.id,
                '_source': {
                    'type': 'page',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'collection_id': document.collection_ids,
                    'page': page.number,
                    'text': page.text,
                    'text_latin': latinize_text(page.text)
                }
            }
    elif document.type == Document.TYPE_TABULAR:
        for record in document.records:
            row_text = record.text
            # Drop latinised strings that duplicate the raw row text.
            latinized = [latinize_text(t) for t in row_text]
            latinized = [t for t in latinized if t not in row_text]
            yield {
                '_id': record.tid,
                '_type': TYPE_RECORD,
                '_index': get_es_index(),
                '_parent': document.id,
                '_source': {
                    'type': 'row',
                    'content_hash': document.content_hash,
                    'document_id': document.id,
                    'collection_id': document.collection_ids,
                    'row_id': record.row_id,
                    'sheet': record.sheet,
                    'text': row_text,
                    'text_latin': latinized,
                    'raw': record.data
                }
            }
def pending(id):
    """List up to 30 pending entities of a readable collection."""
    collection = obj_or_404(Collection.by_id(id))
    authz.require(authz.collection_read(id))
    query = collection.pending_entities().limit(30)
    results = []
    for entity in query.all():
        row = entity.to_dict()
        row['name_latin'] = latinize_text(entity.name, lowercase=False)
        results.append(row)
    return jsonify({'results': results, 'total': len(results)})
def pending(id):
    """List up to 30 pending entities of a collection the user can read."""
    collection = obj_or_404(Collection.by_id(id))
    request.authz.require(request.authz.collection_read(collection))
    query = collection.pending_entities().limit(30)
    results = []
    for entity in query.all():
        row = entity.to_dict()
        row['name_latin'] = latinize_text(entity.name)
        results.append(row)
    return jsonify({'results': results, 'total': len(results)})
def similar_entities(entity, args, collections):
    """Merge suggestions API.

    Fuzzy-match the entity's terms (raw and latinised) against other
    entities in the given collections, excluding the entity itself.
    """
    shoulds = []
    for term in entity.terms:
        # One clause on the raw fields, one on their latinised variants.
        variants = (
            (["name^50", "terms^25", "summary^5"], term),
            (["name_latin^10", "terms_latin^5", "summary_latin"],
             latinize_text(term)),
        )
        for fields, query_text in variants:
            shoulds.append({
                'multi_match': {
                    "fields": fields,
                    "query": query_text,
                    "fuzziness": 2
                }
            })
    inner = {
        "bool": {
            "should": shoulds,
            "must_not": {"ids": {"values": [entity.id]}},
            "must": {"terms": {"collection_id": collections}},
            "minimum_should_match": 1
        }
    }
    body = {
        'size': 10,
        'query': authz_filter(inner),
        '_source': DEFAULT_FIELDS
    }
    results = []
    response = get_es().search(index=get_es_index(), doc_type=TYPE_ENTITY,
                               body=body)
    for hit in response.get('hits', {}).get('hits', []):
        data = hit.get('_source')
        data['id'] = hit.get('_id')
        data['score'] = hit.get('_score')
        data['api_url'] = url_for('entities_api.view', id=hit.get('_id'))
        results.append(data)
    return {'results': results}
def text_query_string(text, literal=False):
    """Build a ``query_string`` clause over record text fields.

    An empty or missing query falls back to match-all; ``literal`` wraps
    the latinised text in quotes for phrase matching.
    """
    if text is None or not text.strip():
        return match_all()
    if literal:
        text = '"%s"' % latinize_text(text)
    return {
        'query_string': {
            'query': text,
            'fields': ['text^6', 'text_latin^2'],
            'default_operator': 'AND',
            'use_dis_max': True
        }
    }
def index_document(document_id, index_records=True):
    """Fetch a document by id and index it; stream records via bulk()."""
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    payload = document.to_index_dict()
    payload['entities'] = generate_entities(document)
    payload['title_latin'] = latinize_text(payload.get('title'))
    payload['summary_latin'] = latinize_text(payload.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                   body=payload, id=document.id)
    if index_records:
        clear_records(document)
        bulk(get_es(), generate_records(document), stats_only=True,
             chunk_size=1000, request_timeout=500.0)
def pending():
    """Return the most heavily referenced pending entity the user can read."""
    query = db.session.query(Entity)
    query = query.filter(Entity.state == Entity.STATE_PENDING)
    readable = Collection.id.in_(authz.collections(authz.READ))
    query = query.filter(Entity.collections.any(readable))
    # Order by the total weight of references pointing at each entity.
    ref = aliased(Reference)
    query = query.join(ref).group_by(Entity)
    query = query.order_by(func.sum(ref.weight).desc())
    entity = query.first()
    if entity is None:
        return jsonify({'empty': True})
    data = entity.to_dict()
    data['name_latin'] = latinize_text(data['name'], lowercase=False)
    return jsonify(data)
def meta_query_string(text, literal=False):
    """Build a ``query_string`` clause over document metadata fields.

    An empty or missing query falls back to match-all; ``literal`` wraps
    the latinised text in quotes for phrase matching.
    """
    if text is None or not text.strip():
        return match_all()
    if literal:
        text = '"%s"' % latinize_text(text)
    return {
        "query_string": {
            "query": text,
            "fields": ['title^15', 'file_name', 'summary^10',
                       'title_latin^12', 'summary_latin^8'],
            "default_operator": "AND",
            "use_dis_max": True
        }
    }
def similar_entities(entity, args):
    """Merge suggestions API.

    Fuzzy-match the entity's terms (raw and latinised) against other
    entities, excluding the entity itself.
    """
    shoulds = []
    for term in entity.terms:
        # One clause on the raw fields, one on their latinised variants.
        variants = (
            (["name^50", "terms^25", "summary^5"], term),
            (["name_latin^10", "terms_latin^5", "summary_latin"],
             latinize_text(term)),
        )
        for fields, query_text in variants:
            shoulds.append({
                'multi_match': {
                    "fields": fields,
                    "query": query_text,
                    "fuzziness": 2
                }
            })
    inner = {
        "bool": {
            "should": shoulds,
            "must_not": {"ids": {"values": [entity.id]}},
            "minimum_should_match": 1
        }
    }
    body = {
        'size': 10,
        'query': authz_collections_filter(inner),
        '_source': DEFAULT_FIELDS
    }
    results = []
    response = get_es().search(index=get_es_index(), doc_type=TYPE_ENTITY,
                               body=body)
    for hit in response.get('hits', {}).get('hits', []):
        data = hit.get('_source')
        data['id'] = hit.get('_id')
        data['score'] = hit.get('_score')
        data['api_url'] = url_for('entities_api.view', id=hit.get('_id'))
        results.append(data)
    return {'results': results}
def pending():
    """Return a random sample of up to 30 pending, readable entities."""
    query = db.session.query(Entity)
    # Allow the caller to exclude entities it has already reviewed.
    skip_entities = request.args.getlist('skip')
    if skip_entities:
        query = query.filter(not_(Entity.id.in_(skip_entities)))
    query = query.filter(Entity.state == Entity.STATE_PENDING)
    readable = Collection.id.in_(authz.collections(authz.READ))
    query = query.filter(Entity.collections.any(readable))
    # this was too slow to actually work:
    # ref = aliased(Reference)
    # q = q.join(ref)
    # q = q.group_by(Entity)
    # q = q.order_by(func.count(ref.id).desc())
    query = query.order_by(func.random())
    query = query.limit(30)
    results = []
    for entity in query.all():
        data = entity.to_dict()
        data['name_latin'] = latinize_text(entity.name, lowercase=False)
        results.append(data)
    return jsonify({'results': results, 'total': len(results)})
def normalize_value(self, value):
    """Collapse whitespace and return a (normalized, latinized) pair."""
    cleaned = collapse_spaces(value)
    return cleaned, latinize_text(cleaned)
def finalize_index(data, schema):
    """Apply final denormalisations to the index.

    Builds the full-text field, fingerprints, inverted per-type fields,
    latinised names and the schemata hierarchy for an entity document
    prior to indexing. Mutates and returns ``data``.
    """
    properties = data.get('properties', {})
    # Collect every property value (and its latinised form) as full text.
    text = set()
    for vs in properties.values():
        for v in ensure_list(vs):
            v = string_value(v)
            # Skip empty/invalid values and single characters.
            if v is None or len(v) < 2:
                continue
            v = v.strip()
            text.add(v)
            v = latinize_text(v)
            text.add(v)
            # v = category_replace(v)
            # text.add(v)
    data['text'] = list(text)
    data['fingerprints'] = data.get('fingerprints', [])
    # Generate inverted representations of the data stored in properties.
    for prop in schema.properties:
        values = properties.get(prop.name, [])
        if not len(values):
            continue
        # Find and set the name property.
        if prop.is_label:
            data['name'] = values[0]
        # Generate key material.
        # TODO: this should probably be record-based.
        data['fingerprints'].extend(prop.type.fingerprint(values))
        # Add inverted properties. This takes all the properties
        # of a specific type (names, dates, emails etc.)
        invert = prop.type.index_invert
        if invert:
            if invert not in data:
                data[invert] = []
            for norm in prop.type.normalize(values):
                if norm not in data[invert]:
                    data[invert].append(norm)
    # De-duplicate fingerprints gathered across all properties.
    data['fingerprints'] = list(set(data['fingerprints']))
    # Add latinised names.
    names = data.get('names', [])
    for name in list(names):
        names.append(latinize_text(name))
    data['names'] = list(set(names))
    # Get implied schemata (i.e. parents of the actual schema).
    data['schema'] = schema.name
    data['schemata'] = []
    for parent in schema.schemata:
        if not parent.hidden:
            data['schemata'].append(parent.name)
    # Second name field for non-tokenised sorting.
    if 'name' in data:
        data['name_sort'] = data.get('name')
    return data