def match_query(proxy, source_collection_id=None, collection_ids=None, query=None):
    """Given a document or entity in indexed form, build a query that
    will find similar entities based on a variety of criteria."""
    if query is None:
        query = bool_query()

    # Don't match the query entity and source collection_id:
    must_not = []
    if proxy.id is not None:
        must_not.append({"ids": {"values": [proxy.id]}})
    if source_collection_id is not None:
        must_not.append({'term': {'collection_id': source_collection_id}})
    if len(must_not):
        query['bool']['must_not'].extend(must_not)

    collection_ids = ensure_list(collection_ids)
    if len(collection_ids):
        query['bool']['filter'].append({
            'terms': {'collection_id': collection_ids}
        })

    filters = []
    for (prop, value) in proxy.itervalues():
        specificity = prop.specificity(value)
        if specificity > 0:
            filters.append((prop, value, specificity))

    filters = sorted(filters, key=lambda p: p[2], reverse=True)
    required = []
    for (prop, value, specificity) in filters:
        if prop.type in REQUIRED and len(required) <= MAX_CLAUSES:
            required.extend(_make_queries(prop, value, specificity))

    scoring = []
    for (prop, value, specificity) in filters:
        clauses = len(required) + len(scoring)
        if prop.type not in REQUIRED and clauses <= MAX_CLAUSES:
            scoring.extend(_make_queries(prop, value, specificity))

    if not len(required):
        # e.g. a document from which no features have been extracted.
        return none_query()

    # make it mandatory to have at least one match
    query['bool']['must'].append({
        'bool': {
            'should': required,
            'minimum_should_match': 1
        }
    })
    query['bool']['should'].extend(scoring)
    return query
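# The builders in this file mutate a bool-query skeleton returned by
# bool_query() and bail out via none_query() when no criteria can be derived.
# Those helpers are defined elsewhere in the codebase; the sketch below only
# shows the shape this code assumes and may differ from the real helpers.
def _bool_query_sketch():
    # Empty Elasticsearch bool query with all four clause arms present,
    # so callers can append to 'must', 'must_not', 'should' and 'filter'
    # without first checking that the keys exist.
    return {
        'bool': {
            'must': [],
            'must_not': [],
            'should': [],
            'filter': []
        }
    }


def _none_query_sketch():
    # A query guaranteed to match no documents, used as an early exit
    # when an entity yields no usable matching criteria.
    query = _bool_query_sketch()
    query['bool']['must'].append({'match_none': {}})
    return query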
def match_query(proxy, collection_ids=None, query=None):
    """Given a document or entity in indexed form, build a query that
    will find similar entities based on a variety of criteria."""
    if query is None:
        query = bool_query()

    # Don't match the query entity:
    if proxy.id is not None:
        sq = {"ids": {"values": [proxy.id]}}
        query['bool']['must_not'].append(sq)

    collection_ids = ensure_list(collection_ids)
    if len(collection_ids):
        query['bool']['filter'].append({
            'terms': {'collection_id': collection_ids}
        })

    filters = []
    for (prop, value) in proxy.itervalues():
        specificity = prop.specificity(value)
        if specificity > 0:
            filters.append((prop, value, specificity))

    filters = sorted(filters, key=lambda p: p[2], reverse=True)
    required = []
    for (prop, value, specificity) in filters:
        if prop.type in REQUIRED and len(required) <= MAX_CLAUSES:
            required.extend(_make_queries(prop, value, specificity))

    scoring = []
    for (prop, value, specificity) in filters:
        clauses = len(required) + len(scoring)
        if prop.type not in REQUIRED and clauses <= MAX_CLAUSES:
            scoring.extend(_make_queries(prop, value, specificity))

    if not len(required):
        # e.g. a document from which no features have been extracted.
        return none_query()

    # make it mandatory to have at least one match
    query['bool']['must'].append({
        'bool': {
            'should': required,
            'minimum_should_match': 1
        }
    })
    query['bool']['should'].extend(scoring)
    return query
def match_query(proxy, collection_ids=None, query=None):
    """Given a document or entity in indexed form, build a query that
    will find similar entities based on a variety of criteria."""
    if query is None:
        query = bool_query()

    # Don't match the query entity:
    if proxy.id is not None:
        sq = {"ids": {"values": [proxy.id]}}
        query['bool']['must_not'].append(sq)

    collection_ids = ensure_list(collection_ids)
    if len(collection_ids):
        query['bool']['filter'].append({
            'terms': {'collection_id': collection_ids}
        })

    required = []
    scoring = []
    for (prop, value) in proxy.itervalues():
        queries = list(_make_queries(prop, value))
        if prop.type in REQUIRED:
            required.extend(queries)
        else:
            scoring.extend(queries)

    if not len(required):
        # e.g. a document from which no features have been extracted.
        return none_query()

    # make it mandatory to have at least one match
    query['bool']['must'].append({
        'bool': {
            'should': required,
            'minimum_should_match': 1
        }
    })
    query['bool']['should'].extend(scoring)
    return query
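# The _make_queries() helper used by the variants above is not part of this
# excerpt. Purely as an illustration (not the actual implementation), a helper
# of that shape could yield one boosted clause per property value, keyed on
# the group field of the property's type and weighted by specificity:
def _make_queries_sketch(prop, value, specificity=1.0):
    group = prop.type.group
    if group is None:
        return
    yield {
        'match': {
            group: {
                'query': value,
                'operator': 'and',
                'boost': specificity
            }
        }
    }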
def iter_entities_by_ids(ids, authz=None):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    for i in range(0, len(ids), MAX_PAGE):
        chunk = ids[i:i + MAX_PAGE]
        if not len(chunk):
            return
        query = bool_query()
        query['bool']['filter'].append({'ids': {'values': chunk}})
        if authz is not None:
            query['bool']['filter'].append(authz_query(authz))
        includes = ['schema', 'properties', 'collection_id', 'created_at']
        query = {
            'query': query,
            '_source': {'includes': includes},
            'size': min(MAX_PAGE, len(chunk) * 2)
        }
        result = search_safe(index=entity_index(),
                             body=query,
                             request_cache=False)
        for doc in result.get('hits', {}).get('hits', []):
            entity = unpack_result(doc)
            if entity is not None:
                yield entity
def entities_by_ids(ids, authz=None, schemata=None):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    for i in range(0, len(ids), MAX_PAGE):
        chunk = ids[i:i + MAX_PAGE]
        if not len(chunk):
            return
        query = bool_query()
        query['bool']['filter'].append({'ids': {'values': chunk}})
        if authz is not None:
            query['bool']['filter'].append(authz_query(authz))
        query = {
            'query': query,
            '_source': {'excludes': ['text']},
            'size': min(MAX_PAGE, len(chunk))
        }
        index = entities_read_index(schema=schemata)
        result = search_safe(index=index, body=query, ignore=[404])
        for doc in result.get('hits', {}).get('hits', []):
            entity = unpack_result(doc)
            if entity is not None:
                yield entity
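# Hypothetical usage sketch for the paginated ID lookup above. The schema
# name is invented for illustration; the generator transparently splits
# large ID lists into MAX_PAGE-sized search requests and yields unpacked
# entity documents one by one.
def _example_fetch(entity_ids, authz=None):
    found = {}
    for entity in entities_by_ids(entity_ids, authz=authz, schemata='Thing'):
        found[entity['id']] = entity
    return found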
def match_query(proxy, collection_ids=None, query=None):
    """Given a document or entity in indexed form, build a query that
    will find similar entities based on a variety of criteria."""
    if query is None:
        query = bool_query()

    # Don't match the query entity:
    if proxy.id is not None:
        sq = {"ids": {"values": [proxy.id]}}
        query['bool']['must_not'].append(sq)

    # Attempt to find only matches within the "matchable" set of
    # entity schemata. For example, a Company can be matched to
    # another company or a LegalEntity, but not a Person.
    # Real estate is "unmatchable", i.e. even if two plots of land
    # have almost the same name and criteria, it does not make
    # sense to suggest they are the same.
    if proxy.schema.name != Entity.THING:
        matchable = [s.name for s in proxy.schema.matchable_schemata]
        if not len(matchable):
            return none_query()
        query['bool']['must'].append({
            "terms": {"schema": matchable}
        })

    collection_ids = ensure_list(collection_ids)
    if len(collection_ids):
        query['bool']['must'].append({
            'terms': {'collection_id': collection_ids}
        })

    required = []
    for name in proxy.names:
        required.append({
            'match': {
                'names.text': {
                    'query': name,
                    'operator': 'and',
                    'minimum_should_match': '60%',
                }
            }
        })
        fp = fingerprints.generate(name)
        if fp is not None:
            required.append({
                'match': {
                    'fingerprints': {
                        'query': fp,
                        'fuzziness': 1,
                        'operator': 'and',
                        'boost': 3.0
                    }
                }
            })

    for type_ in registry.types:
        if not type_.strong or type_.group is None:
            continue
        for value in proxy.get_type_values(type_):
            required.append({
                'term': {
                    type_.group: {
                        'value': value,
                        'boost': 3.0
                    }
                }
            })

    if not len(required):
        # e.g. a document from which no features have been extracted.
        return none_query()

    # make it mandatory to have at least one match
    query['bool']['must'].append({
        "bool": {
            "should": required,
            "minimum_should_match": 1
        }
    })
    return query
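# Hypothetical usage sketch: build a cross-reference query for an ad-hoc
# entity. The entity data and collection IDs are invented for illustration;
# followthemoney's model.get_proxy() produces the proxy object whose names,
# schema and type values match_query() introspects.
from followthemoney import model


def _example_match_query():
    proxy = model.get_proxy({
        'id': 'deadbeef',
        'schema': 'Company',
        'properties': {'name': ['Siemens AG']}
    })
    # Restrict candidate matches to two (made-up) collections:
    return match_query(proxy, collection_ids=[1, 2])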