Exemplo n.º 1
0
def _query_item(entity):
    """Yield scored cross-reference candidates for an indexed entity.

    A match query is built for ``entity`` and executed against the index
    returned by ``entities_read_index`` for the entity's matchable
    schemata. Each hit that unpacks successfully is turned into a proxy
    and compared with ``entity``; hits whose score reaches
    ``SCORE_CUTOFF`` are yielded as
    ``(score, entity, collection_id, match)``.

    Nothing is yielded if ``match_query`` produces the "none" query.
    """
    matcher = match_query(entity)
    if matcher == none_query():
        return

    body = {
        "query": matcher,
        "size": 100,
        "_source": {"includes": PROXY_INCLUDES},
    }
    schemata = list(entity.schema.matchable_schemata)
    index = entities_read_index(schema=schemata)
    response = es.search(index=index, body=body)
    for hit in response.get("hits").get("hits"):
        data = unpack_result(hit)
        if data is None:
            # The hit could not be unpacked; skip it.
            continue
        match = model.get_proxy(data)
        score = compare(model, entity, match)
        if score >= SCORE_CUTOFF:
            log.debug("Match: %s <[%.2f]> %s", entity.caption, score,
                      match.caption)
            yield score, entity, data.get("collection_id"), match
Exemplo n.º 2
0
def match_query(proxy, source_collection_id=None, collection_ids=None,
                query=None):
    """Build a query that finds entities similar to ``proxy``.

    ``query`` is used as the base bool query (a fresh ``bool_query()`` is
    created when it is omitted). The proxy's own id and, if given,
    ``source_collection_id`` are excluded via ``must_not``;
    ``collection_ids`` restricts results via a ``terms`` filter.

    Property values with a positive specificity are ranked from most to
    least specific. Those whose type is in ``REQUIRED`` become clauses of
    which at least one must match; the others only contribute ``should``
    clauses. Clause generation stops being extended once the running
    count exceeds ``MAX_CLAUSES``.

    Returns ``none_query()`` when no required clause could be generated.
    """
    if query is None:
        query = bool_query()

    # Never match the query entity itself or its source collection.
    exclusions = []
    if proxy.id is not None:
        exclusions.append({"ids": {"values": [proxy.id]}})
    if source_collection_id is not None:
        exclusions.append({'term': {'collection_id': source_collection_id}})
    if exclusions:
        query['bool']['must_not'].extend(exclusions)

    collection_ids = ensure_list(collection_ids)
    if len(collection_ids) > 0:
        collection_filter = {'terms': {'collection_id': collection_ids}}
        query['bool']['filter'].append(collection_filter)

    # Collect (property, value, specificity) triples worth querying for,
    # most specific first.
    ranked = []
    for prop, value in proxy.itervalues():
        weight = prop.specificity(value)
        if weight > 0:
            ranked.append((prop, value, weight))
    ranked.sort(key=lambda item: item[2], reverse=True)

    required = []
    for prop, value, weight in ranked:
        if prop.type not in REQUIRED or len(required) > MAX_CLAUSES:
            continue
        required.extend(_make_queries(prop, value, weight))

    scoring = []
    for prop, value, weight in ranked:
        total = len(required) + len(scoring)
        if prop.type in REQUIRED or total > MAX_CLAUSES:
            continue
        scoring.extend(_make_queries(prop, value, weight))

    if not required:
        # e.g. a document from which no features have been extracted.
        return none_query()

    # At least one of the required clauses has to match.
    at_least_one = {'bool': {'should': required, 'minimum_should_match': 1}}
    query['bool']['must'].append(at_least_one)
    query['bool']['should'].extend(scoring)
    return query
Exemplo n.º 3
0
Arquivo: match.py Projeto: pudo/aleph
def match_query(proxy, collection_ids=None, query=None):
    """Given a document or entity in indexed form, build a query that
    will find similar entities based on a variety of criteria.

    ``query`` is the base bool query to extend; a new ``bool_query()`` is
    created when it is ``None``. The proxy's own id is excluded through
    ``must_not`` and ``collection_ids`` (if any) is applied as a ``terms``
    filter on ``collection_id``.

    Property values with a positive specificity are sorted from most to
    least specific. Values whose property type is in ``REQUIRED`` produce
    the clauses of which at least one must match; all other values produce
    optional ``should`` clauses.

    Returns ``none_query()`` if no required clause was generated.
    """
    if query is None:
        query = bool_query()

    # Don't match the query entity:
    if proxy.id is not None:
        sq = {"ids": {"values": [proxy.id]}}
        query['bool']['must_not'].append(sq)

    collection_ids = ensure_list(collection_ids)
    if len(collection_ids):
        query['bool']['filter'].append({
            'terms': {'collection_id': collection_ids}
        })

    filters = []
    for (prop, value) in proxy.itervalues():
        specificity = prop.specificity(value)
        if specificity > 0:
            filters.append((prop, value, specificity))

    # Most specific values first, so they are used before the clause
    # budget (MAX_CLAUSES) is exhausted.
    filters = sorted(filters, key=lambda p: p[2], reverse=True)
    required = []
    for (prop, value, specificity) in filters:
        if prop.type in REQUIRED and len(required) <= MAX_CLAUSES:
            required.extend(_make_queries(prop, value, specificity))

    scoring = []
    for (prop, value, specificity) in filters:
        clauses = len(required) + len(scoring)
        if prop.type not in REQUIRED and clauses <= MAX_CLAUSES:
            scoring.extend(_make_queries(prop, value, specificity))

    if not len(required):
        # e.g. a document from which no features have been extracted.
        return none_query()

    # make it mandatory to have at least one match
    # NOTE: ``should`` takes a flat list of clauses; ``required`` is
    # already such a list and must not be wrapped in another list.
    query['bool']['must'].append({
        'bool': {
            'should': required,
            'minimum_should_match': 1
        }
    })
    query['bool']['should'].extend(scoring)
    return query
Exemplo n.º 4
0
def _query_item(entity, entitysets=True):
    """Yield every search candidate for ``entity`` together with its score.

    When ``entitysets`` is true, the result of
    ``EntitySet.entity_entitysets(entity.id)`` is attached to each yielded
    tuple; otherwise an empty list is attached. Each yielded item is
    ``(score, entity, collection_id, match, entityset_ids)``.

    Nothing is yielded if ``match_query`` produces the "none" query.
    """
    matcher = match_query(entity)
    if matcher == none_query():
        return

    log.debug("Candidate [%s]: %s", entity.schema.name, entity.caption)
    if entitysets:
        entityset_ids = EntitySet.entity_entitysets(entity.id)
    else:
        entityset_ids = []

    body = {"query": matcher, "size": 50, "_source": ENTITY_SOURCE}
    schemata = list(entity.schema.matchable_schemata)
    index = entities_read_index(schema=schemata)
    response = es.search(index=index, body=body)
    for hit in response.get("hits").get("hits"):
        data = unpack_result(hit)
        if data is None:
            continue
        match = model.get_proxy(data)
        score = compare(model, entity, match)
        log.debug("Match: %s <[%.2f]> %s", entity.caption, score, match.caption)
        yield score, entity, data.get("collection_id"), match, entityset_ids
Exemplo n.º 5
0
def xref_item(proxy):
    """Yield ``(score, collection_id, other)`` for candidates of ``proxy``.

    The match query for ``proxy`` is run through ``search_safe`` against
    ``entities_index()``. Every hit that unpacks successfully is converted
    to a proxy and compared with ``proxy``; no score threshold is applied.

    Nothing is yielded if ``match_query`` produces the "none" query.
    """
    matcher = match_query(proxy)
    if matcher == none_query():
        return

    source_fields = ['schema', 'properties', 'collection_id']
    body = {
        'query': matcher,
        'size': 100,
        '_source': {'includes': source_fields},
    }
    response = search_safe(index=entities_index(), body=body)
    hits = response.get('hits').get('hits')
    for hit in hits:
        data = unpack_result(hit)
        if data is None:
            continue
        other = model.get_proxy(data)
        score = compare(model, proxy, other)
        yield score, data.get('collection_id'), other
Exemplo n.º 6
0
def _query_item(collection, entity):
    """Yield scored cross-reference candidates for ``entity``.

    ``collection`` is accepted but not used by this function. Hits from
    the index of the entity's matchable schemata are compared with
    ``entity`` and yielded as ``(score, entity, collection_id, match)``
    when the score reaches ``SCORE_CUTOFF``.

    Nothing is yielded if ``match_query`` produces the "none" query.
    """
    matcher = match_query(entity)
    if matcher == none_query():
        return

    body = {
        'query': matcher,
        'size': 100,
        '_source': {'includes': INCLUDES},
    }
    schemata = list(entity.schema.matchable_schemata)
    response = es.search(index=entities_read_index(schema=schemata),
                         body=body)
    for hit in response.get('hits').get('hits'):
        data = unpack_result(hit)
        if data is None:
            continue
        match = model.get_proxy(data)
        score = compare(model, entity, match)
        if score >= SCORE_CUTOFF:
            # Debug logging of matches is intentionally disabled here:
            # log.debug('Match: %r <-[%.3f]-> %r',
            #           entity.caption, score, match.caption)
            yield score, entity, data.get('collection_id'), match
Exemplo n.º 7
0
Arquivo: xref.py Projeto: pudo/aleph
def xref_item(proxy, collection_ids=None):
    """Yield ``(score, collection_id, other)`` for matches of ``proxy``.

    ``collection_ids`` is forwarded to ``match_query``. The resulting
    query is run against the index of the proxy's matchable schemata, and
    only candidates whose comparison score reaches ``SCORE_CUTOFF`` are
    yielded.

    Nothing is yielded if ``match_query`` produces the "none" query.
    """
    matcher = match_query(proxy, collection_ids=collection_ids)
    if matcher == none_query():
        return

    source_fields = ['schema', 'properties', 'collection_id']
    body = {
        'query': matcher,
        'size': 100,
        '_source': {'includes': source_fields},
    }
    schemata = list(proxy.schema.matchable_schemata)
    index = entities_read_index(schema=schemata)
    response = es.search(index=index, body=body)
    hits = response.get('hits').get('hits')
    for hit in hits:
        data = unpack_result(hit)
        if data is None:
            continue
        other = model.get_proxy(data)
        score = compare(model, proxy, other)
        if score >= SCORE_CUTOFF:
            yield score, data.get('collection_id'), other
Exemplo n.º 8
0
def match_query(proxy, collection_ids=None, query=None):
    """Given a document or entity in indexed form, build a query that
    will find similar entities based on a variety of criteria.

    ``query`` is the base bool query to extend; a new ``bool_query()`` is
    created when it is ``None``. The proxy's own id is excluded through
    ``must_not`` and ``collection_ids`` (if any) is applied as a ``terms``
    filter on ``collection_id``.

    Each property value of the proxy is turned into clauses by
    ``_make_queries``. Clauses for property types in ``REQUIRED`` form the
    group of which at least one must match; all other clauses are added as
    optional ``should`` clauses.

    Returns ``none_query()`` if no required clause was generated.
    """
    if query is None:
        query = bool_query()

    # Don't match the query entity:
    if proxy.id is not None:
        sq = {"ids": {"values": [proxy.id]}}
        query['bool']['must_not'].append(sq)

    collection_ids = ensure_list(collection_ids)
    if len(collection_ids):
        query['bool']['filter'].append({
            'terms': {'collection_id': collection_ids}
        })

    required = []
    scoring = []
    for (prop, value) in proxy.itervalues():
        queries = list(_make_queries(prop, value))
        if prop.type in REQUIRED:
            required.extend(queries)
        else:
            scoring.extend(queries)

    if not len(required):
        # e.g. a document from which no features have been extracted.
        return none_query()

    # make it mandatory to have at least one match
    # NOTE: ``should`` takes a flat list of clauses; ``required`` is
    # already such a list and must not be wrapped in another list.
    query['bool']['must'].append({
        'bool': {
            'should': required,
            'minimum_should_match': 1
        }
    })
    query['bool']['should'].extend(scoring)
    return query
Exemplo n.º 9
0
def xref_item(proxy, collection_ids=None):
    """Search for entities resembling ``proxy`` and yield the good ones.

    The query from ``match_query`` (restricted by ``collection_ids`` when
    given) is executed on the index of the proxy's matchable schemata.
    For each hit that unpacks successfully, a
    ``(score, collection_id, other)`` tuple is yielded, provided the score
    is at least ``SCORE_CUTOFF``.

    Nothing is yielded if ``match_query`` produces the "none" query.
    """
    base = match_query(proxy, collection_ids=collection_ids)
    if base == none_query():
        return

    request = {
        'query': base,
        'size': 100,
        '_source': {
            'includes': ['schema', 'properties', 'collection_id'],
        },
    }
    target = entities_read_index(
        schema=list(proxy.schema.matchable_schemata)
    )
    found = es.search(index=target, body=request)
    for raw in found.get('hits').get('hits'):
        unpacked = unpack_result(raw)
        if unpacked is None:
            continue
        other = model.get_proxy(unpacked)
        score = compare(model, proxy, other)
        if score >= SCORE_CUTOFF:
            yield score, unpacked.get('collection_id'), other
Exemplo n.º 10
0
def match_query(proxy, collection_ids=None, query=None):
    """Build a query that finds entities similar to ``proxy``.

    ``query`` is the base bool query to extend (a new ``bool_query()`` is
    created when omitted). The proxy's own id is excluded, results are
    limited to the proxy's matchable schemata (unless its schema is
    ``Entity.THING``) and, if given, to ``collection_ids``.

    At least one of the generated clauses must match: a text match and a
    fuzzy fingerprint match for each name, plus a boosted term clause for
    each value of every strong, grouped registry type.

    Returns ``none_query()`` when the proxy's schema has no matchable
    schemata or when no clause could be generated.
    """
    if query is None:
        query = bool_query()

    # Don't match the query entity:
    if proxy.id is not None:
        query['bool']['must_not'].append({"ids": {"values": [proxy.id]}})

    # Attempt to find only matches within the "matchable" set of
    # entity schemata. For example, a Company can be matched to
    # another Company or a LegalEntity, but not a Person.
    # Real estate is "unmatchable", i.e. even if two plots of land
    # have almost the same name and criteria, it does not make
    # sense to suggest they are the same.
    if proxy.schema.name != Entity.THING:
        schema_names = [s.name for s in proxy.schema.matchable_schemata]
        if not schema_names:
            return none_query()
        query['bool']['must'].append({"terms": {"schema": schema_names}})

    collection_ids = ensure_list(collection_ids)
    if len(collection_ids) > 0:
        collection_clause = {'terms': {'collection_id': collection_ids}}
        query['bool']['must'].append(collection_clause)

    required = []
    for name in proxy.names:
        name_match = {
            'query': name,
            'operator': 'and',
            'minimum_should_match': '60%',
        }
        required.append({'match': {'names.text': name_match}})

        fp = fingerprints.generate(name)
        if fp is None:
            continue
        fingerprint_match = {
            'query': fp,
            'fuzziness': 1,
            'operator': 'and',
            'boost': 3.0,
        }
        required.append({'match': {'fingerprints': fingerprint_match}})

    for type_ in registry.types:
        # Only strong types that have an index group are used.
        if type_.strong and type_.group is not None:
            for value in proxy.get_type_values(type_):
                term = {'value': value, 'boost': 3.0}
                required.append({'term': {type_.group: term}})

    if not required:
        # e.g. a document from which no features have been extracted.
        return none_query()

    # make it mandatory to have at least one match
    at_least_one = {"bool": {"should": required, "minimum_should_match": 1}}
    query['bool']['must'].append(at_least_one)
    return query