Пример #1
0
 def test_compare_countries(self):
     """Self-comparison must outscore a match that lacks the nationality."""
     with_country = {
         "schema": "Person",
         "properties": {"name": ["Frank Banana"], "nationality": ["ie"]},
     }
     # Same name, but no nationality recorded; passed to compare() as a proxy.
     without_country = model.get_proxy(
         {"schema": "Person", "properties": {"name": ["Frank Banana"]}}
     )
     baseline = compare(model, with_country, without_country)
     identical = compare(model, with_country, with_country)
     self.assertGreater(identical, baseline)
Пример #2
0
    def test_compare_quality(self):
        """Check that dropping or altering properties lowers the score.

        ENTITY compared with itself provides the reference score; each
        modified copy must score strictly below it.
        """
        best_score = compare(model, ENTITY, ENTITY)

        # Copy with the birth date and id number removed.
        reduced = deepcopy(ENTITY)
        reduced['properties'].pop('birthDate')
        reduced['properties'].pop('idNumber')
        self.assertLess(compare(model, ENTITY, reduced), best_score)

        # Copy with a different name. The value is wrapped in a list so the
        # property keeps the list-of-values shape instead of a bare string.
        reduced = deepcopy(ENTITY)
        reduced['properties']['name'] = ['Frank Banana']
        self.assertLess(compare(model, ENTITY, reduced), best_score)
Пример #3
0
    def test_compare_basic(self):
        """Check self-comparison, a property-less entity and a reduced copy."""
        self_score = compare(ENTITY, ENTITY)
        assert self_score > 0.5, self_score

        # An entity with only an id and a schema scores zero, even against
        # itself.
        bare = {'id': 'bla', 'schema': 'RealEstate'}
        assert compare(ENTITY, bare) == 0
        assert compare(bare, bare) == 0

        # Dropping the birth date must lower the score.
        trimmed = deepcopy(ENTITY)
        del trimmed['properties']['birthDate']
        assert compare(ENTITY, trimmed) < self_score
Пример #4
0
    def test_compare_quality(self):
        """Modified copies of ENTITY must score below ENTITY against itself."""
        reference = compare(model, ENTITY, ENTITY)

        # Variant 1: strip the birth date and the id number.
        stripped = deepcopy(ENTITY)
        for prop in ("birthDate", "idNumber"):
            stripped["properties"].pop(prop)
        self.assertLess(compare(model, ENTITY, stripped), reference)

        # Variant 2: replace the name.
        renamed = deepcopy(ENTITY)
        renamed["properties"]["name"] = ["Frank Banana"]
        self.assertLess(compare(model, ENTITY, renamed), reference)
Пример #5
0
    def test_compare_basic(self):
        """Check the self-comparison score and comparisons with empty entities.

        ENTITY compared with itself must score above 0.5; comparisons that
        involve an entity carrying only a schema must score exactly zero.
        """
        best_score = compare(model, ENTITY, ENTITY)
        # TestCase assertion instead of a bare ``assert``: it is not stripped
        # under ``python -O`` and reports both operands on failure.
        self.assertGreater(best_score, 0.5)

        comp = {'schema': 'RealEstate'}
        self.assertEqual(compare(model, ENTITY, comp), 0)
        self.assertEqual(compare(model, comp, comp), 0)

        comp = {'schema': 'Person'}
        self.assertEqual(compare(model, ENTITY, comp), 0)

        comp = {'schema': 'LegalEntity'}
        self.assertEqual(compare(model, ENTITY, comp), 0)
Пример #6
0
    def test_compare_basic(self):
        """Check the self-comparison score and comparisons with empty entities.

        ENTITY compared with itself must score above 0.5; comparisons that
        involve an entity carrying only a schema must score (almost) zero.
        """
        best_score = compare(model, ENTITY, ENTITY)
        # TestCase assertion instead of a bare ``assert``: it is not stripped
        # under ``python -O`` and reports both operands on failure.
        self.assertGreater(best_score, 0.5)

        comp = {"schema": "RealEstate"}
        self.assertAlmostEqual(compare(model, ENTITY, comp), 0)
        self.assertAlmostEqual(compare(model, comp, comp), 0)

        comp = {"schema": "Person"}
        self.assertAlmostEqual(compare(model, ENTITY, comp), 0)

        comp = {"schema": "LegalEntity"}
        self.assertAlmostEqual(compare(model, ENTITY, comp), 0)
Пример #7
0
 def test_compare_countries(self):
     """Self-comparison must outscore a match that lacks the nationality."""
     with_nationality = {
         'schema': 'Person',
         'properties': {'name': ['Frank Banana'], 'nationality': ['ie']},
     }
     # Same name without a nationality, wrapped in a proxy before comparing.
     no_nationality = model.get_proxy({
         'schema': 'Person',
         'properties': {'name': ['Frank Banana']},
     })
     baseline = compare(model, with_nationality, no_nationality)
     self_score = compare(model, with_nationality, with_nationality)
     self.assertGreater(self_score, baseline)
Пример #8
0
    def test_compare_quality(self):
        """Proxies of modified copies must score below the full entity."""
        full = model.get_proxy(ENTITY)
        top_score = compare(model, full, full)

        # Variant 1: strip the birth date and the id number.
        data = deepcopy(ENTITY)
        for prop in ("birthDate", "idNumber"):
            data["properties"].pop(prop)
        stripped = model.get_proxy(data)
        self.assertLess(compare(model, full, stripped), top_score)

        # Variant 2: replace the name.
        data = deepcopy(ENTITY)
        data["properties"]["name"] = ["Frank Banana"]
        renamed = model.get_proxy(data)
        self.assertLess(compare(model, full, renamed), top_score)
Пример #9
0
    def test_compare_basic(self):
        """Check the self-comparison score and comparisons with empty proxies.

        The ENTITY proxy compared with itself must score above 0.5;
        comparisons that involve a proxy built from a schema-only dict must
        score (almost) zero.
        """
        entity = model.get_proxy(ENTITY)
        best_score = compare(model, entity, entity)
        # TestCase assertion instead of a bare ``assert``: it is not stripped
        # under ``python -O`` and reports both operands on failure.
        self.assertGreater(best_score, 0.5)

        comp = model.get_proxy({"schema": "RealEstate"})
        self.assertAlmostEqual(compare(model, entity, comp), 0)
        self.assertAlmostEqual(compare(model, comp, comp), 0)

        comp = model.get_proxy({"schema": "Person"})
        self.assertAlmostEqual(compare(model, entity, comp), 0)

        comp = model.get_proxy({"schema": "LegalEntity"})
        self.assertAlmostEqual(compare(model, entity, comp), 0)
Пример #10
0
def _query_item(entity):
    """Yield scored match candidates for an entity from the search index.

    This is a generator. It yields nothing when ``match_query`` produces the
    ``none_query()``. Otherwise it searches the read index for the entity's
    matchable schemata (at most 100 hits) and yields
    ``(score, entity, collection_id, match)`` for every hit whose score is
    at least ``SCORE_CUTOFF``.
    """
    matcher = match_query(entity)
    if matcher == none_query():
        return

    body = {
        "query": matcher,
        "size": 100,
        "_source": {"includes": PROXY_INCLUDES},
    }
    schemata = list(entity.schema.matchable_schemata)
    index = entities_read_index(schema=schemata)
    response = es.search(index=index, body=body)
    for hit in response.get("hits").get("hits"):
        data = unpack_result(hit)
        if data is None:
            # Hit could not be unpacked; skip it.
            continue
        match = model.get_proxy(data)
        score = compare(model, entity, match)
        if score >= SCORE_CUTOFF:
            log.debug(
                "Match: %s <[%.2f]> %s", entity.caption, score, match.caption
            )
            yield score, entity, data.get("collection_id"), match
Пример #11
0
def similar(profile_id):
    """
    ---
    get:
      summary: Get similar entities
      description: >
        Get a list of similar entities to the profile with id `profile_id`
      parameters:
      - in: path
        name: profile_id
        required: true
        schema:
          type: string
      - in: query
        name: 'filter:schema'
        schema:
          items:
            type: string
          type: array
      - in: query
        name: 'filter:schemata'
        schema:
          items:
            type: string
          type: array
      responses:
        '200':
          description: Returns a list of entities
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/EntitiesResponse'
      tags:
      - Profile
    """
    # NOTE: the docstring above is an API spec fragment (presumably read at
    # runtime - confirm), so it is kept verbatim.
    # enable_cache()
    profile = obj_or_404(get_profile(profile_id, authz=request.authz))
    collection_id = profile.get("collection_id")
    require(request.authz.can(collection_id, request.authz.READ))
    tag_request(collection_id=collection_id)

    # Entities already in the profile are excluded from the match query.
    member_ids = [item["entity_id"] for item in profile["items"]]
    merged = profile["merged"]
    result = MatchQuery.handle(request, entity=merged, exclude=member_ids)

    # Replace the raw hits with scored items; none of them carries a
    # judgement yet.
    candidates = list(result.results)
    result.results = [
        {
            "score": compare(model, merged, candidate),
            "judgement": Judgement.NO_JUDGEMENT,
            "collection_id": collection_id,
            "entity": candidate,
        }
        for candidate in candidates
    ]
    return SimilarSerializer.jsonify_result(result)
Пример #12
0
def similar(entity_id):
    """
    ---
    get:
      summary: Get similar entities
      description: >
        Get a list of similar entities to the entity with id `entity_id`
      parameters:
      - in: path
        name: entity_id
        required: true
        schema:
          type: string
      - in: query
        name: 'filter:schema'
        schema:
          items:
            type: string
          type: array
      - in: query
        name: 'filter:schemata'
        schema:
          items:
            type: string
          type: array
      responses:
        '200':
          description: Returns a list of scored and judged entities
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/SimilarResponse'
      tags:
      - Entity
    """
    # NOTE: the docstring above is an API spec fragment (presumably read at
    # runtime - confirm), so it is kept verbatim.
    # enable_cache()
    entity = get_index_entity(entity_id, request.authz.READ)
    collection_id = entity.get("collection_id")
    tag_request(collection_id=collection_id)

    proxy = model.get_proxy(entity)
    result = MatchQuery.handle(request, entity=proxy)
    candidates = list(result.results)

    # Look up existing judgements for every (entity, candidate) id pair.
    id_pairs = [(entity_id, candidate.get("id")) for candidate in candidates]
    judgements = pairwise_judgements(id_pairs, collection_id)

    # Replace the raw hits with scored and judged items.
    scored = []
    for candidate in candidates:
        scored.append(
            {
                "score": compare(model, proxy, candidate),
                "judgement": judgements.get((entity_id, candidate.get("id"))),
                "collection_id": collection_id,
                "entity": candidate,
            }
        )
    result.results = scored
    return SimilarSerializer.jsonify_result(result)
Пример #13
0
def xref_item(proxy):
    """Yield scored match candidates for a proxy from the entities index.

    This is a generator. It yields nothing when ``match_query`` produces the
    ``none_query()``. Otherwise it yields ``(score, collection_id, other)``
    for every hit (at most 100) that can be unpacked.
    """
    matcher = match_query(proxy)
    if matcher == none_query():
        return

    source = {'includes': ['schema', 'properties', 'collection_id']}
    body = {'query': matcher, 'size': 100, '_source': source}
    response = search_safe(index=entities_index(), body=body)
    hits = response.get('hits').get('hits')
    for hit in hits:
        data = unpack_result(hit)
        if data is None:
            continue
        other = model.get_proxy(data)
        score = compare(model, proxy, other)
        yield score, data.get('collection_id'), other
Пример #14
0
def _query_item(entity, entitysets=True):
    """Yield scored match candidates for an entity from the search index.

    This is a generator. It yields nothing when ``match_query`` produces the
    ``none_query()``. Otherwise it yields
    ``(score, entity, collection_id, match, entityset_ids)`` for every hit
    (at most 50) that can be unpacked; no score cutoff is applied here.

    ``entityset_ids`` comes from ``EntitySet.entity_entitysets`` when
    ``entitysets`` is truthy and is an empty list otherwise.
    """
    matcher = match_query(entity)
    if matcher == none_query():
        return

    log.debug("Candidate [%s]: %s", entity.schema.name, entity.caption)
    if entitysets:
        entityset_ids = EntitySet.entity_entitysets(entity.id)
    else:
        entityset_ids = []

    body = {"query": matcher, "size": 50, "_source": ENTITY_SOURCE}
    schemata = list(entity.schema.matchable_schemata)
    response = es.search(index=entities_read_index(schema=schemata), body=body)
    for hit in response.get("hits").get("hits"):
        data = unpack_result(hit)
        if data is None:
            continue
        match = model.get_proxy(data)
        score = compare(model, entity, match)
        log.debug("Match: %s <[%.2f]> %s", entity.caption, score, match.caption)
        yield score, entity, data.get("collection_id"), match, entityset_ids
Пример #15
0
def _query_item(collection, entity):
    """Yield scored match candidates for an entity from the search index.

    This is a generator. It yields nothing when ``match_query`` produces the
    ``none_query()``. Otherwise it yields
    ``(score, entity, collection_id, match)`` for every hit (at most 100)
    whose score is at least ``SCORE_CUTOFF``.

    ``collection`` is accepted but not used in this function body.
    """
    matcher = match_query(entity)
    if matcher == none_query():
        return

    body = {
        'query': matcher,
        'size': 100,
        '_source': {'includes': INCLUDES},
    }
    schemata = list(entity.schema.matchable_schemata)
    response = es.search(index=entities_read_index(schema=schemata), body=body)
    for hit in response.get('hits').get('hits'):
        data = unpack_result(hit)
        if data is None:
            continue
        match = model.get_proxy(data)
        score = compare(model, entity, match)
        if score >= SCORE_CUTOFF:
            yield score, entity, data.get('collection_id'), match
Пример #16
0
Файл: xref.py Проект: pudo/aleph
def xref_item(proxy, collection_ids=None):
    """Yield scored match candidates for a proxy from the search index.

    This is a generator. ``collection_ids`` is passed through to
    ``match_query``. It yields nothing when that produces the
    ``none_query()``. Otherwise it yields ``(score, collection_id, other)``
    for every hit (at most 100) whose score is at least ``SCORE_CUTOFF``.
    """
    matcher = match_query(proxy, collection_ids=collection_ids)
    if matcher == none_query():
        return

    source = {'includes': ['schema', 'properties', 'collection_id']}
    body = {'query': matcher, 'size': 100, '_source': source}
    schemata = list(proxy.schema.matchable_schemata)
    response = es.search(index=entities_read_index(schema=schemata), body=body)
    hits = response.get('hits').get('hits')
    for hit in hits:
        data = unpack_result(hit)
        if data is None:
            continue
        other = model.get_proxy(data)
        score = compare(model, proxy, other)
        if score >= SCORE_CUTOFF:
            yield score, data.get('collection_id'), other
Пример #17
0
def xref_item(proxy, collection_ids=None):
    """Search for entities similar to ``proxy`` and yield the scored ones.

    This is a generator of ``(score, collection_id, other)`` tuples. Only
    candidates scoring at least ``SCORE_CUTOFF`` are yielded, and nothing is
    yielded when the match query equals ``none_query()``.
    ``collection_ids`` is forwarded to ``match_query``.
    """
    candidate_query = match_query(proxy, collection_ids=collection_ids)
    if candidate_query == none_query():
        return

    search_body = {
        'query': candidate_query,
        'size': 100,
        '_source': {
            'includes': ['schema', 'properties', 'collection_id'],
        },
    }
    matchable = list(proxy.schema.matchable_schemata)
    index = entities_read_index(schema=matchable)
    response = es.search(index=index, body=search_body)
    for raw_hit in response.get('hits').get('hits'):
        unpacked = unpack_result(raw_hit)
        if unpacked is None:
            continue
        other = model.get_proxy(unpacked)
        score = compare(model, proxy, other)
        if score >= SCORE_CUTOFF:
            yield score, unpacked.get('collection_id'), other
Пример #18
0
    def dedupe(cls, session, threshold=0.5):
        """Score every pair of matchable entities and store likely matches.

        Loads the proxies of all entities returned by ``cls.all(session)``
        whose schema is matchable, compares each pair once, and saves a
        ``Match`` in both directions for every pair scoring above
        ``threshold``.

        ``session.commit()`` is called after every 10000 comparisons; any
        remaining work is not committed here.
        """
        entities = [
            proxy
            for proxy in (entity.proxy for entity in cls.all(session))
            if proxy.schema.matchable
        ]
        log.info("Loaded %s matchable entities", len(entities))
        compares = 0
        # combinations() yields each unordered pair exactly once, in list
        # order. Skipping pairs with ``a.id >= b.id`` would therefore drop
        # every pair whose first element has the larger id unless the list
        # happened to be sorted by id; only identical ids are skipped.
        for a, b in combinations(entities, 2):
            if a.id == b.id:
                continue
            compares += 1
            if compares % 10000 == 0:
                log.info("Comparisons: %s", compares)
                session.commit()

            score = compare(model, a, b)
            if score > threshold:
                log.info("Potential match [%s]: %s ./. %s", score, a, b)
                # TODO: priority
                Match.save(session, a, b, score=score)
                Match.save(session, b, a, score=score)
Пример #19
0
def reconcile_op(query):
    """Reconcile a single query and return its scored matches.

    Builds a proxy from the query's ``query`` (name), ``type`` (schema,
    falling back to ``Entity.THING``) and ``properties``, runs a
    ``MatchQuery`` for it, and returns ``{'result': [...], 'num': <count>}``
    where each result carries the id, caption, score (0-100, rounded up),
    URI and, if one matches the schema name, a freebase type.
    """
    parser_args = {'limit': query.get('limit', '5'), 'strict': 'false'}
    parser = SearchQueryParser(parser_args, request.authz)

    name = query.get('query', '')
    proxy = model.make_entity(query.get('type') or Entity.THING)
    proxy.add('name', name)
    for prop in query.get('properties', []):
        proxy.add(prop.get('pid'), prop.get('v'), quiet=True)

    matches = []
    hits = MatchQuery(parser, entity=proxy).search().get('hits').get('hits')
    for doc in hits:
        data = unpack_result(doc)
        if data is None:
            continue
        entity = model.get_proxy(data)
        # Scale the comparison score to a 0-100 range, rounding up.
        score = math.ceil(compare(model, proxy, entity) * 100)
        match = {
            'id': entity.id,
            'name': entity.caption,
            'score': score,
            'uri': entity_url(entity.id),
            'match': False,
        }
        # Attach the freebase type whose id equals the schema name; if
        # several match, the last one wins.
        for fb_type in get_freebase_types():
            if entity.schema.name == fb_type['id']:
                match['type'] = [fb_type]
        matches.append(match)

    log.info("Reconciled: %r -> %d matches", name, len(matches))
    return {'result': matches, 'num': len(matches)}
Пример #20
0
 def score(self):
     """Return the cached score, computing it on first use when possible.

     The score is computed with ``compare`` only when no score is cached
     and both ``entity`` and ``canonical`` are truthy. Otherwise the cached
     value (possibly ``None``) is returned.
     """
     if self._score is None and self.entity and self.canonical:
         self._score = compare(self.model, self.entity, self.canonical)
     return self._score
Пример #21
0
 def score(self):
     """Return the comparison score of ``subject`` against ``candidate``.

     Returns 0.0 when either side is ``None`` and 1.0 when both share the
     same id; otherwise the result of ``compare`` is returned.
     """
     subject, candidate = self.subject, self.candidate
     if subject is None or candidate is None:
         return 0.0
     if subject.id == candidate.id:
         return 1.0
     return compare(model, subject, candidate)
Пример #22
0
def benchmark():
    """Compare a freshly created proxy with itself 10,000 times."""
    subject = create_proxy()
    for _ in range(10_000):
        compare.compare(model, subject, subject)