Example No. 1
def apply_address(context: Context, entity: Entity, address: Entity):
    """Link the given entity to the given address."""
    if address is None:
        return
    entity.add("country", address.get("country"))
    if address.id is not None:
        entity.add("addressEntity", address)
        context.emit(address)
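A minimal usage sketch for the helper above, as it might appear inside a crawler function; `context.make`, `context.make_id` and `h.make_address` are assumptions about the surrounding crawler framework and are not defined in this example:

def crawl_person(context: Context, name: str, street: str, country: str):
    # Hedged sketch: the helpers used here (context.make, context.make_id,
    # h.make_address) are assumptions about the crawler framework.
    person = context.make("Person")
    person.id = context.make_id(name)
    person.add("name", name)

    address = h.make_address(context, full=street, country_code=country)
    apply_address(context, person, address)
    context.emit(person, target=True)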
Example No. 2
    def assemble(self, cached: CachedEntity, sources: Optional[Set[Dataset]] = None):
        """Build an entity proxy from a set of cached statements, considering
        only those statements that belong to the given sources."""
        entity = None
        for stmt in cached[0]:
            if sources is not None and stmt.dataset not in sources:
                continue
            if entity is None:
                data = {
                    "schema": stmt.schema,
                    "id": stmt.canonical_id,
                    "target": stmt.target,
                    "first_seen": stmt.first_seen,
                    "last_seen": stmt.last_seen,
                }
                entity = Entity(model, data)
            else:
                entity.add_schema(stmt.schema)
                entity.first_seen = min(entity.first_seen, stmt.first_seen)
                entity.last_seen = max(entity.last_seen, stmt.last_seen)
                entity.target = max(entity.target, stmt.target)
            entity.datasets.add(stmt.dataset.name)

        if entity is None:
            return None

        for prop in cached[1]:
            if sources is not None and prop.dataset not in sources:
                continue
            if prop.prop is None:
                continue
            entity.unsafe_add(prop.prop, prop.value, cleaned=True)

        entity.referents.update(self.resolver.get_referents(entity.id))
        return entity
Example No. 3
def reconcile_query(dataset: Dataset, query: Dict[str, Any]):
    """Reconcile operation for a single query."""
    # log.info("Reconcile: %r", query)
    limit = int(query.get("limit", 5))
    type = query.get("type", settings.BASE_SCHEMA)
    proxy = Entity(type)
    proxy.add("name", query.get("query"))
    proxy.add("notes", query.get("query"))
    for p in query.get("properties", []):
        prop = model.get_qname(p.get("pid"))
        if prop is None:
            continue
        try:
            proxy.add_cast(prop.schema, prop.name, p.get("v"), fuzzy=True)
        except InvalidData:
            log.exception("Invalid property is set.")

    results = []
    # log.info("QUERY %r %s", proxy.to_dict(), limit)
    for result, score in match_entities(dataset,
                                        proxy,
                                        limit=limit,
                                        fuzzy=True):
        results.append(get_freebase_entity(result, score))
    return {"result": results}
Example No. 4
def reconcile_suggest_entity(
        dataset: str = PATH_DATASET,
        prefix: str = QUERY_PREFIX,
        limit: int = Query(10, description="Number of suggestions to return"),
):
    """Suggest an entity based on a text query. This is functionally very
    similar to the basic search API, but returns data in the structure assumed
    by the community specification.

    Searches are conducted based on name and text content, using all matchable
    entities in the system index."""
    try:
        ds = get_dataset(dataset)
        query = Entity(settings.BASE_SCHEMA)
        query.add("name", prefix)
        query.add("notes", prefix)
        results = []
        for result, score in match_entities(ds, query, limit=limit,
                                            fuzzy=True):
            results.append(get_freebase_entity(result, score))
        return {
            "prefix": prefix,
            "result": results,
        }
    finally:
        db.session.close()
Example No. 5
def parse_sanctions(context: Context, entity: Entity, entry):
    """Create and emit a Sanction for each regulation listed on the given entry."""
    regulations = entry.findall("./regulation")
    # if len(regulations) == 0:
    #     context.log.warning(
    #         "No regulations on entity",
    #         entity=entity,
    #         regulations=len(regulations),
    #     )

    for regulation in regulations:
        url = regulation.findtext("./publicationUrl")
        assert url is not None, etree.tostring(regulation)
        sanction = h.make_sanction(context, entity, key=url)
        sanction.set("sourceUrl", url)
        sanction.add("program", regulation.get("programme"))
        sanction.add("reason", regulation.get("numberTitle"))
        sanction.add("startDate", regulation.get("entryIntoForceDate"))
        sanction.add("listingDate", regulation.get("publicationDate"))
        entity.add("createdAt", regulation.get("publicationDate"))
        sanction.add("unscId", entry.get("unitedNationId"))
        sanction.add("authorityId", entry.get("euReferenceNumber"))
        context.emit(sanction)
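A hedged illustration of the XML shape the parser above expects, reconstructed from its lookups; element and attribute names the code does not read (such as the root tag) are assumptions:

from lxml import etree

# Hedged illustration: one entry with a single <regulation> child, shaped the
# way parse_sanctions reads it. The root tag name is an assumption.
ENTRY_XML = """
<sanctionEntity unitedNationId="12345" euReferenceNumber="EU.1.23.45">
  <regulation programme="UKR" numberTitle="Regulation 2022/123"
              entryIntoForceDate="2022-02-25" publicationDate="2022-02-25">
    <publicationUrl>http://eur-lex.europa.eu/legal-content/example</publicationUrl>
  </regulation>
</sanctionEntity>
"""
entry = etree.fromstring(ENTRY_XML.strip())
# parse_sanctions(context, entity, entry) would emit one Sanction per regulation.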
Example No. 6
def simplify_dates(entity: Entity) -> Entity:
    """If an entity has multiple values for a date field, you may
    want to remove all those that are prefixes of others. For example,
    if a Person has both a birthDate of 1990 and of 1990-05-01, we'd
    want to drop the mention of 1990."""
    for prop in entity.iterprops():
        if prop.type == registry.date:
            dates = tuple(entity.pop(prop))
            values = remove_prefix_date_values(dates)
            if prop.name in PROV_MAX_DATES:
                entity.unsafe_add(prop, max(values), cleaned=True)
            elif prop.name in PROV_MIN_DATES:
                entity.unsafe_add(prop, min(values), cleaned=True)
            else:
                for value in values:
                    entity.unsafe_add(prop, value, cleaned=True)
    return entity
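A worked illustration of the prefix rule from the docstring; this is a sketch that assumes an Entity constructed the way the other examples do and that `remove_prefix_date_values` drops values that are prefixes of longer ones:

# Hedged sketch: "1990" is a prefix of "1990-05-01", so only the more
# specific value should survive simplify_dates.
person = Entity("Person")
person.id = "example-person"
person.add("birthDate", "1990")
person.add("birthDate", "1990-05-01")

person = simplify_dates(person)
assert person.get("birthDate") == ["1990-05-01"]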
Example No. 7
    def emit(self, entity: Entity, target: Optional[bool] = None, unique: bool = False):
        """Send an FtM entity to the store."""
        if entity.id is None:
            raise ValueError("Entity has no ID: %r" % entity)
        if target is not None:
            entity.target = target
        statements = Statement.from_entity(
            entity, self.dataset, self.resolver, unique=unique
        )
        if not len(statements):
            raise ValueError("Entity has no properties: %r" % entity)
        for stmt in statements:
            key = (stmt["entity_id"], stmt["prop"], stmt["value"])
            self._statements[key] = stmt
        if len(self._statements) >= db.batch_size:
            self.flush()
        self.log.debug("Emitted", entity=entity)
Example No. 8
    def emit(
        self,
        entity: Entity,
        target: Optional[bool] = None,
        external: bool = False,
    ):
        """Send an FtM entity to the store."""
        if entity.id is None:
            raise ValueError("Entity has no ID: %r" % entity)
        if target is not None:
            entity.target = target
        statements = statements_from_entity(
            entity, self.dataset, external=external
        )
        if not len(statements):
            raise ValueError("Entity has no properties: %r" % entity)
        self._statements.update({s["id"]: s for s in statements})
        self.log.debug("Emitted", entity=entity)
        if len(self._statements) >= (self.BATCH_SIZE * 10):
            self.flush()
Example No. 9
async def search(
    q: str,
    dataset: str = PATH_DATASET,
    schema: str = Query(settings.BASE_SCHEMA,
                        title="Types of entities that can match"),
    limit: int = Query(10, title="Number of results to return"),
    fuzzy: bool = Query(False,
                        title="Enable n-gram matching of partial names"),
    nested: bool = Query(False, title="Include adjacent entities in response"),
):
    """Search endpoint for matching entities based on a simple piece of text, e.g.
    a name. This can be used to implement a simple, user-facing search. For proper
    entity matching, the multi-property matching API should be used instead."""
    try:
        ds = get_dataset(dataset)
        query = Entity(schema)
        query.add("name", q)
        query.add("notes", q)
        results = query_results(ds, query, limit, fuzzy, nested)
        return {"results": results}
    finally:
        db.session.close()
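A hedged client-side sketch of calling this endpoint; the mount path `/search/{dataset}` and the host are assumptions, while the query parameters mirror the signature above:

import requests

# Hedged sketch: path and host are assumptions; parameters follow the signature.
resp = requests.get(
    "http://localhost:8000/search/default",
    params={"q": "John Doe", "schema": "Person", "limit": 10, "fuzzy": "true"},
)
for result in resp.json()["results"]:
    print(result)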
Example No. 10
    def make(self, schema: Union[str, Schema], target=False) -> Entity:
        """Make a new entity with some dataset context set."""
        return Entity(model, {"schema": schema, "target": target})
Example No. 11
    def assemble(self, cached: CachedEntity, sources: Optional[Set[Dataset]] = None):
        """Build an entity proxy from a set of cached statements, considering
        only those statements that belong to the given sources."""
        entity = None
        for stmt in cached[0]:
            if sources is not None and stmt.dataset not in sources:
                continue
            if entity is None:
                entity = Entity(stmt.schema)
                entity.id = stmt.canonical_id
                entity.first_seen = stmt.first_seen
                entity.last_seen = stmt.last_seen
                entity.target = stmt.target
            else:
                entity.add_schema(stmt.schema)
                entity.first_seen = min(entity.first_seen, stmt.first_seen)
                entity.last_seen = max(entity.last_seen, stmt.last_seen)
                entity.target = max(entity.target, stmt.target)
            entity.datasets.add(stmt.dataset)
            entity.referents.add(stmt.entity_id)

        if entity is None:
            return None

        for prop in cached[1]:
            if sources is not None and prop.dataset not in sources:
                continue
            entity.unsafe_add(prop.prop, prop.value, cleaned=True)
        return entity
Example No. 12
    def make(self, schema: Union[str, Schema], target=False) -> Entity:
        """Make a new entity with some dataset context set."""
        return Entity(schema, target=target)
Example No. 13
async def search(
    query: EntityMatchQuery,
    dataset: str = PATH_DATASET,
    limit: int = Query(5, title="Number of results to return"),
    fuzzy: bool = Query(False,
                        title="Enable n-gram matching of partial names"),
    nested: bool = Query(False, title="Include adjacent entities in response"),
):
    """Match entities based on a complex set of criteria, like name, date of birth
    and nationality of a person. This works by submitting a batch of entities, each
    formatted like those returned by the API.

    For example, the following would be valid query examples:

    ```json
    "queries": {
        "entity1": {
            "schema": "Person",
            "properties": {
                "name": ["John Doe"],
                "birthDate": ["1975-04-21"],
                "nationality": ["us"]
            }
        },
        "entity2": {
            "schema": "Company",
            "properties": {
                "name": ["Brilliant Amazing Limited"],
                "jurisdiction": ["hk"],
                "registrationNumber": ["84BA99810"]
            }
        }
    }
    ```
    The values for `entity1`, `entity2` can be chosen freely to correlate results
    on the client side when the request is returned. The responses will be given
    for each submitted example like this:

    ```json
    "responses": {
        "entity1": {
            "results": [...]
        },
        "entity2": {
            "results": [...]
        }
    }
    ```

    The precision of the results will depend on the amount of detail submitted
    with each example. The following properties are most helpful for particular types:

    * **Person**: ``name``, ``birthDate``, ``nationality``, ``idNumber``, ``address``
    * **Organization**: ``name``, ``country``, ``registrationNumber``, ``address``
    * **Company**: ``name``, ``jurisdiction``, ``registrationNumber``, ``address``,
      ``incorporationDate``
    """
    try:
        ds = get_dataset(dataset)
        responses = {}
        for name, example in query.get("queries").items():
            entity = Entity(example.get("schema"))
            for prop, value in example.get("properties").items():
                entity.add(prop, value, cleaned=False)
            results = query_results(ds, entity, limit, fuzzy, nested)
            responses[name] = {"query": entity.to_dict(), "results": results}
        return {"responses": responses}
    finally:
        db.session.close()
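A hedged client-side sketch of submitting such a batch; the mount path, host and response field names are assumptions, while the request body shape follows the docstring above:

import requests

# Hedged sketch: endpoint path, host and result field names are assumptions;
# the request body mirrors the docstring's "queries" structure.
body = {
    "queries": {
        "entity1": {
            "schema": "Person",
            "properties": {
                "name": ["John Doe"],
                "birthDate": ["1975-04-21"],
                "nationality": ["us"],
            },
        },
    }
}
resp = requests.post("http://localhost:8000/match/default", json=body)
for name, response in resp.json()["responses"].items():
    print(name, len(response["results"]))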