def apply_address(context: Context, entity: Entity, address: Entity):
    """Link the given entity to the given address.

    Copies the address country onto the entity; if the address has an ID,
    it is also linked via ``addressEntity`` and emitted to the context.
    """
    if address is None:
        return
    entity.add("country", address.get("country"))
    if address.id is None:
        return
    entity.add("addressEntity", address)
    context.emit(address)
def assemble(
    self,
    cached: CachedEntity,
    sources: Optional[Set[Dataset]] = None,
):
    """Build an entity proxy from a set of cached statements, considering
    only those statements that belong to the given sources.

    Bug fix: the parameter was declared ``sources=Optional[Set[Dataset]]``,
    which made the *typing construct itself* the default value (a missing
    ``:`` turned the annotation into an assignment). Calling without the
    argument then made ``stmt.dataset not in sources`` raise a TypeError.
    The default is now properly ``None`` (meaning "all sources").
    """
    entity = None
    # cached[0]: the schema/provenance statements for the entity.
    for stmt in cached[0]:
        if sources is not None and stmt.dataset not in sources:
            continue
        if entity is None:
            # First matching statement seeds the proxy.
            data = {
                "schema": stmt.schema,
                "id": stmt.canonical_id,
                "target": stmt.target,
                "first_seen": stmt.first_seen,
                "last_seen": stmt.last_seen,
            }
            entity = Entity(model, data)
        else:
            # Subsequent statements widen the schema and seen/target range.
            entity.add_schema(stmt.schema)
            entity.first_seen = min(entity.first_seen, stmt.first_seen)
            entity.last_seen = max(entity.last_seen, stmt.last_seen)
            entity.target = max(entity.target, stmt.target)
        entity.datasets.add(stmt.dataset.name)
    if entity is None:
        return None
    # cached[1]: the property statements.
    for prop in cached[1]:
        if sources is not None and prop.dataset not in sources:
            continue
        if prop.prop is None:
            continue
        entity.unsafe_add(prop.prop, prop.value, cleaned=True)
    entity.referents.update(self.resolver.get_referents(entity.id))
    return entity
def reconcile_query(dataset: Dataset, query: Dict[str, Any]):
    """Reconcile operation for a single query.

    Builds a query proxy entity from the free-text query and any supplied
    property values, then matches it against the dataset index.
    """
    limit = int(query.get("limit", 5))
    # Renamed from ``type`` to avoid shadowing the builtin.
    schema = query.get("type", settings.BASE_SCHEMA)
    proxy = Entity(schema)
    proxy.add("name", query.get("query"))
    proxy.add("notes", query.get("query"))
    for p in query.get("properties", []):
        prop = model.get_qname(p.get("pid"))
        if prop is None:
            continue
        try:
            proxy.add_cast(prop.schema, prop.name, p.get("v"), fuzzy=True)
        except InvalidData:
            log.exception("Invalid property is set.")
    results = []
    for result, score in match_entities(dataset, proxy, limit=limit, fuzzy=True):
        results.append(get_freebase_entity(result, score))
    return {"result": results}
def reconcile_suggest_entity(
    dataset: str = PATH_DATASET,
    prefix: str = QUERY_PREFIX,
    limit: int = Query(10, description="Number of suggestions to return"),
):
    """Suggest an entity based on a text query.

    This is functionally very similar to the basic search API, but returns
    data in the structure assumed by the community specification. Searches
    are conducted based on name and text content, using all matchable
    entities in the system index.
    """
    try:
        ds = get_dataset(dataset)
        query = Entity(settings.BASE_SCHEMA)
        query.add("name", prefix)
        query.add("notes", prefix)
        matches = match_entities(ds, query, limit=limit, fuzzy=True)
        results = [get_freebase_entity(result, score) for result, score in matches]
        return {
            "prefix": prefix,
            "result": results,
        }
    finally:
        db.session.close()
def parse_sanctions(context: Context, entity: Entity, entry):
    """Create and emit a Sanction entity for each regulation on *entry*.

    Each regulation's publication URL serves as the sanction key; the
    publication date is also recorded on the sanctioned entity itself.
    """
    for regulation in entry.findall("./regulation"):
        url = regulation.findtext("./publicationUrl")
        assert url is not None, etree.tostring(regulation)
        sanction = h.make_sanction(context, entity, key=url)
        sanction.set("sourceUrl", url)
        sanction.add("program", regulation.get("programme"))
        sanction.add("reason", regulation.get("numberTitle"))
        sanction.add("startDate", regulation.get("entryIntoForceDate"))
        sanction.add("listingDate", regulation.get("publicationDate"))
        entity.add("createdAt", regulation.get("publicationDate"))
        sanction.add("unscId", entry.get("unitedNationId"))
        sanction.add("authorityId", entry.get("euReferenceNumber"))
        context.emit(sanction)
def simplify_dates(entity: Entity) -> Entity:
    """Collapse redundant date values on an entity.

    If an entity has multiple values for a date property where one is a
    prefix of another (e.g. a birthDate of both "1990" and "1990-05-01"),
    the shorter prefix is dropped. Properties listed in PROV_MAX_DATES or
    PROV_MIN_DATES are further reduced to their single max/min value.
    """
    for prop in entity.iterprops():
        if prop.type != registry.date:
            continue
        values = remove_prefix_date_values(tuple(entity.pop(prop)))
        if prop.name in PROV_MAX_DATES:
            entity.unsafe_add(prop, max(values), cleaned=True)
        elif prop.name in PROV_MIN_DATES:
            entity.unsafe_add(prop, min(values), cleaned=True)
        else:
            for value in values:
                entity.unsafe_add(prop, value, cleaned=True)
    return entity
def emit(self, entity: Entity, target: Optional[bool] = None, unique: bool = False):
    """Send an FtM entity to the store.

    Statements derived from the entity are buffered, keyed by
    (entity_id, prop, value), and flushed to the database in batches.

    Raises:
        ValueError: if the entity has no ID, or produces no statements.
    """
    if entity.id is None:
        # Bug fix: ValueError does not %-format its arguments the way
        # logging calls do — interpolate the message explicitly.
        raise ValueError("Entity has no ID: %r" % entity)
    if target is not None:
        entity.target = target
    statements = Statement.from_entity(
        entity, self.dataset, self.resolver, unique=unique
    )
    if not len(statements):
        raise ValueError("Entity has no properties: %r" % entity)
    for stmt in statements:
        key = (stmt["entity_id"], stmt["prop"], stmt["value"])
        self._statements[key] = stmt
    if len(self._statements) >= db.batch_size:
        self.flush()
    self.log.debug("Emitted", entity=entity)
def emit(
    self,
    entity: Entity,
    target: Optional[bool] = None,
    external: bool = False,
):
    """Send an FtM entity to the store.

    Statements derived from the entity are buffered by statement ID and
    flushed once the buffer exceeds ten batches' worth of rows.

    Raises:
        ValueError: if the entity has no ID, or produces no statements.
    """
    if entity.id is None:
        # Bug fix: ValueError does not %-format its arguments the way
        # logging calls do — interpolate the message explicitly.
        raise ValueError("Entity has no ID: %r" % entity)
    if target is not None:
        entity.target = target
    statements = statements_from_entity(entity, self.dataset, external=external)
    if not len(statements):
        raise ValueError("Entity has no properties: %r" % entity)
    self._statements.update({s["id"]: s for s in statements})
    self.log.debug("Emitted", entity=entity)
    if len(self._statements) >= (self.BATCH_SIZE * 10):
        self.flush()
async def search(
    q: str,
    dataset: str = PATH_DATASET,
    schema: str = Query(settings.BASE_SCHEMA, title="Types of entities that can match"),
    limit: int = Query(10, title="Number of results to return"),
    fuzzy: bool = Query(False, title="Enable n-gram matching of partial names"),
    nested: bool = Query(False, title="Include adjacent entities in response"),
):
    """Search endpoint for matching entities based on a simple piece of text,
    e.g. a name. This can be used to implement a simple, user-facing search.
    For proper entity matching, the multi-property matching API should be
    used instead."""
    try:
        ds = get_dataset(dataset)
        example = Entity(schema)
        example.add("name", q)
        example.add("notes", q)
        return {"results": query_results(ds, example, limit, fuzzy, nested)}
    finally:
        db.session.close()
def make(self, schema: Union[str, Schema], target=False) -> Entity:
    """Make a new entity with some dataset context set."""
    data = {"schema": schema, "target": target}
    return Entity(model, data)
def assemble(
    self,
    cached: CachedEntity,
    sources: Optional[Set[Dataset]] = None,
):
    """Build an entity proxy from a set of cached statements, considering
    only those statements that belong to the given sources.

    Bug fix: the parameter was declared ``sources=Optional[Set[Dataset]]``,
    assigning the typing construct itself as the default value; omitting
    the argument then made ``stmt.dataset not in sources`` raise a
    TypeError. The default is now properly ``None`` (meaning "all
    sources").
    """
    entity = None
    # cached[0]: the schema/provenance statements for the entity.
    for stmt in cached[0]:
        if sources is not None and stmt.dataset not in sources:
            continue
        if entity is None:
            # First matching statement seeds the proxy.
            entity = Entity(stmt.schema)
            entity.id = stmt.canonical_id
            entity.first_seen = stmt.first_seen
            entity.last_seen = stmt.last_seen
            entity.target = stmt.target
        else:
            # Subsequent statements widen the schema and seen/target range.
            entity.add_schema(stmt.schema)
            entity.first_seen = min(entity.first_seen, stmt.first_seen)
            entity.last_seen = max(entity.last_seen, stmt.last_seen)
            entity.target = max(entity.target, stmt.target)
        entity.datasets.add(stmt.dataset)
        entity.referents.add(stmt.entity_id)
    if entity is None:
        return None
    # cached[1]: the property statements.
    for prop in cached[1]:
        if sources is not None and prop.dataset not in sources:
            continue
        # Skip statements whose property could not be resolved, matching
        # the sibling assemble() implementation.
        if prop.prop is None:
            continue
        entity.unsafe_add(prop.prop, prop.value, cleaned=True)
    return entity
def make(self, schema: Union[str, Schema], target=False) -> Entity:
    """Make a new entity with some dataset context set."""
    entity = Entity(schema, target=target)
    return entity
async def search(
    query: EntityMatchQuery,
    dataset: str = PATH_DATASET,
    limit: int = Query(5, title="Number of results to return"),
    fuzzy: bool = Query(False, title="Enable n-gram matching of partial names"),
    nested: bool = Query(False, title="Include adjacent entities in response"),
):
    """Match entities based on a complex set of criteria, like name, date of
    birth and nationality of a person. This works by submitting a batch of
    entities, each formatted like those returned by the API.

    For example, the following would be valid query examples:

    ```json
    "queries": {
        "entity1": {
            "schema": "Person",
            "properties": {
                "name": ["John Doe"],
                "birthDate": ["1975-04-21"],
                "nationality": ["us"]
            }
        },
        "entity2": {
            "schema": "Company",
            "properties": {
                "name": ["Brilliant Amazing Limited"],
                "jurisdiction": ["hk"],
                "registrationNumber": ["84BA99810"]
            }
        }
    }
    ```

    The values for `entity1`, `entity2` can be chosen freely to correlate
    results on the client side when the request is returned. The responses
    will be given for each submitted example like this:

    ```json
    "responses": {
        "entity1": {
            "results": [...]
        },
        "entity2": {
            "results": [...]
        }
    }
    ```

    The precision of the results will be dependent on the amount of detail
    submitted with each example. The following properties are most helpful
    for particular types:

    * **Person**: ``name``, ``birthDate``, ``nationality``, ``idNumber``,
      ``address``
    * **Organization**: ``name``, ``country``, ``registrationNumber``,
      ``address``
    * **Company**: ``name``, ``jurisdiction``, ``registrationNumber``,
      ``address``, ``incorporationDate``
    """
    try:
        ds = get_dataset(dataset)
        responses = {}
        for name, example in query.get("queries").items():
            # Build a query proxy from the submitted example.
            candidate = Entity(example.get("schema"))
            for prop_name, values in example.get("properties").items():
                candidate.add(prop_name, values, cleaned=False)
            responses[name] = {
                "query": candidate.to_dict(),
                "results": query_results(ds, candidate, limit, fuzzy, nested),
            }
        return {"responses": responses}
    finally:
        db.session.close()