def bulk_write(collection, items):
    """Write a set of entities - given as raw dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    # Buffer proxies keyed by ID so duplicates within a page are merged.
    buffered = {}
    for raw in items:
        if not is_mapping(raw):
            raise InvalidData("Failed to read input data")
        proxy = model.get_proxy(raw)
        if proxy.id is None:
            raise InvalidData("No ID for entity")
        existing = buffered.get(proxy.id)
        if existing is not None:
            existing.merge(proxy)
        else:
            buffered[proxy.id] = proxy
        # Flush a full page to the index and start a fresh buffer.
        if len(buffered) >= BULK_PAGE:
            index.index_bulk(collection.id, buffered)
            buffered = {}
    # Flush whatever remains after the final page.
    if buffered:
        index.index_bulk(collection.id, buffered)
def bulk_write(collection, entities, unsafe=False, role_id=None, index=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    timestamp = datetime.utcnow().isoformat()
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    seen_ids = set()
    for record in entities:
        if not is_mapping(record):
            raise InvalidData("Failed to read input data", errors=record)
        proxy = model.get_proxy(record)
        if proxy.id is None:
            raise InvalidData("No ID for entity", errors=proxy.to_dict())
        proxy = collection.ns.apply(proxy)
        if not unsafe:
            # Strip file content hashes unless the caller is trusted.
            proxy = remove_checksums(proxy)
        proxy.context = {
            'role_id': role_id,
            'created_at': timestamp,
            'updated_at': timestamp,
        }
        writer.put(proxy, origin='bulk')
        # Collect IDs for selective indexing; once MAX_PAGE is reached we
        # stop collecting and re-index everything below.
        if index and len(seen_ids) < MAX_PAGE:
            seen_ids.add(proxy.id)
    writer.flush()
    if index:
        if len(seen_ids) >= MAX_PAGE:
            seen_ids = None
        index_aggregator(collection, aggregator, entity_ids=seen_ids)
        refresh_collection(collection.id)
def bulk_write(collection, items, merge=True):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other
    aspects of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
    buffered = {}
    for raw in items:
        if not is_mapping(raw):
            raise InvalidData("Failed to read input data", errors=raw)
        proxy = namespace.apply(model.get_proxy(raw))
        proxy.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if proxy.id is None:
            raise InvalidData("No ID for entity", errors=raw)
        existing = buffered.get(proxy.id)
        if existing is None:
            buffered[proxy.id] = proxy
        else:
            # Duplicate ID within the batch: merge into the buffered proxy.
            existing.merge(proxy)
        if len(buffered) >= BULK_PAGE:
            index.index_bulk(collection.id, buffered, merge=merge)
            buffered = {}
    # Flush the final, partial page.
    if buffered:
        index.index_bulk(collection.id, buffered, merge=merge)
    refresh_collection(collection)
def add(self, prop, values, cleaned=False, quiet=False):
    """Add the given value(s) to the property if they are not empty."""
    prop = self._get_prop(prop, quiet=quiet)
    if prop is None:
        return
    # Reverse (stub) properties cannot be written to directly:
    if prop.stub:
        if quiet:
            return
        msg = gettext("Stub property (%s): %s")
        raise InvalidData(msg % (self.schema, prop))
    for raw in ensure_list(values):
        value = raw
        if not cleaned:
            value = prop.type.clean(value, countries=self.countries)
        if value is None or not isinstance(value, Hashable):
            continue
        if prop.type == registry.entity and value == self.id:
            msg = gettext("Self-relationship (%s): %s")
            raise InvalidData(msg % (self.schema, prop))
        # Somewhat hacky: limit the maximum size of any particular
        # field to avoid overloading upstream aleph/elasticsearch.
        value_size = prop.type.values_size(value)
        limit = prop.type.max_size
        if limit is not None and self._size + value_size > limit:
            msg = "[%s] too large. Rejecting additional values."
            log.warning(msg, prop.name)
            continue
        self._size += value_size
        self._properties.setdefault(prop, set()).add(value)
def add(self, prop, values, cleaned=False, quiet=False, fuzzy=False):
    """Add the given value(s) to the property if they are not empty."""
    key = self._prop_name(prop, quiet=quiet)
    if key is None:
        return
    prop = self.schema.properties[key]
    # Reverse (stub) properties cannot be written to directly:
    if prop.stub:
        if quiet:
            return
        msg = gettext("Stub property (%s): %s")
        raise InvalidData(msg % (self.schema, prop))
    for raw in value_list(values):
        value = raw if cleaned else prop.type.clean(raw, proxy=self, fuzzy=fuzzy)
        if value is None:
            continue
        if prop.type == registry.entity and value == self.id:
            msg = gettext("Self-relationship (%s): %s")
            raise InvalidData(msg % (self.schema, prop))
        # Somewhat hacky: limit the maximum size of any particular
        # field to avoid overloading upstream aleph/elasticsearch.
        length = len(value)
        limit = prop.type.max_size
        if limit is not None and self._size + length > limit:
            # Silently reject values once the field budget is exhausted.
            continue
        self._size += length
        self._properties.setdefault(key, set()).add(value)
def _generate():
    # Validate and pre-process each incoming record before indexing.
    # NOTE(review): `entities`, `unsafe` and `_process_entity` come from
    # the enclosing scope.
    for record in entities:
        if not is_mapping(record):
            raise InvalidData("Failed to read input data", errors=record)
        proxy = model.get_proxy(record)
        if proxy.id is None:
            raise InvalidData("No ID for entity", errors=proxy.to_dict())
        if not unsafe:
            proxy = remove_checksums(proxy)
        yield _process_entity(proxy)
def precise_schema(self, left, right):
    """Select the most narrow of two schemata.

    When indexing data from a dataset, an entity may be declared as a
    LegalEntity in one query, and as a Person in another. This function
    will select the most specific of two schemata offered. In the
    example, that would be Person.

    Raises ``InvalidData`` if the two schemata share no common ancestor.
    """
    if left == right:
        return left
    lefts = self.get(left)
    if lefts is None:
        return right
    if right in lefts.names:
        return left
    rights = self.get(right)
    if rights is None:
        return left
    if left in rights.names:
        return right
    # Find a common ancestor. Bug fix: the loop variables previously
    # shadowed `left`/`right`, corrupting the error message below when
    # no ancestor was found.
    for left_name in lefts.names:
        for right_name in rights.names:
            if left_name == right_name:
                return left_name
    raise InvalidData("No common ancestor: %s and %s" % (left, right))
def bulk_write(collection, entities, safe=False, role_id=None, mutable=True, index=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    seen_ids = set()
    for record in entities:
        proxy = model.get_proxy(record, cleaned=False)
        if proxy.id is None:
            raise InvalidData("No ID for entity", errors=proxy.to_dict())
        proxy = collection.ns.apply(proxy)
        if safe:
            proxy = remove_checksums(proxy)
        proxy.context = {"role_id": role_id, "mutable": mutable}
        # Preserve caller-supplied timestamps when they parse as dates.
        for field in ("created_at", "updated_at"):
            raw_ts = record.get(field)
            if raw_ts is None:
                continue
            parsed = registry.date.to_datetime(raw_ts)
            if parsed is not None:
                proxy.context[field] = parsed.isoformat()
        writer.put(proxy, origin="bulk")
        # Collect IDs for selective indexing up to MAX_PAGE.
        if index and len(seen_ids) < MAX_PAGE:
            seen_ids.add(proxy.id)
    writer.flush()
    if index:
        # Too many IDs to enumerate: re-index the whole aggregator.
        if len(seen_ids) >= MAX_PAGE:
            seen_ids = None
        index_aggregator(collection, aggregator, entity_ids=seen_ids)
        refresh_collection(collection.id)
def from_dict(cls, model, data):
    """Build a proxy from a plain dict, validating its schema."""
    # Objects that are already proxies pass straight through.
    if isinstance(data, cls):
        return data
    schema = model.get(data.get('schema'))
    if schema is None:
        raise InvalidData(gettext('No schema for entity.'))
    entity_id = data.get('id')
    properties = data.get('properties')
    return cls(schema, entity_id, properties)
def common_schema(self, left, right):
    """Select the most narrow of two schemata.

    When indexing data from a dataset, an entity may be declared as a
    LegalEntity in one query, and as a Person in another. This function
    will select the most specific of two schemata offered. In the
    example, that would be Person.

    Raises ``InvalidData`` for unknown schemata or when no common
    ancestor exists.
    """
    # Resolve names to schema objects, each falling back to the other.
    left_schema = self.get(left) or self.get(right)
    right_schema = self.get(right) or self.get(left)
    if left_schema is None or right_schema is None:
        # Bug fix: previously this fell through and crashed with an
        # AttributeError on None instead of raising InvalidData.
        raise InvalidData("Invalid schema: %s and %s" % (left, right))
    left_schemata = list(left_schema.schemata)
    right_schemata = list(right_schema.schemata)
    if right_schema in left_schemata:
        return left_schema
    if left_schema in right_schemata:
        return right_schema
    # Find a common ancestor (without shadowing the arguments, which are
    # needed intact for the error message):
    for candidate in left_schemata:
        if candidate in right_schemata:
            return candidate
    msg = "No common ancestor: %s and %s"
    raise InvalidData(msg % (left, right))
def __init__(self, model, data, key_prefix=None, cleaned=True):
    """Build an entity proxy from a raw data mapping."""
    data = dict(data)
    properties = data.pop("properties", {})
    if not cleaned:
        properties = ensure_dict(properties)
    self.schema = model.get(data.pop("schema", None))
    if self.schema is None:
        raise InvalidData(gettext("No schema for entity."))
    self.key_prefix = key_prefix
    self.id = data.pop("id", None)
    if not cleaned:
        self.id = sanitize_text(self.id)
    # Whatever remains of the mapping is retained as entity context.
    self.context = data
    self._properties = {}
    self._size = 0
    for key, value in properties.items():
        if key not in self.schema.properties:
            continue
        if cleaned:
            # Trusted input: adopt the values directly, tracking size.
            values = set(value)
            self._properties[key] = values
            self._size += sum(len(v) for v in values)
        else:
            self.add(key, value, cleaned=cleaned, quiet=True)
def add_schema(self, schema: Union[str, Schema]) -> None:
    """Try to apply the given schema to the current entity, making it more
    specific (e.g. turning a `LegalEntity` into a `Company`). This raises
    an exception if the current and new type are incompatible."""
    try:
        merged = model.common_schema(self.schema, schema)
    except InvalidData as exc:
        # Prefix the entity ID so the failure can be traced to its source.
        raise InvalidData(f"{self.id}: {exc}") from exc
    self.schema = merged
def create(cls, data, collection, validate=True):
    """Create a new entity record within the given collection."""
    entity = cls()
    raw_id = data.get('id') or make_textid()
    if not registry.entity.validate(raw_id):
        raise InvalidData(gettext("Invalid entity ID"))
    # Sign the ID into the collection's namespace before storing.
    entity.id = collection.ns.sign(raw_id)
    entity.collection_id = collection.id
    entity.update(data, collection, validate=validate)
    return entity
def _get_prop(self, prop, quiet=False):
    """Resolve a property name or instance to a Property object."""
    # Property instances pass straight through.
    if isinstance(prop, Property):
        return prop
    if prop in self.schema.properties:
        return self.schema.get(prop)
    if quiet:
        return
    msg = gettext("Unknown property (%s): %s")
    raise InvalidData(msg % (self.schema, prop))
def add(self, prop, values, cleaned=False, quiet=False, fuzzy=False):
    """Add the given value(s) to the property if they are valid for
    the type of the property.

    :param prop: can be given as a name or an instance of
        :class:`~followthemoney.property.Property`.
    :param values: either a single value, or a list of values to be added.
    :param cleaned: should the data be normalised before adding it.
    :param quiet: a reference to an non-existent property will return
        an empty list instead of raising an error.
    :param fuzzy: when normalising the data, should fuzzy matching
        be allowed.
    """
    name = self._prop_name(prop, quiet=quiet)
    if name is None:
        return
    prop = self.schema.properties[name]
    # Reverse (stub) properties cannot be written to directly:
    if prop.stub:
        if quiet:
            return
        msg = gettext("Stub property (%s): %s")
        raise InvalidData(msg % (self.schema, prop))
    for raw in value_list(values):
        value = raw if cleaned else prop.type.clean(raw, proxy=self, fuzzy=fuzzy)
        if value is None:
            continue
        if prop.type == registry.entity and value == self.id:
            msg = gettext("Self-relationship (%s): %s")
            raise InvalidData(msg % (self.schema, prop))
        # Somewhat hacky: limit the maximum size of any particular
        # field to avoid overloading upstream aleph/elasticsearch.
        size = len(value)
        limit = prop.type.max_size
        if limit is not None and self._size + size > limit:
            # Silently drop values once the field budget is used up.
            continue
        self._size += size
        self._properties.setdefault(name, set()).add(value)
def _process_entity(entity, sync=False):
    """Perform pre-index processing on an entity, includes running the
    NLP pipeline."""
    if entity.id is None:
        raise InvalidData("No ID for entity", errors=entity.to_dict())
    tag_entity(entity)
    if sync:
        # Make the entity immediately visible to subsequent reads.
        refresh_entity_id(entity.id)
    return entity
def create(cls, data, collection, role_id=None):
    """Create a new entity record in the collection, owned by a role."""
    entity = cls()
    raw_id = data.get("id") or make_textid()
    if not registry.entity.validate(raw_id):
        raise InvalidData(gettext("Invalid entity ID"))
    # Sign the ID into the collection's namespace before storing.
    entity.id = collection.ns.sign(raw_id)
    entity.collection_id = collection.id
    entity.role_id = role_id
    entity.update(data, collection)
    return entity
def validate_entity(data):
    """Check that there is a valid schema and all FtM conform to it."""
    schema = model.get(data.get("schema"))
    if schema is None:
        raise InvalidData(gettext("No schema on entity"))
    # Not strictly required: the proxy keeps only values that are valid
    # for each property, making it valid by construction. This exists
    # purely to surface an exception that notifies the user.
    schema.validate(data)
def merge(self, other):
    """Merge another entity into this one, upgrading to a common schema."""
    model = self.schema.model
    other = self.from_dict(model, other)
    self.id = self.id or other.id
    try:
        self.schema = model.common_schema(self.schema, other.schema)
    except InvalidData as exc:
        detail = "Cannot merge entities with id %s: %s" % (self.id, exc)
        raise InvalidData(detail)
    self.context.update(other.context)
    # De-duplicate (prop, value) pairs before re-adding them.
    for prop, value in set(other.itervalues()):
        self.add(prop, value, cleaned=True, quiet=True)
def _prop_name(self, prop, quiet=False): # This is pretty unwound because it gets called a *lot*. if prop in self.schema.properties: return prop try: if prop.name in self.schema.properties: return prop.name except AttributeError: pass if quiet: return msg = gettext("Unknown property (%s): %s") raise InvalidData(msg % (self.schema, prop))
def merge(self, other):
    """Merge another proxy into this one and return self."""
    model = self.schema.model
    self.id = self.id or other.id
    try:
        self.schema = model.common_schema(self.schema, other.schema)
    except InvalidData as exc:
        detail = "Cannot merge entities with id %s: %s" % (self.id, exc)
        raise InvalidData(detail)
    self.context = merge_context(self.context, other.context)
    # Re-add the other proxy's values so size limits still apply.
    for prop, values in other._properties.items():
        self.add(prop, values, cleaned=True, quiet=True)
    return self
def validate(self, data):
    """Validate a dataset against the given schema.
    This will also drop keys which are not present as properties.
    """
    errors = {}
    properties = ensure_dict(data.get('properties'))
    for name, prop in self.properties.items():
        error = prop.validate(properties.get(name))
        if error is not None:
            errors[name] = error
    if errors:
        raise InvalidData({'properties': errors})
def common_schema(self, left: Union[str, Schema], right: Union[str, Schema]) -> Schema:
    """Select the most narrow of two schemata.

    When indexing data from a dataset, an entity may be declared as a
    LegalEntity in one query, and as a Person in another. This function
    will select the most specific of two schemata offered. In the
    example, that would be Person.
    """
    # Resolve names to schema objects, each falling back to the other:
    resolved_left = self.get(left) or self.get(right)
    resolved_right = self.get(right) or self.get(left)
    if resolved_left is None or resolved_right is None:
        raise InvalidData("Invalid schema")
    # Whichever schema is a subtype of the other is the narrower one.
    for narrow, wide in ((resolved_left, resolved_right), (resolved_right, resolved_left)):
        if narrow.is_a(wide):
            return narrow
    msg = "No common schema: %s and %s"
    raise InvalidData(msg % (left, right))
def _prop_name(self, prop: P, quiet: bool = False) -> Optional[str]:
    """Resolve a property reference (name or Property) to its name."""
    # Kept unwound for speed; this is on a very hot path.
    if prop in self.schema.properties:
        return cast(str, prop)
    try:
        obj = cast(Property, prop)
        name = obj.name
    except AttributeError:
        name = None
    if name is not None and name in self.schema.properties:
        return name
    if quiet:
        return None
    msg = gettext("Unknown property (%s): %s")
    raise InvalidData(msg % (self.schema, prop))
def __init__(
    self,
    type_: PropertyType,
    value: str,
    proxy: Optional[EntityProxy] = None,
    schema: Optional[Schema] = None,
) -> None:
    """Create a graph node for the given type/value pair."""
    self.type = type_
    self.value = value
    node_id = type_.node_id_safe(value)
    if node_id is None:
        raise InvalidData("No ID for node")
    self.id = node_id
    self.proxy = proxy
    # Prefer the proxy's schema when a proxy is supplied.
    if proxy is None:
        self.schema = schema
    else:
        self.schema = proxy.schema
def __init__(self, model, data, key_prefix=None):
    """Instantiate an entity proxy from a raw dict."""
    data = dict(data)
    properties = ensure_dict(data.pop('properties', {}))
    self.schema = model.get(data.pop('schema', None))
    if self.schema is None:
        raise InvalidData(gettext('No schema for entity.'))
    self.id = sanitize_text(data.pop('id', None))
    self.key_prefix = sanitize_text(key_prefix)
    # Whatever remains of the mapping is kept as entity context.
    self.context = data
    self._properties = {}
    self._size = 0
    if is_mapping(properties):
        for key, value in properties.items():
            self.add(key, value, cleaned=True, quiet=True)
def __init__(
    self,
    model: "Model",
    data: Dict[str, Any],
    key_prefix: Optional[str] = None,
    cleaned: bool = True,
):
    data = dict(data or {})
    properties = data.pop("properties", {})
    if not cleaned:
        properties = ensure_dict(properties)
    #: The schema definition for this entity, which implies the properties
    #: That can be set on it.
    schema = model.get(data.pop("schema", None))
    if schema is None:
        raise InvalidData(gettext("No schema for entity."))
    self.schema = schema
    #: When using :meth:`~make_id` to generate a natural key for this entity,
    #: the prefix will be added to the ID as a salt to make it easier to keep
    #: IDs unique across datasets. This is somewhat redundant following the
    #: introduction of :class:`~followthemoney.namespace.Namespace`.
    self.key_prefix = key_prefix
    #: A unique identifier for this entity, usually a hashed natural key,
    #: a UUID, or a very simple slug. Can be signed using a
    #: :class:`~followthemoney.namespace.Namespace`.
    self.id = data.pop("id", None)
    if not cleaned:
        self.id = sanitize_text(self.id)
    #: If the input dictionary for the entity proxy contains fields other
    #: than ``id``, ``schema`` or ``properties``, they will be kept in here
    #: and re-added upon serialization.
    self.context = data
    self._properties: Dict[str, Set[str]] = {}
    self._size = 0
    for key, value in properties.items():
        if key not in self.schema.properties:
            continue
        if cleaned:
            # Trusted input: adopt the values directly, tracking size.
            values = set(value)
            self._properties[key] = values
            self._size += sum(len(v) for v in values)
        else:
            self.add(key, value, cleaned=cleaned, quiet=True)
def _normalize_data(data):
    """Turn entities in properties into entity ids.

    Walks every entity in ``data['layout']['entities']`` and, for each
    entity-typed property, replaces its values with bare entity IDs.
    Raises ``InvalidData`` if an entity carries an unknown schema.
    """
    entities = data['layout']['entities']
    for obj in entities:
        schema = model.get(obj.get('schema'))
        if schema is None:
            raise InvalidData("Invalid schema %s" % obj.get('schema'))
        properties = obj.get('properties', {})
        for name, values in list(properties.items()):
            prop = schema.get(name)
            # Bug fix: a property name unknown to the schema returned None
            # here, which then crashed with an AttributeError on
            # `prop.type`. Skip unknown properties instead.
            if prop is None:
                continue
            if prop.type == registry.entity:
                properties[prop.name] = [
                    get_entity_id(value) for value in ensure_list(values)
                ]
    return data
def clean_text(
    self,
    text: str,
    fuzzy: bool = False,
    format: Optional[str] = None,
    proxy: Optional["EntityProxy"] = None,
) -> Optional[str]:
    """Specific types can apply their own cleaning routines here (this
    is called by ``clean`` after the value has been converted to a
    string and null values have been filtered)."""
    if proxy is not None and text == proxy.id:
        msg = gettext("Self-relationship (%s): %s")
        raise InvalidData(msg % (proxy.schema, text))
    # Only values matching the type's pattern survive cleaning.
    if self.REGEX.match(text) is None:
        return None
    return text
def merge(self, other: "EntityProxy") -> "EntityProxy":
    """Merge another entity proxy into this one. This will try and find
    the common schema between both entities and then add all property
    values from the other entity into this one."""
    model = self.schema.model
    self.id = self.id or other.id
    try:
        self.schema = model.common_schema(self.schema, other.schema)
    except InvalidData as exc:
        detail = "Cannot merge entities with id %s: %s" % (self.id, exc)
        raise InvalidData(detail)
    self.context = merge_context(self.context, other.context)
    # Re-add the other proxy's values so per-field limits still apply.
    for prop, values in other._properties.items():
        self.add(prop, values, cleaned=True, quiet=True)
    return self