Exemplo n.º 1
0
def bulk_write(collection, items):
    """Index a stream of raw entity dicts for a collection, page by page.

    Every item has to be a mapping; it is wrapped in a proxy through
    ``model.get_proxy`` and proxies sharing the same ID are merged into one.
    Once ``BULK_PAGE`` distinct entities are buffered they are handed to
    ``index.index_bulk`` and the buffer starts over. The incoming data is
    taken as given, which is dangerous: the application has no control over
    key generation and a few other aspects of building the entity.

    :param collection: object whose ``id`` identifies the target collection.
    :param items: iterable of entity dicts.
    :raises InvalidData: for a non-mapping item or an entity without an ID.
    """
    pending = {}
    for raw in items:
        if not is_mapping(raw):
            raise InvalidData("Failed to read input data")

        proxy = model.get_proxy(raw)
        if proxy.id is None:
            raise InvalidData("No ID for entity")

        if proxy.id not in pending:
            pending[proxy.id] = proxy
        else:
            pending[proxy.id].merge(proxy)

        # Keep buffering until a full page has been collected.
        if len(pending) < BULK_PAGE:
            continue
        index.index_bulk(collection.id, pending)
        pending = {}

    # Send whatever is left over from the last, partial page.
    if pending:
        index.index_bulk(collection.id, pending)
Exemplo n.º 2
0
def bulk_write(collection, entities, unsafe=False, role_id=None, index=True):
    """Store entity dicts in the collection's aggregator and optionally index.

    Each mapping is converted into a proxy, passed through ``collection.ns``,
    stripped of checksums unless ``unsafe`` is set, stamped with the role and
    a shared creation/update timestamp, and written with origin ``bulk``.

    :param collection: target collection (provides ``ns`` and ``id``).
    :param entities: iterable of entity dicts.
    :param unsafe: when true, ``remove_checksums`` is skipped.
    :param role_id: stored in each entity's context.
    :param index: when true, the written entities are indexed afterwards and
        the collection is refreshed.
    :raises InvalidData: for a non-mapping item or an entity without an ID.
    """
    # This is called mainly by the /api/2/collections/X/_bulk API.
    timestamp = datetime.utcnow().isoformat()
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    touched_ids = set()
    for raw in entities:
        if not is_mapping(raw):
            raise InvalidData("Failed to read input data", errors=raw)
        proxy = model.get_proxy(raw)
        if proxy.id is None:
            raise InvalidData("No ID for entity", errors=proxy.to_dict())
        proxy = collection.ns.apply(proxy)
        if not unsafe:
            proxy = remove_checksums(proxy)
        proxy.context = dict(
            role_id=role_id,
            created_at=timestamp,
            updated_at=timestamp,
        )
        writer.put(proxy, origin='bulk')
        # Only track IDs up to the page limit; see the hand-off below.
        if index and len(touched_ids) < MAX_PAGE:
            touched_ids.add(proxy.id)
    writer.flush()
    if not index:
        return
    # Once the limit was reached the ID set is incomplete, so no explicit
    # ID list is passed on (presumably the indexer then covers everything
    # in the aggregator - confirm against index_aggregator).
    if len(touched_ids) >= MAX_PAGE:
        touched_ids = None
    index_aggregator(collection, aggregator, entity_ids=touched_ids)
    refresh_collection(collection.id)
Exemplo n.º 3
0
def bulk_write(collection, items, merge=True):
    """Validate raw entity dicts and push them to the index page by page.

    Every item must be a mapping. It is wrapped in a proxy, passed through a
    ``Namespace`` built from ``collection.foreign_id`` and given a context
    that marks it as bulk data of this collection. Proxies with the same ID
    are merged before being sent. The supplied data is used as-is, which is
    dangerous: the application has no control over key generation and a few
    other aspects of building the entity.

    :param collection: object providing ``foreign_id`` and ``id``.
    :param items: iterable of entity dicts.
    :param merge: forwarded unchanged to ``index.index_bulk``.
    :raises InvalidData: for a non-mapping item or an entity without an ID.
    """
    ns = Namespace(collection.foreign_id)
    pending = {}
    for raw in items:
        if not is_mapping(raw):
            raise InvalidData("Failed to read input data", errors=raw)

        proxy = ns.apply(model.get_proxy(raw))
        proxy.context = {'bulk': True, 'collection_id': collection.id}
        if proxy.id is None:
            raise InvalidData("No ID for entity", errors=raw)

        if proxy.id not in pending:
            pending[proxy.id] = proxy
        else:
            pending[proxy.id].merge(proxy)

        # Flush a full page and start collecting the next one.
        if len(pending) >= BULK_PAGE:
            index.index_bulk(collection.id, pending, merge=merge)
            pending = {}

    # Send the final, partially filled page.
    if pending:
        index.index_bulk(collection.id, pending, merge=merge)

    refresh_collection(collection)
Exemplo n.º 4
0
    def add(self, prop, values, cleaned=False, quiet=False):
        """Attach one or more non-empty values to a property of this entity.

        :param prop: property reference, resolved through ``_get_prop``.
        :param values: a single value or a list of values.
        :param cleaned: when false, every value is first passed through the
            property type's ``clean`` method.
        :param quiet: when true, an unresolved or stub property is ignored
            instead of raising.
        :raises InvalidData: for a stub property (unless ``quiet``) or when
            an entity-typed value equals this entity's own ID.
        """
        prop = self._get_prop(prop, quiet=quiet)
        if prop is None:
            return

        # Reverse (stub) properties must never be set directly.
        if prop.stub:
            if not quiet:
                msg = gettext("Stub property (%s): %s")
                raise InvalidData(msg % (self.schema, prop))
            return

        for item in ensure_list(values):
            if not cleaned:
                item = prop.type.clean(item, countries=self.countries)
            # Values end up in a set, so unhashable ones cannot be stored.
            if item is None or not isinstance(item, Hashable):
                continue
            if prop.type == registry.entity and item == self.id:
                msg = gettext("Self-relationship (%s): %s")
                raise InvalidData(msg % (self.schema, prop))

            # Somewhat hacky: cap the accumulated size of the entity so that
            # upstream aleph/elasticsearch is not overloaded.
            item_size = prop.type.values_size(item)
            limit = prop.type.max_size
            if limit is not None and self._size + item_size > limit:
                log.warning("[%s] too large. Rejecting additional values.",
                            prop.name)
                continue
            self._size += item_size

            self._properties.setdefault(prop, set()).add(item)
Exemplo n.º 5
0
    def add(self, prop, values, cleaned=False, quiet=False, fuzzy=False):
        """Attach one or more non-empty values to a property of this entity.

        :param prop: property reference, resolved to a name by ``_prop_name``.
        :param values: a single value or several values.
        :param cleaned: when false, every value is first passed through the
            property type's ``clean`` method.
        :param quiet: when true, an unresolved or stub property is ignored
            instead of raising.
        :param fuzzy: forwarded to the type's ``clean`` method.
        :raises InvalidData: for a stub property (unless ``quiet``) or when
            an entity-typed value equals this entity's own ID.
        """
        name = self._prop_name(prop, quiet=quiet)
        if name is None:
            return
        prop = self.schema.properties[name]

        # Reverse (stub) properties must never be set directly.
        if prop.stub:
            if not quiet:
                msg = gettext("Stub property (%s): %s")
                raise InvalidData(msg % (self.schema, prop))
            return

        for item in value_list(values):
            if not cleaned:
                item = prop.type.clean(item, proxy=self, fuzzy=fuzzy)
            if item is None:
                continue
            if prop.type == registry.entity and item == self.id:
                msg = gettext("Self-relationship (%s): %s")
                raise InvalidData(msg % (self.schema, prop))

            # Somewhat hacky: cap the accumulated size of the entity so that
            # upstream aleph/elasticsearch is not overloaded. Values beyond
            # the limit are dropped without any log output.
            item_size = len(item)
            limit = prop.type.max_size
            if limit is not None and self._size + item_size > limit:
                continue
            self._size += item_size
            self._properties.setdefault(name, set()).add(item)
Exemplo n.º 6
0
 def _generate():
     """Yield a processed proxy for every raw item in ``entities``.

     ``entities`` and ``unsafe`` are taken from the enclosing scope. Unless
     ``unsafe`` is set, checksums are removed before processing. Raises
     ``InvalidData`` for a non-mapping item or a proxy without an ID.
     """
     for raw in entities:
         if not is_mapping(raw):
             raise InvalidData("Failed to read input data", errors=raw)
         proxy = model.get_proxy(raw)
         if proxy.id is None:
             raise InvalidData("No ID for entity", errors=proxy.to_dict())
         yield _process_entity(proxy if unsafe else remove_checksums(proxy))
Exemplo n.º 7
0
    def precise_schema(self, left, right):
        """Select the most narrow of two schemata.

        When indexing data from a dataset, an entity may be declared as a
        LegalEntity in one query, and as a Person in another. This function
        will select the most specific of two schemata offered. In the example,
        that would be Person.

        If one of the two cannot be resolved via ``self.get``, the other one
        is returned unchanged. If neither is listed in the other's ``names``,
        the first entry of ``left``'s ``names`` that also occurs in
        ``right``'s ``names`` is returned as the common ancestor.

        :param left: schema reference, as accepted by ``self.get``.
        :param right: schema reference, as accepted by ``self.get``.
        :returns: ``left``, ``right`` or the name of a common ancestor.
        :raises InvalidData: if the two schemata share no ancestor.
        """
        if left == right:
            return left
        lefts = self.get(left)
        if lefts is None:
            return right
        if right in lefts.names:
            return left

        rights = self.get(right)
        if rights is None:
            return left
        if left in rights.names:
            return right

        # Find a common ancestor. Dedicated loop variables keep ``left`` and
        # ``right`` intact, so the error below names the schemata the caller
        # actually passed in rather than the last pair that was compared.
        for left_name in lefts.names:
            for right_name in rights.names:
                if left_name == right_name:
                    return left_name

        raise InvalidData("No common ancestor: %s and %s" % (left, right))
Exemplo n.º 8
0
def bulk_write(collection,
               entities,
               safe=False,
               role_id=None,
               mutable=True,
               index=True):
    """Store entity dicts in the collection's aggregator and optionally index.

    Each dict becomes an uncleaned proxy, is passed through ``collection.ns``
    and - when ``safe`` is set - stripped of checksums. ``created_at`` and
    ``updated_at`` values found in the input are copied into the entity
    context if ``registry.date`` can parse them.

    :param collection: target collection (provides ``ns`` and ``id``).
    :param entities: iterable of entity dicts.
    :param safe: when true, ``remove_checksums`` is applied.
    :param role_id: stored in each entity's context.
    :param mutable: stored in each entity's context.
    :param index: when true, the written entities are indexed afterwards and
        the collection is refreshed.
    :raises InvalidData: if an entity has no ID.
    """
    # This is called mainly by the /api/2/collections/X/_bulk API.
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    touched_ids = set()
    for raw in entities:
        proxy = model.get_proxy(raw, cleaned=False)
        if proxy.id is None:
            raise InvalidData("No ID for entity", errors=proxy.to_dict())
        proxy = collection.ns.apply(proxy)
        if safe:
            proxy = remove_checksums(proxy)
        proxy.context = {"role_id": role_id, "mutable": mutable}
        for field in ("created_at", "updated_at"):
            supplied = raw.get(field)
            if supplied is None:
                continue
            parsed = registry.date.to_datetime(supplied)
            if parsed is None:
                continue
            proxy.context[field] = parsed.isoformat()
        writer.put(proxy, origin="bulk")
        # Only track IDs up to the page limit; see the hand-off below.
        if index and len(touched_ids) < MAX_PAGE:
            touched_ids.add(proxy.id)
    writer.flush()
    if not index:
        return
    # Once the limit was reached the ID set is incomplete, so no explicit
    # ID list is passed on (presumably the indexer then covers everything
    # in the aggregator - confirm against index_aggregator).
    if len(touched_ids) >= MAX_PAGE:
        touched_ids = None
    index_aggregator(collection, aggregator, entity_ids=touched_ids)
    refresh_collection(collection.id)
Exemplo n.º 9
0
 def from_dict(cls, model, data):
     """Build an instance of ``cls`` from an entity dict.

     ``data`` is returned untouched if it already is an instance of ``cls``.
     Otherwise its ``schema`` is looked up in ``model`` and a new instance
     is created from the schema, the ``id`` and the ``properties``.

     :raises InvalidData: if the schema cannot be resolved.
     """
     if isinstance(data, cls):
         return data
     schema_name = data.get('schema')
     schema = model.get(schema_name)
     if schema is not None:
         return cls(schema, data.get('id'), data.get('properties'))
     raise InvalidData(gettext('No schema for entity.'))
Exemplo n.º 10
0
    def common_schema(self, left, right):
        """Select the most narrow of two schemata.

        When indexing data from a dataset, an entity may be declared as a
        LegalEntity in one query, and as a Person in another. This function
        will select the most specific of two schemata offered. In the example,
        that would be Person.

        If only one of the two references can be resolved via ``self.get``,
        that schema is used for both sides. If neither schema appears in the
        other's ``schemata``, the first entry of the left ``schemata`` that
        also occurs in the right ``schemata`` is returned.

        :param left: schema reference, as accepted by ``self.get``.
        :param right: schema reference, as accepted by ``self.get``.
        :returns: the resolved schema object.
        :raises InvalidData: if neither reference resolves to a schema, or
            if the two schemata have no common ancestor.
        """
        left_schema = self.get(left) or self.get(right)
        right_schema = self.get(right) or left_schema
        if left_schema is None or right_schema is None:
            # Fail with the domain error callers already handle, instead of
            # an AttributeError on ``None`` further down.
            raise InvalidData("Invalid schema")

        left_schemata = list(left_schema.schemata)
        right_schemata = list(right_schema.schemata)
        if right_schema in left_schemata:
            return left_schema
        if left_schema in right_schemata:
            return right_schema

        # Find a common ancestor. Dedicated loop variables keep the resolved
        # schemata intact for the error message below.
        for left_candidate in left_schemata:
            for right_candidate in right_schemata:
                if left_candidate == right_candidate:
                    return left_candidate

        msg = "No common ancestor: %s and %s"
        raise InvalidData(msg % (left_schema, right_schema))
Exemplo n.º 11
0
    def __init__(self, model, data, key_prefix=None, cleaned=True):
        """Set up the entity from a dict with schema, id and properties.

        :param model: used to resolve the ``schema`` named in ``data``.
        :param data: entity dict; keys other than ``schema``, ``id`` and
            ``properties`` are kept as the entity's context.
        :param key_prefix: stored on the instance unchanged.
        :param cleaned: when true, property values are trusted and copied
            straight into sets; when false, the ID is sanitized and every
            property is routed through :meth:`add` for cleaning.
        :raises InvalidData: if the schema cannot be resolved.
        """
        remainder = dict(data)
        raw_properties = remainder.pop("properties", {})
        if not cleaned:
            raw_properties = ensure_dict(raw_properties)

        schema = model.get(remainder.pop("schema", None))
        if schema is None:
            raise InvalidData(gettext("No schema for entity."))
        self.schema = schema
        self.key_prefix = key_prefix

        entity_id = remainder.pop("id", None)
        self.id = entity_id if cleaned else sanitize_text(entity_id)

        # Whatever is left after removing the well-known keys is context.
        self.context = remainder
        self._properties = {}
        self._size = 0

        known = self.schema.properties
        for name, raw_values in raw_properties.items():
            # Properties the schema does not define are dropped.
            if name not in known:
                continue
            if cleaned:
                unique = set(raw_values)
                self._properties[name] = unique
                self._size += sum(len(v) for v in unique)
            else:
                self.add(name, raw_values, cleaned=cleaned, quiet=True)
Exemplo n.º 12
0
 def add_schema(self, schema: Union[str, Schema]) -> None:
     """Narrow this entity's schema using the given one.

     The current and the given schema are combined through
     ``model.common_schema`` (e.g. turning a `LegalEntity` into a
     `Company`). If the two are incompatible, the resulting error is
     re-raised with the entity ID prepended to the message.
     """
     try:
         narrowed = model.common_schema(self.schema, schema)
     except InvalidData as exc:
         raise InvalidData(f"{self.id}: {exc}") from exc
     self.schema = narrowed
Exemplo n.º 13
0
 def create(cls, data, collection, validate=True):
     """Instantiate ``cls`` for ``collection`` and populate it from ``data``.

     The ID comes from ``data['id']`` or, if that is empty, from
     ``make_textid()``; it must pass ``registry.entity.validate`` and is
     signed with the collection's namespace before being stored.

     :raises InvalidData: if the entity ID is not valid.
     """
     obj = cls()
     raw_id = data.get('id') or make_textid()
     if registry.entity.validate(raw_id):
         obj.id = collection.ns.sign(raw_id)
         obj.collection_id = collection.id
         obj.update(data, collection, validate=validate)
         return obj
     raise InvalidData(gettext("Invalid entity ID"))
Exemplo n.º 14
0
 def _get_prop(self, prop, quiet=False):
     """Resolve a property reference to a ``Property`` of this schema.

     ``Property`` instances are returned as given. Names are looked up in
     the schema; an unknown name yields ``None`` when ``quiet`` is set and
     raises ``InvalidData`` otherwise.
     """
     if isinstance(prop, Property):
         return prop
     if prop in self.schema.properties:
         return self.schema.get(prop)
     if not quiet:
         msg = gettext("Unknown property (%s): %s")
         raise InvalidData(msg % (self.schema, prop))
     return None
Exemplo n.º 15
0
    def add(self, prop, values, cleaned=False, quiet=False, fuzzy=False):
        """Add the given value(s) to the property if they are valid for
        the type of the property.

        :param prop: can be given as a name or an instance of
            :class:`~followthemoney.property.Property`.
        :param values: either a single value, or a list of values to be added.
        :param cleaned: set to ``True`` if the values are already normalised.
            When ``False`` (the default), each value is passed through the
            property type's ``clean`` method before it is added.
        :param quiet: when ``True``, a reference to a non-existent or stub
            property is ignored and the method returns ``None`` instead of
            raising an error.
        :param fuzzy: when normalising the data, should fuzzy matching be allowed.
        :raises InvalidData: for a stub property (unless ``quiet``) or when
            an entity-typed value equals this entity's own ID.
        """
        prop_name = self._prop_name(prop, quiet=quiet)
        if prop_name is None:
            return
        prop = self.schema.properties[prop_name]

        # Don't allow setting the reverse properties:
        if prop.stub:
            if quiet:
                return
            msg = gettext("Stub property (%s): %s")
            raise InvalidData(msg % (self.schema, prop))

        for value in value_list(values):
            if not cleaned:
                value = prop.type.clean(value, proxy=self, fuzzy=fuzzy)
            if value is None:
                continue
            if prop.type == registry.entity and value == self.id:
                msg = gettext("Self-relationship (%s): %s")
                raise InvalidData(msg % (self.schema, prop))

            # Somewhat hacky: limit the maximum size of any particular
            # field to avoid overloading upstream aleph/elasticsearch.
            # Values that would exceed the limit are dropped silently.
            value_size = len(value)
            if prop.type.max_size is not None:
                if self._size + value_size > prop.type.max_size:
                    continue
            self._size += value_size
            self._properties.setdefault(prop_name, set()).add(value)
Exemplo n.º 16
0
def _process_entity(entity, sync=False):
    """Prepare an entity for indexing and hand it back.

    The entity must carry an ID. It is passed to ``tag_entity`` and, when
    ``sync`` is set, ``refresh_entity_id`` is called with its ID.

    :raises InvalidData: if the entity has no ID.
    """
    missing_id = entity.id is None
    if missing_id:
        raise InvalidData("No ID for entity", errors=entity.to_dict())
    tag_entity(entity)
    if sync:
        refresh_entity_id(entity.id)
    return entity
Exemplo n.º 17
0
 def create(cls, data, collection, role_id=None):
     """Instantiate ``cls`` for ``collection`` and populate it from ``data``.

     The ID comes from ``data["id"]`` or, if that is empty, from
     ``make_textid()``; it must pass ``registry.entity.validate`` and is
     signed with the collection's namespace before being stored. The given
     ``role_id`` is recorded on the new object.

     :raises InvalidData: if the entity ID is not valid.
     """
     obj = cls()
     raw_id = data.get("id") or make_textid()
     if registry.entity.validate(raw_id):
         obj.id = collection.ns.sign(raw_id)
         obj.collection_id = collection.id
         obj.role_id = role_id
         obj.update(data, collection)
         return obj
     raise InvalidData(gettext("Invalid entity ID"))
Exemplo n.º 18
0
def validate_entity(data):
    """Check that ``data`` names a known schema and validates against it.

    :raises InvalidData: if the schema cannot be resolved; any error raised
        by ``schema.validate`` propagates as well.
    """
    schema_name = data.get("schema")
    schema = model.get(schema_name)
    if schema is not None:
        # Not strictly required: a proxy only ever holds values that fit
        # its properties and is therefore valid by construction. Running
        # the validation merely raises an exception that tells the user
        # what is wrong with the input.
        schema.validate(data)
        return
    raise InvalidData(gettext("No schema on entity"))
Exemplo n.º 19
0
    def merge(self, other):
        """Fold another entity (proxy or dict) into this one.

        ``other`` is converted with ``from_dict`` first. This entity keeps
        its own ID if it has one, switches to the common schema of both
        entities, takes over ``other``'s context entries and receives all of
        its property values.

        :raises InvalidData: if the two schemata cannot be combined.
        """
        model = self.schema.model
        incoming = self.from_dict(model, other)
        self.id = self.id or incoming.id
        try:
            combined = model.common_schema(self.schema, incoming.schema)
            self.schema = combined
        except InvalidData as e:
            raise InvalidData(
                "Cannot merge entities with id %s: %s" % (self.id, e)
            )

        self.context.update(incoming.context)
        # Values are already clean; unknown properties are skipped quietly.
        for name, item in set(incoming.itervalues()):
            self.add(name, item, cleaned=True, quiet=True)
Exemplo n.º 20
0
 def _prop_name(self, prop, quiet=False):
     """Return the name of a property of this schema.

     ``prop`` may be a name or an object with a ``name`` attribute. If it
     cannot be matched, ``None`` is returned when ``quiet`` is set and
     ``InvalidData`` is raised otherwise.
     """
     # Deliberately kept flat: this gets called a *lot*.
     known = self.schema.properties
     if prop in known:
         return prop
     try:
         candidate = prop.name
         if candidate in known:
             return candidate
     except AttributeError:
         pass
     if not quiet:
         msg = gettext("Unknown property (%s): %s")
         raise InvalidData(msg % (self.schema, prop))
     return None
Exemplo n.º 21
0
    def merge(self, other):
        """Fold another entity proxy into this one and return ``self``.

        This entity keeps its own ID if it has one, switches to the common
        schema of both entities, combines both contexts via
        ``merge_context`` and receives all property values of ``other``.

        :raises InvalidData: if the two schemata cannot be combined.
        """
        model = self.schema.model
        self.id = self.id or other.id
        try:
            combined = model.common_schema(self.schema, other.schema)
            self.schema = combined
        except InvalidData as e:
            raise InvalidData(
                "Cannot merge entities with id %s: %s" % (self.id, e)
            )

        self.context = merge_context(self.context, other.context)
        # Values are already clean; unknown properties are skipped quietly.
        for name, incoming in other._properties.items():
            self.add(name, incoming, cleaned=True, quiet=True)
        return self
Exemplo n.º 22
0
 def validate(self, data):
     """Validate the ``properties`` of an entity dict against this schema.

     Every property defined on the schema is checked through its own
     ``validate`` method, using the values supplied under that name
     (``None`` if there are none). Entries in ``data`` that do not match a
     schema property are not inspected; this method does not itself remove
     anything from ``data`` and returns nothing.

     :param data: entity dict; only its ``properties`` key is read.
     :raises InvalidData: with ``{'properties': {name: error}}`` if at least
         one property fails validation.
     """
     properties = ensure_dict(data.get('properties'))
     errors = {}
     for name, prop in self.properties.items():
         error = prop.validate(properties.get(name))
         if error is not None:
             errors[name] = error
     if errors:
         raise InvalidData({'properties': errors})
Exemplo n.º 23
0
    def common_schema(self, left: Union[str, Schema],
                      right: Union[str, Schema]) -> Schema:
        """Pick the more specific of two schemata.

        When indexing data from a dataset, an entity may be declared as a
        LegalEntity in one query, and as a Person in another; in that case
        Person is returned. If only one of the two references resolves, that
        schema is used for both sides. No search for a shared ancestor is
        performed: one schema has to be a descendant of the other.

        :raises InvalidData: if neither reference resolves, or if neither
            schema ``is_a`` the other.
        """
        resolved_left = self.get(left)
        resolved_right = self.get(right)
        left_schema = resolved_left or resolved_right
        right_schema = resolved_right or resolved_left
        if left_schema is None or right_schema is None:
            raise InvalidData("Invalid schema")

        # Check left-is-a-right first, then the reverse.
        pairs = ((left_schema, right_schema), (right_schema, left_schema))
        for narrow, wide in pairs:
            if narrow.is_a(wide):
                return narrow
        raise InvalidData("No common schema: %s and %s" % (left, right))
Exemplo n.º 24
0
 def _prop_name(self, prop: P, quiet: bool = False) -> Optional[str]:
     """Return the name of a property of this schema.

     ``prop`` may be a name or an object with a ``name`` attribute. If it
     cannot be matched, ``None`` is returned when ``quiet`` is set and
     ``InvalidData`` is raised otherwise.
     """
     # Deliberately kept flat: this gets called a *lot*.
     known = self.schema.properties
     if prop in known:
         return cast(str, prop)
     try:
         candidate = cast(Property, prop).name
         if candidate in known:
             return candidate
     except AttributeError:
         pass
     if not quiet:
         msg = gettext("Unknown property (%s): %s")
         raise InvalidData(msg % (self.schema, prop))
     return None
Exemplo n.º 25
0
 def __init__(
     self,
     type_: PropertyType,
     value: str,
     proxy: Optional[EntityProxy] = None,
     schema: Optional[Schema] = None,
 ) -> None:
     """Create a node for ``value`` of the given property type.

     The node ID is derived with ``type_.node_id_safe``. When a ``proxy``
     is supplied, its schema takes precedence over the ``schema`` argument.

     :raises InvalidData: if no node ID can be derived from the value.
     """
     node_id = type_.node_id_safe(value)
     if node_id is None:
         raise InvalidData("No ID for node")
     self.type = type_
     self.value = value
     self.id = node_id
     self.proxy = proxy
     self.schema = proxy.schema if proxy is not None else schema
Exemplo n.º 26
0
    def __init__(self, model, data, key_prefix=None):
        """Set up the entity from a dict with schema, id and properties.

        :param model: used to resolve the ``schema`` named in ``data``.
        :param data: entity dict; keys other than ``schema``, ``id`` and
            ``properties`` are kept as the entity's context.
        :param key_prefix: sanitized and stored on the instance.
        :raises InvalidData: if the schema cannot be resolved.
        """
        remainder = dict(data)
        raw_properties = ensure_dict(remainder.pop('properties', {}))

        schema = model.get(remainder.pop('schema', None))
        if schema is None:
            raise InvalidData(gettext('No schema for entity.'))
        self.schema = schema

        self.id = sanitize_text(remainder.pop('id', None))
        self.key_prefix = sanitize_text(key_prefix)
        # Whatever is left after removing the well-known keys is context.
        self.context = remainder
        self._properties = {}
        self._size = 0

        if not is_mapping(raw_properties):
            return
        # Values are passed on as already cleaned; properties that cannot
        # be set are skipped quietly.
        for name, raw_values in raw_properties.items():
            self.add(name, raw_values, cleaned=True, quiet=True)
Exemplo n.º 27
0
    def __init__(
        self,
        model: "Model",
        data: Dict[str, Any],
        key_prefix: Optional[str] = None,
        cleaned: bool = True,
    ):
        """Set up the entity from a dict with schema, id and properties.

        :param model: used to resolve the ``schema`` named in ``data``.
        :param data: entity dict; a falsy value is treated as empty.
        :param key_prefix: stored on the instance unchanged.
        :param cleaned: when true, property values are trusted and copied
            straight into sets; when false, the ID is sanitized and every
            property is routed through :meth:`add` for cleaning.
        :raises InvalidData: if the schema cannot be resolved.
        """
        remainder = dict(data or {})
        raw_properties = remainder.pop("properties", {})
        if not cleaned:
            raw_properties = ensure_dict(raw_properties)

        resolved = model.get(remainder.pop("schema", None))
        if resolved is None:
            raise InvalidData(gettext("No schema for entity."))

        #: The schema definition for this entity, which implies the properties
        #: that can be set on it.
        self.schema = resolved

        #: When using :meth:`~make_id` to generate a natural key for this entity,
        #: the prefix will be added to the ID as a salt to make it easier to keep
        #: IDs unique across datasets. This is somewhat redundant following the
        #: introduction of :class:`~followthemoney.namespace.Namespace`.
        self.key_prefix = key_prefix

        entity_id = remainder.pop("id", None)

        #: A unique identifier for this entity, usually a hashed natural key,
        #: a UUID, or a very simple slug. Can be signed using a
        #: :class:`~followthemoney.namespace.Namespace`.
        self.id = entity_id if cleaned else sanitize_text(entity_id)

        #: If the input dictionary for the entity proxy contains fields other
        #: than ``id``, ``schema`` or ``properties``, they will be kept in here
        #: and re-added upon serialization.
        self.context = remainder
        self._properties: Dict[str, Set[str]] = {}
        self._size = 0

        known = self.schema.properties
        for name, raw_values in raw_properties.items():
            # Properties the schema does not define are dropped.
            if name not in known:
                continue
            if cleaned:
                unique = set(raw_values)
                self._properties[name] = unique
                self._size += sum(len(v) for v in unique)
            else:
                self.add(name, raw_values, cleaned=cleaned, quiet=True)
Exemplo n.º 28
0
def _normalize_data(data):
    """Turn entities in properties into entity ids.

    Walks ``data['layout']['entities']`` and, for every property whose type
    is ``registry.entity``, replaces the values in place with the result of
    ``get_entity_id`` for each of them. Properties the schema does not
    define are left untouched.

    :param data: dict with a ``layout`` -> ``entities`` list of entity dicts.
    :returns: the same ``data`` object, modified in place.
    :raises InvalidData: if an entity names a schema that cannot be resolved.
    """
    for obj in data['layout']['entities']:
        schema_name = obj.get('schema')
        schema = model.get(schema_name)
        if schema is None:
            raise InvalidData("Invalid schema %s" % schema_name)
        properties = obj.get('properties', {})
        # Iterate over a snapshot: keys may be added while normalising.
        for name, values in list(properties.items()):
            prop = schema.get(name)
            if prop is None:
                # Unknown property name: skip it here rather than failing
                # with an AttributeError on ``None``.
                continue
            if prop.type == registry.entity:
                properties[prop.name] = [
                    get_entity_id(value) for value in ensure_list(values)
                ]
    return data
Exemplo n.º 29
0
 def clean_text(
     self,
     text: str,
     fuzzy: bool = False,
     format: Optional[str] = None,
     proxy: Optional["EntityProxy"] = None,
 ) -> Optional[str]:
     """Type-specific cleaning hook (called by ``clean`` once the value has
     been converted to a string and null values have been filtered).

     Returns ``text`` if it matches ``self.REGEX`` and ``None`` otherwise.
     Raises ``InvalidData`` if ``text`` equals the ID of the given proxy,
     i.e. the entity would reference itself. ``fuzzy`` and ``format`` are
     not used here."""
     is_self_reference = proxy is not None and text == proxy.id
     if is_self_reference:
         msg = gettext("Self-relationship (%s): %s")
         raise InvalidData(msg % (proxy.schema, text))
     matched = self.REGEX.match(text)
     return text if matched is not None else None
Exemplo n.º 30
0
    def merge(self, other: "EntityProxy") -> "EntityProxy":
        """Fold another entity proxy into this one and return ``self``.

        This entity keeps its own ID if it has one, switches to the common
        schema of both entities, combines both contexts via
        ``merge_context`` and receives all property values of ``other``.

        :raises InvalidData: if the two schemata cannot be combined.
        """
        model = self.schema.model
        self.id = self.id or other.id
        try:
            combined = model.common_schema(self.schema, other.schema)
            self.schema = combined
        except InvalidData as e:
            raise InvalidData(
                "Cannot merge entities with id %s: %s" % (self.id, e)
            )

        self.context = merge_context(self.context, other.context)
        # Values are already clean; unknown properties are skipped quietly.
        for name, incoming in other._properties.items():
            self.add(name, incoming, cleaned=True, quiet=True)
        return self