def load_mapping(collection, mapping_id, sync=False):
    """Flush and reload all entities generated by a mapping."""
    mapping = Mapping.by_id(mapping_id)
    if mapping is None:
        return log.error("Could not find mapping: %s", mapping_id)
    origin = mapping_origin(mapping.id)
    aggregator = get_aggregator(collection)
    aggregator.delete(origin=origin)
    delete_entities(collection.id, origin=origin, sync=True)
    if mapping.disabled:
        return log.info("Mapping is disabled: %s", mapping_id)
    publish(
        Events.LOAD_MAPPING,
        params={"collection": collection, "table": mapping.table_id},
        channels=[collection, mapping.role],
        actor_id=mapping.role_id,
    )
    try:
        map_to_aggregator(collection, mapping, aggregator)
        aggregate_model(collection, aggregator)
        index_aggregator(collection, aggregator, sync=sync)
        mapping.set_status(status=Status.SUCCESS)
        db.session.commit()
    except Exception as exc:
        mapping.set_status(status=Status.FAILED, error=str(exc))
        db.session.commit()
        aggregator.delete(origin=origin)
    finally:
        aggregator.close()
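

# Usage sketch (not part of the original module; `Collection.by_id` and the
# helper name are assumptions): a task handler re-running a mapping might look
# roughly like this.
def _example_reload_mapping(collection_id, mapping_id):
    collection = Collection.by_id(collection_id)
    if collection is not None:
        # Drop and regenerate all entities produced by the mapping, waiting
        # for the index to be updated before returning.
        load_mapping(collection, mapping_id, sync=True)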
def bulk_write(collection, entities, unsafe=False, role_id=None, index=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    now = datetime.utcnow().isoformat()
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entity_ids = set()
    for data in entities:
        if not is_mapping(data):
            raise InvalidData("Failed to read input data", errors=data)
        entity = model.get_proxy(data)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        entity = collection.ns.apply(entity)
        if not unsafe:
            entity = remove_checksums(entity)
        entity.context = {
            'role_id': role_id,
            'created_at': now,
            'updated_at': now,
        }
        writer.put(entity, origin='bulk')
        if index and len(entity_ids) < MAX_PAGE:
            entity_ids.add(entity.id)
    writer.flush()
    if index:
        if len(entity_ids) >= MAX_PAGE:
            entity_ids = None
        index_aggregator(collection, aggregator, entity_ids=entity_ids)
        refresh_collection(collection.id)
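

# Usage sketch (assumption; the helper name is illustrative): this variant
# stamps every entity with the current time and strips checksums unless
# `unsafe` is set. Compare the `safe`/`mutable` variant of `bulk_write`
# further below, which takes timestamps from the input data instead.
def _example_bulk_import_legacy(collection, entities, role_id=None):
    bulk_write(collection, entities, unsafe=False, role_id=role_id, index=True)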
def update_entity(collection, entity_id=None):
    """Update xref and aggregator after an entity has been edited."""
    from aleph.logic.xref import xref_entity
    from aleph.logic.profiles import profile_fragments

    log.info("[%s] Update entity: %s", collection, entity_id)
    entity = index.get_entity(entity_id)
    proxy = model.get_proxy(entity)
    if collection.casefile:
        xref_entity(collection, proxy)

    aggregator = get_aggregator(collection, origin=MODEL_ORIGIN)
    profile_fragments(collection, aggregator, entity_id=entity_id)

    # Inline name properties from adjacent entities. See the
    # docstring on `inline_names` for a more detailed discussion.
    prop = proxy.schema.get("namesMentioned")
    if prop is not None:
        entity_ids = proxy.get_type_values(registry.entity)
        names = set()
        for related in index.entities_by_ids(entity_ids):
            related = model.get_proxy(related)
            names.update(related.get_type_values(registry.name))

        if len(names) > 0:
            name_proxy = model.make_entity(proxy.schema)
            name_proxy.id = proxy.id
            name_proxy.add(prop, names)
            aggregator.put(name_proxy, fragment="names")

    index_aggregator(collection, aggregator, entity_ids=[entity_id])
    refresh_entity(collection, proxy.id)
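

# Usage sketch (assumption; the helper name and call site are illustrative):
# `update_entity` is meant to run after an edit has already been written to
# the store, e.g. from an API view or a background task.
def _example_after_entity_edit(collection, entity_id):
    # Re-runs xref (for casefiles), profile fragments and the name inlining
    # above, then re-indexes just this one entity.
    update_entity(collection, entity_id=entity_id)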
def bulk_write(
    collection, entities, safe=False, role_id=None, mutable=True, index=True
):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entity_ids = set()
    for data in entities:
        entity = model.get_proxy(data, cleaned=False)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        entity = collection.ns.apply(entity)
        if safe:
            entity = remove_checksums(entity)
        entity.context = {"role_id": role_id, "mutable": mutable}
        for field in ("created_at", "updated_at"):
            timestamp = data.get(field)
            if timestamp is not None:
                dt = registry.date.to_datetime(timestamp)
                if dt is not None:
                    entity.context[field] = dt.isoformat()
        writer.put(entity, origin="bulk")
        if index and len(entity_ids) < MAX_PAGE:
            entity_ids.add(entity.id)
    writer.flush()
    if index:
        if len(entity_ids) >= MAX_PAGE:
            entity_ids = None
        index_aggregator(collection, aggregator, entity_ids=entity_ids)
        refresh_collection(collection.id)
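

# Usage sketch (assumption): `bulk_write` consumes followthemoney-style entity
# dicts. The ID, schema and property values below are invented for
# illustration; `created_at`/`updated_at` are optional and parsed via
# `registry.date` as shown above.
EXAMPLE_ENTITIES = [
    {
        "id": "0000-example-person",
        "schema": "Person",
        "properties": {"name": ["Jane Doe"]},
        "created_at": "2021-01-01T00:00:00",
    },
]


def _example_bulk_import(collection, role_id=None):
    # safe=True routes each entity through remove_checksums before writing;
    # index=True triggers indexing once the batch has been flushed.
    bulk_write(collection, EXAMPLE_ENTITIES, safe=True, role_id=role_id, index=True)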
def index_many(stage, collection, sync=False, entity_ids=None, batch=BATCH_SIZE):
    """Project the contents of the collection's aggregator into the index."""
    if entity_ids is not None:
        entity_ids = ensure_list(entity_ids)
        # WEIRD: Instead of indexing a single entity, this will try to
        # pull a whole batch of them off the queue and do it at once.
        tasks = stage.get_tasks(limit=max(1, batch - len(entity_ids)))
        for task in tasks:
            entity_ids.extend(ensure_list(task.payload.get("entity_ids")))
        stage.mark_done(len(tasks))
    aggregator = get_aggregator(collection)
    index_aggregator(collection, aggregator, entity_ids=entity_ids, sync=sync)
    refresh_collection(collection.id)
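

# Usage sketch (assumption): `stage` is the job queue stage handing out
# indexing tasks (only its `get_tasks`/`mark_done` interface is used above).
# A worker might dispatch a single task payload like this.
def _example_handle_index_task(stage, collection, payload):
    index_many(stage, collection, sync=False, entity_ids=payload.get("entity_ids"))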
def save_entityset_item(entityset, collection, entity_id, **data):
    """Change the association between an entity and an entityset.

    In the case of a profile, this may require re-indexing of the entity to
    update the associated profile_id.
    """
    item = EntitySetItem.save(
        entityset, entity_id, collection_id=collection.id, **data
    )
    if entityset.type == EntitySet.PROFILE and entityset.collection_id == collection.id:
        from aleph.logic.profiles import profile_fragments

        aggregator = get_aggregator(collection)
        profile_fragments(collection, aggregator, entity_id=entity_id)
        index_aggregator(collection, aggregator, entity_ids=[entity_id])
        refresh_entity(collection, entity_id)
    refresh_entityset(entityset.id)
    return item
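

# Usage sketch (assumption; the keys carried in `data` depend on the
# EntitySetItem model and are not shown in this section): adding an entity to
# a profile-type entity set takes the re-indexing branch above so the entity
# is indexed with the profile's ID.
def _example_add_to_profile(profile, collection, entity_id, **data):
    # `profile` is an EntitySet with type == EntitySet.PROFILE.
    return save_entityset_item(profile, collection, entity_id, **data)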