Example No. 1
    def load_fixtures(self):
        self.admin = self.create_user(foreign_id='admin', is_admin=True)
        self.private_coll = self.create_collection(
            foreign_id='test_private',
            label="Private Collection",
            category='grey',
            creator=self.admin
        )
        self._banana = self.create_entity({
            'schema': 'Person',
            'properties': {
                'name': ['Banana'],
                'birthDate': '1970-08-21'
            }
        }, self.private_coll)
        self._banana2 = self.create_entity({
            'schema': 'Person',
            'properties': {
                'name': ['Banana'],
                'birthDate': '1970-03-21'
            }
        }, self.private_coll)
        self._banana3 = self.create_entity({
            'schema': 'Person',
            'properties': {
                'name': ['Banana'],
                'birthDate': '1970-05-21'
            }
        }, self.private_coll)
        user = Role.by_foreign_id(Role.SYSTEM_USER)
        Permission.grant(self.private_coll, user, True, False)
        self.public_coll = self.create_collection(
            foreign_id='test_public',
            label="Public Collection",
            category='news',
            creator=self.admin
        )
        self._kwazulu = self.create_entity({
            'schema': 'Company',
            'properties': {
                'name': ['KwaZulu'],
                'alias': ['kwazulu']
            }
        }, self.public_coll)
        visitor = Role.by_foreign_id(Role.SYSTEM_GUEST)
        Permission.grant(self.public_coll, visitor, True, False)
        db.session.commit()

        aggregator = get_aggregator(self.public_coll)
        aggregator.delete()
        aggregator.close()
        reindex_collection(self.public_coll, sync=True)

        aggregator = get_aggregator(self.private_coll)
        aggregator.delete()
        for sample in read_entities(self.get_fixture_path('samples.ijson')):
            aggregator.put(sample, fragment='sample')
        aggregator.close()
        reindex_collection(self.private_coll, sync=True)
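
Note: the fixture above walks through the aggregator cycle that recurs in the examples below: open the collection's aggregator, clear it, put entity fragments, close it, then re-index. A minimal sketch of that cycle, assuming an existing collection object and the same helpers imported by these examples (the entity values are invented for illustration):

aggregator = get_aggregator(collection)
aggregator.delete()                        # drop any previously cached fragments

proxy = model.make_entity('Person')        # followthemoney entity proxy
proxy.make_id('fixture', 'banana')         # deterministic id for this sketch
proxy.add('name', 'Banana')
aggregator.put(proxy, fragment='sample')   # write one fragment into the cache

aggregator.close()
reindex_collection(collection, sync=True)  # project the cache into the search index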
Example No. 2
def bulk_load_query(queue, collection, query_id, query):
    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source)
    if records_total:
        queue.progress.mark_pending(records_total)
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entities_count = 0
    for idx, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            entities_count += 1
            fragment = '%s-%s' % (query_id, idx)
            writer.put(entity, fragment=fragment)

        if idx > 0 and idx % 1000 == 0:
            queue.progress.mark_finished(1000)
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, idx, records_total or 'streaming',
                     entities_count)
    writer.flush()
    aggregator.close()
    log.info("[%s] Query done (%s entities)", collection.foreign_id,
             entities_count)
Example No. 3
def reindex_collection(collection, skip_errors=True, sync=False, flush=False):
    """Re-index all entities from the model, mappings and aggregator cache."""
    from aleph.logic.mapping import map_to_aggregator
    from aleph.logic.profiles import profile_fragments

    aggregator = get_aggregator(collection)
    for mapping in collection.mappings:
        if mapping.disabled:
            log.debug("[%s] Skip mapping: %r", collection, mapping)
            continue
        try:
            map_to_aggregator(collection, mapping, aggregator)
        except Exception:
            # More or less ignore broken models.
            log.exception("Failed mapping: %r", mapping)
    aggregate_model(collection, aggregator)
    profile_fragments(collection, aggregator)
    if flush:
        log.debug("[%s] Flushing...", collection)
        index.delete_entities(collection.id, sync=True)
    index_aggregator(collection,
                     aggregator,
                     skip_errors=skip_errors,
                     sync=sync)
    compute_collection(collection, force=True)
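
Usage sketch for the function above (a hedged example; the argument names come from the signature): flush=True purges stale index entries before the rebuild, skip_errors=True keeps one broken fragment from aborting the run, and sync=True makes the run synchronous.

reindex_collection(collection, skip_errors=True, sync=True, flush=True)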
Example No. 4
def process_collection(stage,
                       collection,
                       ingest=True,
                       reset=False,
                       sync=False):
    """Trigger a full re-parse of all documents and re-build the
    search index from the aggregator."""
    ingest = ingest or reset
    if reset:
        reset_collection(collection, sync=True)
    aggregator = get_aggregator(collection)
    try:
        writer = aggregator.bulk()
        for proxy in _collection_proxies(collection):
            writer.put(proxy, fragment='db')
            stage.report_finished(1)
        writer.flush()
        if ingest:
            for proxy in aggregator:
                ingest_entity(collection, proxy, job_id=stage.job.id)
        else:
            queue_task(collection,
                       OP_INDEX,
                       job_id=stage.job.id,
                       context={'sync': sync})
    finally:
        aggregator.close()
Example No. 5
def prune_entity(collection, entity_id=None, job_id=None):
    """Prune handles the full deletion of an entity outside of the HTTP request
    cycle. This involves cleaning up adjacent entities like xref results, notifications
    and so on."""
    # This is recursive and will also delete any entities which
    # reference the given entity. Usually this is going to be child
    # documents, or directorships referencing a person. It's a pretty
    # dangerous operation, though.
    log.info("[%s] Prune entity: %s", collection, entity_id)
    for adjacent in index.iter_adjacent(collection.id, entity_id):
        log.warning("Recursive delete: %s", adjacent.get("id"))
        delete_entity(collection, adjacent, job_id=job_id)
    flush_notifications(entity_id, clazz=Entity)
    obj = Entity.by_id(entity_id, collection=collection)
    if obj is not None:
        obj.delete()
    doc = Document.by_id(entity_id, collection=collection)
    if doc is not None:
        doc.delete()
    EntitySetItem.delete_by_entity(entity_id)
    Mapping.delete_by_table(entity_id)
    xref_index.delete_xref(collection, entity_id=entity_id)
    aggregator = get_aggregator(collection)
    aggregator.delete(entity_id=entity_id)
    refresh_entity(collection, entity_id)
    collection.touch()
    db.session.commit()
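
The recursive delete described in the comment above is destructive, so it is meant to run from a background worker rather than inside a request. A minimal sketch of such a call (proxy and job_id are illustrative names):

# Removes the entity plus adjacent entities, xref results and notifications.
prune_entity(collection, entity_id=proxy.id, job_id=job_id)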
Example No. 6
def delete_collection(collection, keep_metadata=False, sync=False):
    cancel_queue(collection)
    aggregator = get_aggregator(collection)
    try:
        aggregator.drop()
    finally:
        aggregator.close()
    flush_notifications(collection, sync=sync)
    index.delete_entities(collection.id, sync=sync)
    xref_index.delete_xref(collection, sync=sync)
    deleted_at = collection.deleted_at or datetime.utcnow()
    Entity.delete_by_collection(collection.id, deleted_at=deleted_at)
    Mapping.delete_by_collection(collection.id, deleted_at=deleted_at)
    Diagram.delete_by_collection(collection.id, deleted_at=deleted_at)
    Document.delete_by_collection(collection.id)
    if not keep_metadata:
        # Considering linkages metadata for now, might be wrong:
        Linkage.delete_by_collection(collection.id)
        Permission.delete_by_collection(collection.id, deleted_at=deleted_at)
        collection.delete(deleted_at=deleted_at)
    db.session.commit()
    if not keep_metadata:
        index.delete_collection(collection.id, sync=True)
        Authz.flush()
    refresh_collection(collection.id, sync=True)
Example No. 7
def bulk_write(collection,
               entities,
               safe=False,
               role_id=None,
               mutable=True,
               index=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entity_ids = set()
    for data in entities:
        entity = model.get_proxy(data, cleaned=False)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        entity = collection.ns.apply(entity)
        if safe:
            entity = remove_checksums(entity)
        entity.context = {"role_id": role_id, "mutable": mutable}
        for field in ("created_at", "updated_at"):
            timestamp = data.get(field)
            if timestamp is not None:
                dt = registry.date.to_datetime(timestamp)
                if dt is not None:
                    entity.context[field] = dt.isoformat()
        writer.put(entity, origin="bulk")
        if index and len(entity_ids) < MAX_PAGE:
            entity_ids.add(entity.id)
    writer.flush()
    if index:
        if len(entity_ids) >= MAX_PAGE:
            entity_ids = None
        index_aggregator(collection, aggregator, entity_ids=entity_ids)
        refresh_collection(collection.id)
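
For context, a sketch of how a caller might feed this bulk_write variant; the entity dict shape mirrors the fixtures above, and role_id is assumed to come from the authenticated role of the request:

entities = [{
    "id": "acme-inc",
    "schema": "Company",
    "properties": {"name": ["ACME, Inc."], "alias": ["ACME"]},
}]
# safe=True routes each entity through remove_checksums(), as in the body above.
bulk_write(collection, entities, safe=True, role_id=role_id, mutable=True, index=True)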
Example No. 8
def update_entity(collection, entity_id=None):
    """Update xref and aggregator after an entity has been edited."""
    from aleph.logic.xref import xref_entity
    from aleph.logic.profiles import profile_fragments

    log.info("[%s] Update entity: %s", collection, entity_id)
    entity = index.get_entity(entity_id)
    proxy = model.get_proxy(entity)
    if collection.casefile:
        xref_entity(collection, proxy)

    aggregator = get_aggregator(collection, origin=MODEL_ORIGIN)
    profile_fragments(collection, aggregator, entity_id=entity_id)

    # Inline name properties from adjacent entities. See the
    # docstring on `inline_names` for a more detailed discussion.
    prop = proxy.schema.get("namesMentioned")
    if prop is not None:
        entity_ids = proxy.get_type_values(registry.entity)
        names = set()
        for related in index.entities_by_ids(entity_ids):
            related = model.get_proxy(related)
            names.update(related.get_type_values(registry.name))

        if len(names) > 0:
            name_proxy = model.make_entity(proxy.schema)
            name_proxy.id = proxy.id
            name_proxy.add(prop, names)
            aggregator.put(name_proxy, fragment="names")

    index_aggregator(collection, aggregator, entity_ids=[entity_id])
    refresh_entity(collection, proxy.id)
Example No. 9
def upsert_entity(data,
                  collection,
                  authz=None,
                  sync=False,
                  sign=False,
                  job_id=None):
    """Create or update an entity in the database. This has a side effect  of migrating
    entities created via the _bulk API or a mapper to a database entity in the event
    that it gets edited by the user.
    """
    from aleph.logic.profiles import profile_fragments

    entity = None
    entity_id = collection.ns.sign(data.get("id"))
    if entity_id is not None:
        entity = Entity.by_id(entity_id, collection=collection)
    if entity is None:
        role_id = authz.id if authz is not None else None
        entity = Entity.create(data, collection, sign=sign, role_id=role_id)
    else:
        entity.update(data, collection, sign=sign)
    collection.touch()

    proxy = entity.to_proxy()
    aggregator = get_aggregator(collection)
    aggregator.delete(entity_id=proxy.id)
    aggregator.put(proxy, origin=MODEL_ORIGIN)
    profile_fragments(collection, aggregator, entity_id=proxy.id)

    index.index_proxy(collection, proxy, sync=sync)
    refresh_entity(collection, proxy.id)
    queue_task(collection, OP_UPDATE_ENTITY, job_id=job_id, entity_id=proxy.id)
    return entity.id
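
A sketch of a typical call to the function above, assuming authz is the authorization context of the current request (the other names are illustrative):

data = {"schema": "Person", "properties": {"name": ["Jane Doe"]}}
entity_id = upsert_entity(data, collection, authz=authz, sync=True, sign=True)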
Example No. 10
def load_mapping(collection, mapping_id, sync=False):
    """Flush and reload all entities generated by a mapping."""
    mapping = Mapping.by_id(mapping_id)
    if mapping is None:
        return log.error("Could not find mapping: %s", mapping_id)
    origin = mapping_origin(mapping.id)
    aggregator = get_aggregator(collection)
    aggregator.delete(origin=origin)
    delete_entities(collection.id, origin=origin, sync=True)
    if mapping.disabled:
        return log.info("Mapping is disabled: %s", mapping_id)
    publish(
        Events.LOAD_MAPPING,
        params={
            "collection": collection,
            "table": mapping.table_id
        },
        channels=[collection, mapping.role],
        actor_id=mapping.role_id,
    )
    try:
        map_to_aggregator(collection, mapping, aggregator)
        aggregate_model(collection, aggregator)
        index_aggregator(collection, aggregator, sync=sync)
        mapping.set_status(status=Status.SUCCESS)
        db.session.commit()
    except Exception as exc:
        mapping.set_status(status=Status.FAILED, error=str(exc))
        db.session.commit()
        aggregator.delete(origin=origin)
    finally:
        aggregator.close()
Example No. 11
def bulk_write(collection, entities, unsafe=False, role_id=None, index=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    now = datetime.utcnow().isoformat()
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entity_ids = set()
    for data in entities:
        if not is_mapping(data):
            raise InvalidData("Failed to read input data", errors=data)
        entity = model.get_proxy(data)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        entity = collection.ns.apply(entity)
        if not unsafe:
            entity = remove_checksums(entity)
        entity.context = {
            'role_id': role_id,
            'created_at': now,
            'updated_at': now,
        }
        writer.put(entity, origin='bulk')
        if index and len(entity_ids) < MAX_PAGE:
            entity_ids.add(entity.id)
    writer.flush()
    if index:
        if len(entity_ids) >= MAX_PAGE:
            entity_ids = None
        index_aggregator(collection, aggregator, entity_ids=entity_ids)
        refresh_collection(collection.id)
Example No. 12
def _query_mentions(collection):
    aggregator = get_aggregator(collection, origin=ORIGIN)
    aggregator.delete(origin=ORIGIN)
    writer = aggregator.bulk()
    for proxy in _iter_mentions(collection):
        schemata = set()
        countries = set()
        for score, _, collection_id, match in _query_item(proxy):
            schemata.add(match.schema)
            countries.update(match.get_type_values(registry.country))
            yield score, proxy, collection_id, match
        if len(schemata):
            # Assign only those countries that are backed by one of
            # the matches:
            countries = countries.intersection(proxy.get("country"))
            proxy.set("country", countries)
            # Try to be more specific about schema:
            _merge_schemata(proxy, schemata)
            # Pick a principal name:
            proxy = name_entity(proxy)
            proxy.context["mutable"] = True
            log.debug("Reifying [%s]: %s", proxy.schema.name, proxy)
            writer.put(proxy, fragment="mention")
            # pprint(proxy.to_dict())
    writer.flush()
    aggregator.close()
Example No. 13
def flush_mapping(collection, mapping_id, sync=True):
    """Delete entities loaded by a mapping"""
    log.debug("Flushing entities for mapping: %s", mapping_id)
    origin = mapping_origin(mapping_id)
    aggregator = get_aggregator(collection)
    aggregator.delete(origin=origin)
    delete_entities(collection.id, origin=origin, sync=sync)
    update_collection(collection, sync=sync)
Example No. 14
def reingest_collection(collection, job_id=None, index=False):
    """Trigger a re-ingest for all documents in the collection."""
    job_id = job_id or Job.random_id()
    aggregator = get_aggregator(collection)
    aggregator.delete(origin=OP_ANALYZE)
    aggregator.delete(origin=OP_INGEST)
    aggregator.close()
    for document in Document.by_collection(collection.id):
        proxy = document.to_proxy(ns=collection.ns)
        ingest_entity(collection, proxy, job_id=job_id, index=index)
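
Usage sketch: dropping the OP_ANALYZE and OP_INGEST fragments first forces the ingest pipeline to rebuild them for every document; the Job.random_id() default mirrors the function body.

reingest_collection(collection, job_id=Job.random_id(), index=True)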
Example No. 15
def load_mapping(stage, collection, mapping_id):
    """Flush and reload all entities generated by a mapping."""
    mapping = Mapping.by_id(mapping_id)
    if mapping is None:
        return log.error("Could not find mapping: %s", mapping_id)
    flush_mapping(stage, collection, mapping_id)
    publish(Events.LOAD_MAPPING,
            params={'collection': collection, 'table': mapping.table_id},
            channels=[collection, mapping.role],
            actor_id=mapping.role_id)
    mapper = make_mapper(collection, mapping)
    aggregator = get_aggregator(collection)
    try:
        writer = aggregator.bulk()
        entities_count = 0
        entity_ids = set()
        for idx, record in enumerate(mapper.source.records, 1):
            for entity in mapper.map(record).values():
                if entity.schema.is_a('Thing'):
                    entity.add('proof', mapping.table_id)
                entity = collection.ns.apply(entity)
                entity_ids.add(entity.id)
                entities_count += 1
                fragment = '%s-%s' % (mapping.id, idx)
                writer.put(entity, fragment=fragment)

            if idx > 0 and idx % 500 == 0:
                payload = {
                    'entity_ids': entity_ids,
                    'mapping_id': mapping.id
                }
                queue_task(collection, OP_INDEX,
                           job_id=stage.job.id,
                           payload=payload)
                entity_ids = set()
                stage.report_finished(500)
                log.info("[%s] Loaded %s records, %s entities...",
                         collection.foreign_id,
                         idx, entities_count)

        writer.flush()
        payload = {
            'entity_ids': entity_ids,
            'mapping_id': mapping.id
        }
        queue_task(collection, OP_INDEX,
                   job_id=stage.job.id,
                   payload=payload)
        mapping.set_status(status=Mapping.SUCCESS)
        log.info("[%s] Mapping done (%s entities)",
                 mapping.id, entities_count)
    except Exception as exc:
        mapping.set_status(status=Mapping.FAILED, error=str(exc))
    finally:
        aggregator.close()
Example No. 16
def index_aggregate(queue, collection, sync=False):
    """Project the contents of the collections aggregator into the index."""
    aggregator = get_aggregator(collection)
    try:
        index_entities(collection, aggregator, sync=sync)
        refresh_collection(collection.id, sync=sync)
        index_collection(collection, sync=sync)
        log.info("Aggregate indexed: %r", collection)
    finally:
        aggregator.close()
        queue.remove()
Example No. 17
def flush_mapping(stage, collection, mapping_id, sync=True):
    """Delete entities loaded by a mapping"""
    log.debug("Flushing entities for mapping: %s", mapping_id)
    origin = mapping_origin(mapping_id)
    aggregator = get_aggregator(collection)
    aggregator.delete(origin=origin)
    aggregator.close()
    delete_entities(collection.id, origin=origin, sync=sync)
    collection.touch()
    db.session.commit()
    update_collection(collection, sync=sync)
Example No. 18
def _fetch_entities(stage, collection, entity_id=None, batch=100):
    aggregator = get_aggregator(collection)
    if entity_id is not None:
        entity_id = ensure_list(entity_id)
        # WEIRD: Instead of indexing a single entity, this will try to
        # pull a whole batch of them off the queue and do it at once.
        for task in stage.get_tasks(limit=batch):
            entity_id.append(task.payload.get('entity_id'))
        stage.mark_done(len(entity_id) - 1)

    yield from aggregator.iterate(entity_id=entity_id)
    aggregator.close()
Example No. 19
def index_many(stage, collection, sync=False, entity_ids=None, batch=BATCH_SIZE):
    """Project the contents of the collections aggregator into the index."""
    if entity_ids is not None:
        entity_ids = ensure_list(entity_ids)
        # WEIRD: Instead of indexing a single entity, this will try to
        # pull a whole batch of them off the queue and do it at once.
        tasks = stage.get_tasks(limit=max(1, batch - len(entity_ids)))
        for task in tasks:
            entity_ids.extend(ensure_list(task.payload.get("entity_ids")))
        stage.mark_done(len(tasks))
    aggregator = get_aggregator(collection)
    index_aggregator(collection, aggregator, entity_ids=entity_ids, sync=sync)
    refresh_collection(collection.id)
Example No. 20
def _fetch_entities(stage, collection, entity_ids=None, batch=100):
    aggregator = get_aggregator(collection)
    if entity_ids is not None:
        entity_ids = ensure_list(entity_ids)
        # WEIRD: Instead of indexing a single entity, this will try to
        # pull a whole batch of them off the queue and do it at once.
        tasks = stage.get_tasks(limit=max(1, batch - len(entity_ids)))
        for task in tasks:
            entity_ids.extend(ensure_list(task.payload.get('entity_ids')))
        # FIXME: this doesn't retain mapping_id properly.
        stage.mark_done(len(tasks))

    yield from aggregator.iterate(entity_id=entity_ids)
    aggregator.close()
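
Since the helper above is a generator, a caller consumes it lazily; a sketch, where stage and entity_ids come from the surrounding worker task:

for proxy in _fetch_entities(stage, collection, entity_ids=entity_ids, batch=100):
    log.debug("Fetched from aggregator: %r", proxy)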
Example No. 21
def process_collection(stage, collection, ingest=True, sync=False):
    """Trigger a full re-parse of all documents and re-build the
    search index from the aggregator."""
    aggregator = get_aggregator(collection)
    for proxy in _collection_proxies(collection):
        if ingest and proxy.schema.is_a(Document.SCHEMA):
            ingest_entity(collection, proxy, job_id=stage.job.id, sync=sync)
        else:
            aggregator.put(proxy, fragment='db')
            queue_task(collection,
                       OP_INDEX,
                       job_id=stage.job.id,
                       payload={'entity_id': proxy.id},
                       context={'sync': sync})
    aggregator.close()
Example No. 22
    def load_fixtures(self):
        self.admin = self.create_user(foreign_id='admin', is_admin=True)
        self.private_coll = self.create_collection(foreign_id='test_private',
                                                   label="Private Collection",
                                                   category='grey',
                                                   casefile=False,
                                                   creator=self.admin)
        self._banana = Entity.create(
            {
                'schema': 'Person',
                'properties': {
                    'name': ['Banana'],
                }
            }, self.private_coll)
        user = Role.by_foreign_id(Role.SYSTEM_USER)
        Permission.grant(self.private_coll, user, True, False)
        self.public_coll = self.create_collection(foreign_id='test_public',
                                                  label="Public Collection",
                                                  category='news',
                                                  casefile=False,
                                                  creator=self.admin)
        self._kwazulu = Entity.create(
            {
                'schema': 'Company',
                'properties': {
                    'name': ['KwaZulu'],
                    'alias': ['kwazulu']
                }
            }, self.public_coll)
        visitor = Role.by_foreign_id(Role.SYSTEM_GUEST)
        Permission.grant(self.public_coll, visitor, True, False)
        db.session.commit()

        drop_aggregator(self.public_coll)
        stage = get_stage(self.public_coll, OP_PROCESS)
        process_collection(stage, self.public_coll, ingest=False, sync=True)

        aggregator = get_aggregator(self.private_coll)
        aggregator.delete()
        stage = get_stage(self.private_coll, OP_PROCESS)
        for sample in read_entities(self.get_fixture_path('samples.ijson')):
            aggregator.put(sample, fragment='sample')
            index_aggregate(stage,
                            self.private_coll,
                            entity_id=sample.id,
                            sync=True)
        aggregator.close()
        process_collection(stage, self.private_coll, ingest=False, sync=True)
Example No. 23
    def setUp(self):
        super(MappingAPITest, self).setUp()
        self.col = self.create_collection(foreign_id="map1")
        aggregator = get_aggregator(self.col)
        aggregator.delete()
        _, self.headers = self.login(is_admin=True)
        self.rolex = self.create_user(foreign_id="user_3")
        _, self.headers_x = self.login(foreign_id="user_3")
        self.fixture = self.get_fixture_path("experts.csv")
        self.content_hash = archive.archive_file(self.fixture)
        data = {
            "id": "foo",
            "schema": "Table",
            "properties": {
                "csvHash": self.content_hash,
                "contentHash": self.content_hash,
                "mimeType": "text/csv",
                "fileName": "experts.csv",
                "name": "experts.csv",
            },
        }
        self.ent = EntityProxy.from_dict(model, data, cleaned=False)
        self.ent.id = self.col.ns.sign(self.ent.id)
        index_proxy(self.col, self.ent)
        data = {
            "id": "foo2",
            "schema": "Table",
            "properties": {
                "csvHash": self.content_hash,
                "contentHash": self.content_hash,
                "mimeType": "text/csv",
                "fileName": "experts.csv",
                "name": "experts.csv",
            },
        }
        self.ent2 = EntityProxy.from_dict(model, data, cleaned=False)
        self.ent2.id = self.col.ns.sign(self.ent2.id)
        index_proxy(self.col, self.ent2)
        data = {
            "id": "bar",
            "schema": "LegalEntity",
            "properties": {
                "name": "John Doe"
            },
        }
        ent = EntityProxy.from_dict(model, data, cleaned=False)
        ent.id = self.col.ns.sign(ent.id)
        index_proxy(self.col, ent)
Example No. 24
def reindex_collection(collection, sync=False, flush=False):
    """Re-index all entities from the model, mappings and aggregator cache."""
    from aleph.logic.mapping import map_to_aggregator
    if flush:
        log.debug("[%s] Flushing...", collection)
        index.delete_entities(collection.id, sync=True)
    aggregator = get_aggregator(collection)
    for mapping in collection.mappings:
        try:
            map_to_aggregator(collection, mapping, aggregator)
        except Exception as ex:
            # More or less ignore broken models.
            log.warn("Failed mapping [%s]: %s", mapping.id, ex)
    aggregate_model(collection, aggregator)
    index_aggregator(collection, aggregator, sync=sync)
    compute_collection(collection, sync=True)
Example No. 25
def save_entityset_item(entityset, collection, entity_id, **data):
    """Change the association between an entity and an entityset. In the case of
    a profile, this may require re-indexing of the entity to update the associated
    profile_id.
    """
    item = EntitySetItem.save(entityset,
                              entity_id,
                              collection_id=collection.id,
                              **data)
    if entityset.type == EntitySet.PROFILE and entityset.collection_id == collection.id:
        from aleph.logic.profiles import profile_fragments

        aggregator = get_aggregator(collection)
        profile_fragments(collection, aggregator, entity_id=entity_id)
        index_aggregator(collection, aggregator, entity_ids=[entity_id])
        refresh_entity(collection, entity_id)
    refresh_entityset(entityset.id)
    return item
Example No. 26
def update_entity(collection, entity_id=None, job_id=None):
    """Worker post-processing for entity changes. This action collects operations
    that should be done after each change to an entity but are too slow to run
    inside the request cycle.

    Update xref and aggregator, trigger NER and re-index."""
    from aleph.logic.xref import xref_entity
    from aleph.logic.profiles import profile_fragments

    log.info("[%s] Update entity: %s", collection, entity_id)
    entity = index.get_entity(entity_id)
    proxy = model.get_proxy(entity)
    if collection.casefile:
        xref_entity(collection, proxy)

    aggregator = get_aggregator(collection, origin=MODEL_ORIGIN)
    profile_fragments(collection, aggregator, entity_id=entity_id)
    inline_names(aggregator, proxy)
    pipeline_entity(collection, proxy, job_id=job_id)
Example No. 27
def _fetch_entities(stage, collection, entity_id=None, batch=50):
    aggregator = get_aggregator(collection)
    try:
        if entity_id is None:
            yield from aggregator
            return
        yield from aggregator.iterate(entity_id=entity_id)

        # WEIRD: Instead of indexing a single entity, this will try to
        # pull a whole batch of them off the queue and do it at once.
        done = 0
        for task in stage.get_tasks(limit=batch):
            entity_id = task.payload.get('entity_id')
            for entity in aggregator.iterate(entity_id=entity_id):
                yield entity
                done += 1
        stage.mark_done(done)
    finally:
        aggregator.close()
Example No. 28
def process_collection(collection, ingest=True, reset=False):
    """Trigger a full re-parse of all documents and re-build the
    search index from the aggregator."""
    if reset:
        reset_collection(collection)
    aggregator = get_aggregator(collection)
    try:
        writer = aggregator.bulk()
        for proxy in _collection_proxies(collection):
            writer.put(proxy, fragment='db')
            if ingest:
                ingest_entity(collection, proxy)
        writer.flush()
        if ingest:
            ingest_wait(collection)
        else:
            index_entities(collection, aggregator)
    finally:
        aggregator.close()
Example No. 29
def delete_collection(collection, keep_metadata=False, sync=False):
    cancel_queue(collection)
    aggregator = get_aggregator(collection)
    aggregator.drop()
    flush_notifications(collection, sync=sync)
    index.delete_entities(collection.id, sync=sync)
    xref_index.delete_xref(collection, sync=sync)
    deleted_at = collection.deleted_at or datetime.utcnow()
    Mapping.delete_by_collection(collection.id)
    EntitySet.delete_by_collection(collection.id, deleted_at)
    Entity.delete_by_collection(collection.id)
    Document.delete_by_collection(collection.id)
    if not keep_metadata:
        Permission.delete_by_collection(collection.id)
        collection.delete(deleted_at=deleted_at)
    db.session.commit()
    if not keep_metadata:
        index.delete_collection(collection.id, sync=True)
        Authz.flush()
    refresh_collection(collection.id)
Example No. 30
def index_aggregate(stage, collection, entity_id=None, sync=False):
    """Project the contents of the collections aggregator into the index."""
    aggregator = get_aggregator(collection)
    try:
        entities = aggregator
        if entity_id is not None:
            entities = list(aggregator.iterate(entity_id=entity_id))

            # WEIRD: Instead of indexing a single entity, this will try to
            # pull a whole batch of them off the queue and do it at once.
            for task in stage.get_tasks(limit=50):
                entity_id = task.payload.get('entity_id')
                entities.extend(aggregator.iterate(entity_id=entity_id))
            stage.mark_done(len(entities) - 1)

            for entity in entities:
                log.debug("Index: %r", entity)
                refresh_entity_id(entity.id)
        index_entities(stage, collection, entities, sync=sync)
    finally:
        aggregator.close()