def process_collection(stage, collection, ingest=True, reset=False, sync=False):
    """Trigger a full re-parse of all documents and re-build the search
    index from the aggregator."""
    ingest = ingest or reset
    if reset:
        reset_collection(collection, sync=True)
    aggregator = get_aggregator(collection)
    try:
        writer = aggregator.bulk()
        for proxy in _collection_proxies(collection):
            writer.put(proxy, fragment='db')
            stage.report_finished(1)
        writer.flush()
        if ingest:
            for proxy in aggregator:
                ingest_entity(collection, proxy, job_id=stage.job.id)
        else:
            queue_task(collection, OP_INDEX,
                       job_id=stage.job.id, context={'sync': sync})
    finally:
        aggregator.close()

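# Every snippet in this section funnels work through a `queue_task` helper
# whose definition is not shown. Judging purely from the call sites, a
# minimal sketch of its interface might look like this; the queue backend
# and job bookkeeping are assumptions, not part of the source:
def queue_task(collection, operation, job_id=None, context=None, payload=None):
    """Queue a background `operation`, optionally scoped to `collection`,
    passing `payload` data through to the worker."""
    ...  # hand off to the task queue (implementation not shown)

# Note: some call sites below pass payload fields as extra keyword arguments
# instead (e.g. `mapping_id=...`, `export_id=...`), which suggests a
# `**payload` variant of the same helper in other versions of the code.
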
def generate(collection_id):
    data = parse_request(XrefSchema)
    collection = get_db_collection(collection_id, request.authz.WRITE)
    against = ensure_list(data.get("against_collection_ids"))
    payload = {'against_collection_ids': against}
    queue_task(collection, OP_XREF, payload=payload)
    return jsonify({'status': 'accepted'}, status=202)

def export(collection_id):
    """
    ---
    post:
      summary: Download cross-reference results
      description: Download results of cross-referencing as an Excel file
      parameters:
      - in: path
        name: collection_id
        required: true
        schema:
          type: integer
      responses:
        '202':
          description: Accepted
      tags:
      - Xref
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.READ)
    label = "%s - Crossreference results" % collection.label
    export = create_export(
        operation=OP_EXPORT_XREF_RESULTS,
        role_id=request.authz.id,
        label=label,
        collection=collection,
        mime_type=XLSX,
    )
    job_id = get_session_id()
    payload = {
        "collection_id": collection_id,
        "export_id": export.id,
    }
    queue_task(None, OP_EXPORT_XREF_RESULTS, job_id=job_id, payload=payload)
    return ("", 202)

def flush(collection_id, mapping_id):
    """Flush all entities loaded by mapping with id `mapping_id`.
    ---
    post:
      summary: Flush entities loaded by a mapping
      parameters:
      - description: The collection id.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      - description: The mapping id.
        in: path
        name: mapping_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    mapping = obj_or_404(Mapping.by_id(mapping_id))
    queue_task(collection, OP_FLUSH_MAPPING, job_id=get_session_id(),
               payload={'mapping_id': mapping.id})
    return ('', 202)

def generate(collection_id):
    """
    ---
    post:
      summary: Generate cross-reference matches
      description: >
        Generate cross-reference matches for entities in a collection.
      parameters:
      - in: path
        name: collection_id
        required: true
        schema:
          type: integer
      responses:
        '202':
          content:
            application/json:
              schema:
                properties:
                  status:
                    description: accepted
                    type: string
                type: object
          description: Accepted
      tags:
      - Xref
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    queue_task(collection, OP_XREF)
    return jsonify({"status": "accepted"}, status=202)

def reingest(collection_id):
    """
    ---
    post:
      summary: Re-ingest a collection
      description: >
        Trigger a process to re-parse the content of all documents stored
        in the collection with id `collection_id`.
      parameters:
      - description: The collection ID.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
      - in: query
        name: index
        description: Index documents while they're being processed.
        schema:
          type: boolean
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    data = {"index": get_flag("index", False)}
    queue_task(collection, OP_REINGEST, job_id=job_id, payload=data)
    return ("", 202)

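# `get_flag` is used throughout these views to read boolean query
# parameters. Its definition is not shown; a plausible sketch, assuming
# banal's `as_bool` coercion helper, would be:
from banal import as_bool
from flask import request

def get_flag(name, default=False):
    """Read a boolean flag such as ?index=true from the query string."""
    return as_bool(request.args.get(name), default=default)
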
def reindex(collection_id):
    """
    ---
    post:
      summary: Re-index a collection
      description: >
        Re-index the entities in the collection with id `collection_id`
      parameters:
      - description: The collection ID.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
      - in: query
        description: Delete the index before re-generating it.
        name: flush
        schema:
          type: boolean
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    data = {"flush": get_flag("flush", False)}
    queue_task(collection, OP_REINDEX, job_id=job_id, payload=data)
    return ("", 202)

def export():
    """
    ---
    post:
      summary: Download the results of a search
      description: >-
        Downloads all the results of a search as a zip archive, up to a
        maximum of 10,000 results. The returned file will contain an Excel
        document with structured data as well as the binary files from all
        matching documents. Supports the same query parameters as the
        search API.
      responses:
        '202':
          description: Accepted
      tags:
      - Entity
    """
    require(request.authz.logged_in)
    parser = SearchQueryParser(request.args, request.authz)
    tag_request(query=parser.text, prefix=parser.prefix)
    query = EntitiesQuery(parser)
    label = gettext("Search: %s") % query.to_text()
    export = create_export(
        operation=OP_EXPORT_SEARCH,
        role_id=request.authz.id,
        label=label,
        mime_type=ZIP,
        meta={"query": query.get_full_query()},
    )
    job_id = get_session_id()
    queue_task(None, OP_EXPORT_SEARCH, job_id=job_id, export_id=export.id)
    return ("", 202)

def upsert_entity(data, collection, authz=None, sync=False, sign=False, job_id=None):
    """Create or update an entity in the database. This has a side effect
    of migrating entities created via the _bulk API or a mapper to a
    database entity in the event that it gets edited by the user.
    """
    from aleph.logic.profiles import profile_fragments

    entity = None
    entity_id = collection.ns.sign(data.get("id"))
    if entity_id is not None:
        entity = Entity.by_id(entity_id, collection=collection)
    if entity is None:
        role_id = authz.id if authz is not None else None
        entity = Entity.create(data, collection, sign=sign, role_id=role_id)
    else:
        entity.update(data, collection, sign=sign)
    collection.touch()

    proxy = entity.to_proxy()
    aggregator = get_aggregator(collection)
    aggregator.delete(entity_id=proxy.id)
    aggregator.put(proxy, origin=MODEL_ORIGIN)
    profile_fragments(collection, aggregator, entity_id=proxy.id)

    index.index_proxy(collection, proxy, sync=sync)
    refresh_entity(collection, proxy.id)
    queue_task(collection, OP_UPDATE_ENTITY, job_id=job_id, entity_id=proxy.id)
    return entity.id

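# A hedged usage sketch for upsert_entity, creating a simple Person. The
# dict shape follows FollowTheMoney's entity format; `collection` and the
# request-bound `authz` are assumed to come from the surrounding context.
data = {
    "schema": "Person",
    "properties": {"name": ["Jane Doe"]},
}
entity_id = upsert_entity(data, collection, authz=request.authz, sync=True)
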
def xref_collection(stage, collection):
    """Cross-reference all the entities and documents in a collection."""
    index.delete_xref(collection)
    matchable = [s.name for s in model if s.matchable]
    entities = iter_entities(collection_id=collection.id, schemata=matchable)
    for entity in entities:
        queue_task(collection, OP_XREF_ITEM, job_id=stage.job.id,
                   payload={'entity_id': entity.get('id')})

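# The `matchable` filter above restricts cross-referencing to schemata that
# the FollowTheMoney model flags as matchable. A quick way to inspect that
# set (the printed output is illustrative, not exhaustive):
from followthemoney import model

matchable = [s.name for s in model if s.matchable]
print(sorted(matchable))  # e.g. ['Company', 'LegalEntity', 'Person', ...]
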
def item_update(entityset_id):
    """Add an item to the entity set with id `entityset_id`, or change
    the item's judgement. To delete an item from the entity set, apply
    the judgement: `no_judgement`.
    ---
    post:
      summary: Add item to an entityset
      parameters:
      - description: The entityset id.
        in: path
        name: entityset_id
        required: true
        schema:
          type: string
        example: 3a0d91ece2dce88ad3259594c7b642485235a048
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/EntitySetItemUpdate'
      responses:
        '200':
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/EntitySetItem'
          description: OK
        '204':
          description: Item removed
      tags:
      - EntitySetItem
    """
    entityset = get_entityset(entityset_id, request.authz.WRITE)
    data = parse_request("EntitySetItemUpdate")
    entity = data.pop("entity", {})
    entity_id = data.pop("entity_id", entity.get("id"))
    entity = get_index_entity(entity_id, request.authz.READ)
    collection = get_db_collection(entity["collection_id"])
    data["added_by_id"] = request.authz.id
    data.pop("collection", None)
    item = save_entityset_item(entityset, collection, entity_id, **data)
    db.session.commit()
    job_id = get_session_id()
    queue_task(collection, OP_UPDATE_ENTITY, job_id=job_id, entity_id=entity_id)
    if item is not None:
        # The entityset is needed to check if the item is writeable
        # in the serializer:
        item = item.to_dict(entityset=entityset)
    else:
        item = {
            "id": "$".join((entityset_id, entity_id)),
            "entityset_id": entityset_id,
            "entityset_collection_id": entityset.collection_id,
            "entity_id": entity_id,
            "collection_id": entity["collection_id"],
            "judgement": Judgement.NO_JUDGEMENT,
        }
    return EntitySetItemSerializer.jsonify(item)

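# For reference, the Judgement values used above presumably come from an
# enum along these lines; the exact member list is an assumption based on
# the `no_judgement` value mentioned in the docstring:
import enum

class Judgement(enum.Enum):
    POSITIVE = "positive"        # assumed member
    NEGATIVE = "negative"        # assumed member
    UNSURE = "unsure"            # assumed member
    NO_JUDGEMENT = "no_judgement"
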
def process(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    # re-process the documents
    payload = {
        'ingest': get_flag('ingest', True),
        'reset': get_flag('reset', True),
    }
    queue_task(collection, OP_PROCESS, payload=payload)
    return ('', 202)

def process(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    # re-process the documents
    data = {'reset': get_flag('reset', True)}
    queue_task(collection, OP_PROCESS, job_id=get_session_id(), payload=data)
    collection.touch()
    db.session.commit()
    refresh_collection(collection_id)
    return ('', 202)

def bulkload(file_name):
    """Load entities from the specified mapping file."""
    log.info("Loading bulk data from: %s", file_name)
    config = load_mapping_file(file_name)
    for foreign_id, data in config.items():
        data['foreign_id'] = foreign_id
        data['label'] = data.get('label', foreign_id)
        create_collection(data)
        collection = Collection.by_foreign_id(foreign_id)
        queue_task(collection, OP_BULKLOAD, payload=data)

def load_mapping(stage, collection, mapping_id):
    """Flush and reload all entities generated by a mapping."""
    mapping = Mapping.by_id(mapping_id)
    if mapping is None:
        return log.error("Could not find mapping: %s", mapping_id)
    flush_mapping(stage, collection, mapping_id)
    publish(Events.LOAD_MAPPING,
            params={'collection': collection, 'table': mapping.table_id},
            channels=[collection, mapping.role],
            actor_id=mapping.role_id)
    mapper = make_mapper(collection, mapping)
    aggregator = get_aggregator(collection)
    try:
        writer = aggregator.bulk()
        entities_count = 0
        entity_ids = set()
        for idx, record in enumerate(mapper.source.records, 1):
            for entity in mapper.map(record).values():
                if entity.schema.is_a('Thing'):
                    entity.add('proof', mapping.table_id)
                entity = collection.ns.apply(entity)
                entity_ids.add(entity.id)
                entities_count += 1
                fragment = '%s-%s' % (mapping.id, idx)
                writer.put(entity, fragment=fragment)
            if idx > 0 and idx % 500 == 0:
                payload = {
                    'entity_ids': entity_ids,
                    'mapping_id': mapping.id,
                }
                queue_task(collection, OP_INDEX,
                           job_id=stage.job.id, payload=payload)
                entity_ids = set()
                stage.report_finished(500)
                log.info("[%s] Loaded %s records, %s entities...",
                         collection.foreign_id, idx, entities_count)
        writer.flush()
        payload = {
            'entity_ids': entity_ids,
            'mapping_id': mapping.id,
        }
        queue_task(collection, OP_INDEX,
                   job_id=stage.job.id, payload=payload)
        mapping.set_status(status=Mapping.SUCCESS)
        log.info("[%s] Mapping done (%s entities)",
                 mapping.id, entities_count)
    except Exception as exc:
        mapping.set_status(status=Mapping.FAILED, error=str(exc))
    finally:
        aggregator.close()

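# Worth noting in load_mapping above: entity ids are accumulated and an
# OP_INDEX task is queued every 500 source records, so indexing can begin
# while the mapping is still running. The same pattern in a generic,
# reusable form (a sketch, not part of the source):
def in_batches(items, size=500):
    """Yield lists of at most `size` items from any iterable."""
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) >= size:
            yield batch
            batch = []
    if batch:
        yield batch
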
def bulk_load(queue, collection, config):
    """Bulk load entities from a CSV file or SQL database.

    This is done by mapping the rows in the source data to entities and
    links which can be understood by the entity index.
    """
    queries = keys_values(config, 'queries', 'query')
    for query in queries:
        bulk_load_query(queue, collection, hash_data(query), query)
    queue_task(collection, OP_INDEX)
    queue.remove()

def pairwise():
    """
    ---
    post:
      summary: Make a pairwise judgement between an entity and a match.
      description: >
        This lets a user decide if they think a given xref match is a true
        or false match. Implicitly, this might create or alter a profile in
        the collection containing the entity.
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Pairwise'
      responses:
        '200':
          content:
            application/json:
              schema:
                properties:
                  status:
                    description: accepted
                    type: string
                  profile_id:
                    description: profile_id for `entity`.
                    type: string
                type: object
          description: Accepted
      tags:
      - Profile
    """
    data = parse_request("Pairwise")
    entity = get_index_entity(data.get("entity_id"))
    collection = get_db_collection(entity["collection_id"], request.authz.WRITE)
    match = get_index_entity(data.get("match_id"))
    match_collection = get_db_collection(match["collection_id"])
    profile = decide_pairwise(
        collection,
        entity,
        match_collection,
        match,
        judgement=data.get("judgement"),
        authz=request.authz,
    )
    job_id = get_session_id()
    queue_task(collection, OP_UPDATE_ENTITY, job_id=job_id,
               entity_id=entity.get("id"))
    profile_id = profile.id if profile is not None else None
    return jsonify({"status": "ok", "profile_id": profile_id}, status=200)

def mapping(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    if not request.is_json:
        raise BadRequest()
    data = request.get_json().get(collection.foreign_id)
    for query in keys_values(data, 'queries', 'query'):
        try:
            model.make_mapping(query)
        except InvalidMapping as invalid:
            raise BadRequest(invalid)
    queue_task(collection, OP_BULKLOAD, payload=data)
    return ('', 202)

def xref_collection(stage, collection, against_collection_ids=None):
    """Cross-reference all the entities and documents in a collection."""
    matchable = [s.name for s in model if s.matchable]
    entities = iter_entities(collection_id=collection.id, schemata=matchable)
    for entity in entities:
        payload = {
            'entity_id': entity.get('id'),
            'against_collection_ids': against_collection_ids,
        }
        queue_task(collection, OP_XREF_ITEM,
                   job_id=stage.job.id, payload=payload)

def process_collection(stage, collection, ingest=True, sync=False):
    """Trigger a full re-parse of all documents and re-build the search
    index from the aggregator."""
    aggregator = get_aggregator(collection)
    for proxy in _collection_proxies(collection):
        if ingest and proxy.schema.is_a(Document.SCHEMA):
            ingest_entity(collection, proxy, job_id=stage.job.id, sync=sync)
        else:
            aggregator.put(proxy, fragment='db')
            queue_task(collection, OP_INDEX,
                       job_id=stage.job.id,
                       payload={'entity_id': proxy.id},
                       context={'sync': sync})
    aggregator.close()

def update(foreign_id=None, index=False, process=False, reset=False):
    """Re-index all the collections and entities."""
    update_roles()
    q = Collection.all(deleted=True)
    if foreign_id is not None:
        q = [get_collection(foreign_id)]
    for collection in q:
        if reset:
            reset_collection(collection, sync=True)
        refresh_collection(collection.id)
        index_collection(collection)
        if collection.deleted_at is not None:
            continue
        if index or process:
            payload = {'ingest': process}
            queue_task(collection, OP_PROCESS, payload=payload)

def flush(collection_id, mapping_id):
    """Flush all entities loaded by mapping with id `mapping_id`.
    ---
    post:
      summary: Flush entities loaded by a mapping
      parameters:
      - description: The collection id.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      - description: The mapping id.
        in: path
        name: mapping_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
      - Mapping
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    mapping = obj_or_404(Mapping.by_id(mapping_id))
    mapping.disabled = True
    mapping.last_run_status = None
    mapping.last_run_err_msg = None
    db.session.add(mapping)
    db.session.commit()
    queue_task(
        collection,
        OP_FLUSH_MAPPING,
        job_id=get_session_id(),
        mapping_id=mapping_id,
    )
    return ("", 202)

def trigger(collection_id, mapping_id):
    """Load entities by running the mapping with id `mapping_id`. Flushes
    previously loaded entities before loading new entities.
    ---
    post:
      summary: Load entities from a mapping
      parameters:
      - description: The collection id.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      - description: The mapping id.
        in: path
        name: mapping_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
      - Mapping
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    mapping = obj_or_404(Mapping.by_id(mapping_id))
    mapping.disabled = False
    mapping.set_status(Status.PENDING)
    db.session.commit()
    job_id = get_session_id()
    queue_task(collection, OP_LOAD_MAPPING, job_id=job_id, mapping_id=mapping.id)
    mapping = obj_or_404(Mapping.by_id(mapping_id))
    return MappingSerializer.jsonify(mapping, status=202)

def export():
    """
    ---
    post:
      summary: Download the results of a search
      description: >-
        Downloads all the results of a search as a zip archive, up to a
        maximum of 10,000 results. The returned file will contain an Excel
        document with structured data as well as the binary files from all
        matching documents. Supports the same query parameters as the
        search API.
      responses:
        '202':
          description: Accepted
      tags:
      - Entity
    """
    require(request.authz.logged_in)
    parser = SearchQueryParser(request.args, request.authz)
    parser.limit = MAX_PAGE
    tag_request(query=parser.text, prefix=parser.prefix)
    result = EntitiesQuery.handle(request, parser=parser)
    label = "Search results for query: %s" % parser.text
    export = create_export(
        operation=OP_EXPORT_SEARCH_RESULTS,
        role_id=request.authz.id,
        label=label,
        file_path=None,
        expires_after=Export.DEFAULT_EXPIRATION,
        collection=None,
        mime_type=ZIP,
    )
    job_id = get_session_id()
    payload = {
        "export_id": export.id,
        "result": result.to_dict(),
    }
    queue_task(None, OP_EXPORT_SEARCH_RESULTS, job_id=job_id, payload=payload)
    return ("", 202)

def delete(collection_id, mapping_id):
    """Delete a mapping.
    ---
    delete:
      summary: Delete a mapping
      parameters:
      - description: The collection id.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      - description: The mapping id.
        in: path
        name: mapping_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      responses:
        '204':
          description: No Content
      tags:
      - Collection
      - Mapping
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    mapping = obj_or_404(Mapping.by_id(mapping_id))
    mapping.delete()
    db.session.commit()
    queue_task(
        collection,
        OP_FLUSH_MAPPING,
        job_id=get_session_id(),
        mapping_id=mapping_id,
    )
    return ("", 204)

def generate(collection_id):
    """
    ---
    post:
      summary: Generate cross-reference matches
      description: >
        Generate cross-reference matches for entities in a collection.
      parameters:
      - in: path
        name: collection_id
        required: true
        schema:
          type: integer
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/XrefGenerate'
      responses:
        '202':
          content:
            application/json:
              schema:
                properties:
                  status:
                    description: accepted
                    type: string
                type: object
          description: Accepted
      tags:
      - Xref
      - Collection
    """
    data = parse_request('XrefGenerate')
    collection = get_db_collection(collection_id, request.authz.WRITE)
    against = ensure_list(data.get("against_collection_ids"))
    payload = {'against_collection_ids': against}
    queue_task(collection, OP_XREF, payload=payload)
    return jsonify({'status': 'accepted'}, status=202)

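# A hedged usage sketch for the endpoint above, called over HTTP with the
# `requests` library. The URL path, host, and auth header format are
# assumptions for illustration; only the JSON body shape comes from the
# XrefGenerate schema used above.
import requests

resp = requests.post(
    "https://aleph.example.org/api/2/collections/42/xref",  # assumed route
    headers={"Authorization": "Token <api-key>"},
    json={"against_collection_ids": [7, 9]},
)
assert resp.status_code == 202  # body: {"status": "accepted"}
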
def trigger(collection_id, mapping_id):
    """Load entities by running the mapping with id `mapping_id`. Flushes
    previously loaded entities before loading new entities.
    ---
    post:
      summary: Load entities from a mapping
      parameters:
      - description: The collection id.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      - description: The mapping id.
        in: path
        name: mapping_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
      - Mapping
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    mapping = obj_or_404(Mapping.by_id(mapping_id))
    job_id = get_session_id()
    payload = {'mapping_id': mapping.id}
    queue_task(collection, OP_LOAD_MAPPING, job_id=job_id, payload=payload)
    collection.touch()
    db.session.commit()
    return ('', 202)

def process(collection_id):
    """
    ---
    post:
      summary: Process a collection
      description: Start processing the collection with id `collection_id`
      parameters:
      - description: The collection ID.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
      - in: query
        name: ingest
        schema:
          type: boolean
      - in: query
        name: reset
        schema:
          type: boolean
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    # re-process the documents
    data = {'reset': get_flag('reset', True)}
    queue_task(collection, OP_PROCESS, job_id=get_session_id(), payload=data)
    collection.touch()
    db.session.commit()
    refresh_collection(collection_id)
    return ('', 202)

def xref(foreign_id, against=None):
    """Cross-reference all entities and documents in a collection."""
    collection = get_collection(foreign_id)
    against = [get_collection(c).id for c in ensure_list(against)]
    against = {'against_collection_ids': against}
    queue_task(collection, OP_XREF, payload=against)

def retry_exports():
    for export in Export.get_pending():
        queue_task(None, export.operation, payload={"export_id": export.id})
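
# retry_exports re-queues any export that never completed. How it gets
# invoked is not shown in the source; a deployment would presumably call it
# from a periodic worker. A purely illustrative driver loop, with an
# arbitrary interval:
import time

def run_export_retries(interval=300):
    """Hypothetical periodic hook; not part of the source."""
    while True:
        retry_exports()
        time.sleep(interval)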