def crawl_directory(collection, path, parent=None, job_id=None):
    """Crawl the contents of the given path."""
    try:
        content_hash = None
        if not path.is_dir():
            content_hash = archive.archive_file(path)
        foreign_id = path.name
        if parent is not None:
            foreign_id = os.path.join(parent.foreign_id, foreign_id)

        # if the job_id is not set yet and path.is_dir(), we know it is the
        # first iteration and we don't create an initial root folder as parent
        # to be consistent with the behaviour of alephclient
        if path.is_dir() and job_id is None:
            document = None
            job_id = Job.random_id()
        else:
            meta = {"file_name": path.name}
            document = Document.save(
                collection,
                parent=parent,
                foreign_id=foreign_id,
                content_hash=content_hash,
                meta=meta,
            )
            db.session.commit()
            job_id = job_id or Job.random_id()
            proxy = document.to_proxy()
            ingest_flush(collection, entity_id=proxy.id)
            ingest_entity(collection, proxy, job_id=job_id)
            log.info("Crawl [%s]: %s -> %s", collection.id, path, document.id)

        if path.is_dir():
            for child in path.iterdir():
                crawl_directory(collection, child, document, job_id)
    except OSError:
        log.exception("Cannot crawl directory: %s", path)
def crawl_directory(collection, path, parent=None, job_id=None):
    """Crawl the contents of the given path."""
    try:
        content_hash = None
        if not path.is_dir():
            content_hash = archive.archive_file(path)
        foreign_id = path.name
        if parent is not None:
            foreign_id = os.path.join(parent.foreign_id, foreign_id)
        meta = {'file_name': path.name}
        document = Document.save(collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta)
        db.session.commit()
        job_id = job_id or Job.random_id()
        ingest_entity(collection, document.to_proxy(), job_id=job_id)
        log.info("Crawl [%s]: %s -> %s", collection.id, path, document.id)
        if path.is_dir():
            for child in path.iterdir():
                crawl_directory(collection, child, document, job_id)
    except OSError:
        log.exception("Cannot crawl directory: %s", path)
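Either version of crawl_directory can be driven by hand from the Aleph shell. The following is a minimal sketch, not a definitive recipe: the module path, the collection foreign id and the import directory are assumptions and may differ in your deployment.

# Sketch only: module path, foreign id and directory below are assumptions.
from pathlib import Path

from aleph.model import Collection
from aleph.logic.documents import crawl_directory

# look up an existing collection by its (hypothetical) foreign id
collection = Collection.by_foreign_id("my_investigation")
# recursively archive and queue every file under the given directory
crawl_directory(collection, Path("/data/import/my_investigation"))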
def ingest_upload(collection_id):
    """
    ---
    post:
      summary: Upload a document to a collection
      description: Upload a document to a collection with id `collection_id`
      parameters:
      - in: path
        name: collection_id
        required: true
        schema:
          type: integer
      requestBody:
        content:
          multipart/form-data:
            schema:
              type: object
              properties:
                file:
                  type: string
                  format: binary
                  description: The document to upload
                meta:
                  $ref: '#/components/schemas/DocumentIngest'
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                properties:
                  id:
                    description: id of the uploaded document
                    type: integer
                  status:
                    type: string
                type: object
      tags:
      - Ingest
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    sync = get_flag('sync', default=False)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 uploader_id=request.authz.id)
        collection.touch()
        db.session.commit()
        proxy = document.to_proxy()
        if proxy.schema.is_a(Document.SCHEMA_FOLDER) and sync:
            index_proxy(collection, proxy, sync=sync)
        ingest_entity(collection, proxy, job_id=job_id, sync=sync)
        document_id = collection.ns.sign(document.id)
        _notify(collection, document_id)
    finally:
        shutil.rmtree(upload_dir)
    return jsonify({'status': 'ok', 'id': document_id}, status=201)
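From the outside, this view is exercised with a multipart upload. The sketch below shows one way to call it with the requests library; it assumes the view is mounted at /api/2/collections/<collection_id>/ingest and that an ApiKey authorization header is accepted, which is how alephclient talks to upstream Aleph. Host, API key, collection id and file name are placeholders.

# Sketch only: endpoint path, auth header and meta fields are assumptions
# based on how alephclient talks to an upstream Aleph instance.
import json
import requests

ALEPH_HOST = "https://aleph.example.org"   # hypothetical instance
API_KEY = "..."                            # hypothetical API key
COLLECTION_ID = 125

url = f"{ALEPH_HOST}/api/2/collections/{COLLECTION_ID}/ingest"
meta = {"file_name": "report.pdf"}         # parsed server-side by _load_metadata()

with open("report.pdf", "rb") as fh:
    res = requests.post(
        url,
        headers={"Authorization": f"ApiKey {API_KEY}"},
        params={"sync": "true"},           # read by get_flag('sync', ...)
        data={"meta": json.dumps(meta)},
        files={"file": fh},
    )
res.raise_for_status()
print(res.json())                          # expected shape: {'status': 'ok', 'id': ...}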
# execute in the aleph shell:
# exec(open("reingest_partial.py").read())
from servicelayer.jobs import Job
from aleph.queues import ingest_entity
from aleph.model import Collection, Document

job_id = Job.random_id()
collection = Collection.by_id(125)

# the CSV holds one document id per line
with open('collection_ftm_failures.csv', 'r') as f:
    for line in f:
        document_id = line.strip()  # drop the trailing newline
        if not document_id:
            continue
        print("reingest " + document_id)
        document = Document.by_id(document_id)
        # build a proxy whose id is signed with the collection namespace
        proxy = document.to_proxy(ns=collection.ns)
        ingest_entity(collection, proxy, job_id=job_id, index=True)
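For reference, the script expects collection_ftm_failures.csv to contain nothing but one document id per line, for example (illustrative ids only):

4021
4022
4038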