def complete_export(export_id, file_path):
    export = Export.by_id(export_id)
    file_path = ensure_path(file_path)
    export.file_name = safe_filename(file_path)
    export.file_size = file_path.stat().st_size
    export.content_hash = checksum(file_path)
    try:
        archive.archive_file(
            file_path, content_hash=export.content_hash, mime_type=export.mime_type
        )
        export.set_status(status=Status.SUCCESS)
    except Exception:
        log.exception("Failed to upload export: %s", export)
        export.set_status(status=Status.FAILED)
    db.session.commit()
    params = {"export": export}
    role = Role.by_id(export.creator_id)
    log.info("Export [%r] complete: %s", export, export.status)
    publish(
        Events.COMPLETE_EXPORT,
        params=params,
        channels=[role],
    )
    send_export_notification(export)
def export_entities(export_id, result):
    from aleph.logic import resolver

    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    try:
        entities = []
        stub = types.SimpleNamespace(result=result)
        for entity in result["results"]:
            resolver.queue(stub, Collection, entity.get("collection_id"))
            entities.append(model.get_proxy(entity))
        resolver.resolve(stub)
        file_path = export_dir.joinpath("query-export.zip")
        zf = zipfile.ZipFile(file_path, "w")
        exporter = ExcelExporter(None, extra=EXTRA_HEADERS)
        for entity in entities:
            collection_id = entity.context.get("collection_id")
            collection = resolver.get(stub, Collection, collection_id)
            extra = [entity_url(entity.id), collection.get("label")]
            exporter.write(entity, extra=extra)
            write_document(export_dir, zf, collection, entity)
        content = exporter.get_bytesio().getvalue()
        zf.writestr("Export.xlsx", content)
        zf.close()
        complete_export(export_id, file_path)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Export.STATUS_FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
def ingest_upload(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    sync = get_flag('sync', default=False)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 uploader_id=request.authz.id)
        collection.touch()
        db.session.commit()
        proxy = document.to_proxy()
        if proxy.schema.is_a(Document.SCHEMA_FOLDER) and sync:
            index_proxy(collection, proxy, sync=sync)
        ingest_entity(collection, proxy, job_id=job_id, sync=sync)
        document_id = collection.ns.sign(document.id)
        _notify(collection, document_id)
    finally:
        shutil.rmtree(upload_dir)
    return jsonify({'status': 'ok', 'id': document_id}, status=201)
def export_entities(export_id):
    export = Export.by_id(export_id)
    log.info("Export entities [%r]...", export)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    collections = {}
    try:
        filters = [export.meta.get("query", {"match_none": {}})]
        file_path = export_dir.joinpath("query-export.zip")
        with ZipFile(file_path, mode="w") as zf:
            excel_path = export_dir.joinpath(EXCEL_FILE)
            exporter = ExcelExporter(excel_path, extra=EXTRA_HEADERS)
            for entity in iter_proxies(filters=filters):
                collection_id = entity.context.get("collection_id")
                if collection_id not in collections:
                    collections[collection_id] = get_collection(collection_id)
                collection = collections[collection_id]
                if collection is None:
                    continue
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
                # Stop writing once the archive exceeds the size limit.
                if file_path.stat().st_size >= Export.MAX_FILE_SIZE:
                    log.warning("Export too large: %r", export)
                    break
            exporter.finalize()
            zf.write(excel_path, arcname=EXCEL_FILE)
        complete_export(export_id, file_path)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
def __init__(self, stage, context):
    self.stage = stage
    self.context = context
    self.work_path = ensure_path(mkdtemp(prefix='ingestor-'))
    self.emitted = set()
    self._writer = None
    self._dataset = None
def ingest_upload(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 uploader_id=request.authz.id)
        db.session.commit()
        proxy = document.to_proxy()
        ingest_entity(collection, proxy)
    finally:
        shutil.rmtree(upload_dir)
    return jsonify({
        'status': 'ok',
        'id': stringify(document.id)
    }, status=201)
def set_filepath(self, file_path):
    file_path = ensure_path(file_path)
    file_name = safe_filename(file_path)
    file_size = file_path.stat().st_size
    self.file_name = file_name
    self.file_size = file_size
    self._file_path = file_path
    self.content_hash = checksum(file_path)
def __init__(self, dataset, stage, context):
    self.dataset = dataset
    self.writer = dataset.bulk()
    self.stage = stage
    self.context = context
    self.ns = Namespace(self.context.get("namespace"))
    self.work_path = ensure_path(mkdtemp(prefix="ingestor-"))
    self.emitted = set()
def _get_local_prefix(self, content_hash, temp_path=None):
    """Determine a temporary path for the file on the local file system."""
    if temp_path is None:
        if not hasattr(self.local, 'dir'):
            self.local.dir = tempfile.mkdtemp(prefix=self.base_name)
        temp_path = self.local.dir
    return ensure_path(temp_path).joinpath('%s.sl' % content_hash)
def __init__(self, queue, context):
    self.queue = queue
    # TODO: Probably a good idea to make context readonly since we are
    # reusing it in child ingestors
    self.context = context
    self.work_path = ensure_path(mkdtemp(prefix='ingestor-'))
    self._emit_count = 0
    self._writer = None
    self._dataset = None
def make_work_file(self, file_name, prefix=None):
    if prefix is not None:
        prefix = ensure_path(prefix)
        if self.manager.work_path not in prefix.parents:
            raise ProcessingException("Path escalation: %r" % prefix)
    prefix = prefix or self.manager.work_path
    work_file = prefix.joinpath(file_name)
    if prefix not in work_file.parents:
        raise ProcessingException("Path escalation: %r" % file_name)
    if not work_file.parent.exists():
        work_file.parent.mkdir(parents=True, exist_ok=True)
    return work_file
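# A minimal usage sketch of the parents-based containment check above,
# assuming plain pathlib.Path semantics; the directory and file names here
# are hypothetical and only illustrate when ProcessingException would fire.
from pathlib import Path

work_path = Path("/tmp/ingestor-example")
inside = work_path.joinpath("pages", "page-1.txt")
outside = Path("/etc/passwd")
assert work_path in inside.parents       # accepted: lives under the work dir
assert work_path not in outside.parents  # rejected: would raise "Path escalation"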
def extract_hierarchy(self, entity, name):
    """Given a file path, create all its ancestor folders as entities"""
    foreign_id = pathlib.PurePath(entity.id)
    path = ensure_path(name)
    for name in path.as_posix().split("/")[:-1]:
        foreign_id = foreign_id.joinpath(name)
        if name in self.EXCLUDE:
            continue
        entity = self.manager.make_entity("Folder", parent=entity)
        entity.add("fileName", name)
        entity.make_id(foreign_id.as_posix())
        self.manager.emit_entity(entity)
    return entity
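# A standalone sketch of the ancestor walk performed above, with the entity
# emission stubbed out; the example path is hypothetical. The trailing file
# segment is dropped by the [:-1] slice, so only folder names are visited.
import pathlib

def ancestor_segments(name):
    # Mirrors path.as_posix().split("/")[:-1] from extract_hierarchy().
    return pathlib.PurePosixPath(name).as_posix().split("/")[:-1]

assert ancestor_segments("reports/2019/summary.pdf") == ["reports", "2019"]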
def export_matches(export_id):
    """Export the top N matches of cross-referencing for the given
    collection to an Excel formatted export."""
    export = Export.by_id(export_id)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    try:
        role = Role.by_id(export.creator_id)
        authz = Authz.from_role(role)
        collection = Collection.by_id(export.collection_id)
        file_name = "%s - Crossreference.xlsx" % collection.label
        file_path = export_dir.joinpath(file_name)
        excel = ExcelWriter()
        headers = [
            "Score",
            "Entity Name",
            "Entity Date",
            "Entity Countries",
            "Candidate Collection",
            "Candidate Name",
            "Candidate Date",
            "Candidate Countries",
            "Entity Link",
            "Candidate Link",
        ]
        sheet = excel.make_sheet("Cross-reference", headers)
        batch = []
        for match in iter_matches(collection, authz):
            batch.append(match)
            if len(batch) >= BULK_PAGE:
                _iter_match_batch(excel, sheet, batch)
                batch = []
        if len(batch):
            _iter_match_batch(excel, sheet, batch)
        with open(file_path, "wb") as fp:
            buffer = excel.get_bytesio()
            for data in buffer:
                fp.write(data)
        complete_export(export_id, file_path)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
def _ingest_path(db, conn, dataset, path, languages=[]):
    context = {'languages': languages}
    job = Job.create(conn, dataset)
    stage = job.get_stage(OP_INGEST)
    manager = Manager(db, stage, context)
    path = ensure_path(path)
    if path is not None:
        if path.is_file():
            entity = manager.make_entity('Document')
            checksum = manager.store(path)
            entity.set('contentHash', checksum)
            entity.make_id(checksum)
            entity.set('fileName', path.name)
            manager.queue_entity(entity)
        if path.is_dir():
            DirectoryIngestor.crawl(manager, path)
    manager.close()
def ingest(self, file_path, entity, **kwargs):
    """Main execution step of an ingestor."""
    file_path = ensure_path(file_path)
    if file_path.is_file() and not entity.has("fileSize"):
        entity.add("fileSize", file_path.stat().st_size)
    entity.set("processingStatus", self.STATUS_FAILURE)
    try:
        ingestor_class = self.auction(file_path, entity)
        log.info("Ingestor [%r]: %s", entity, ingestor_class.__name__)
        self.delegate(ingestor_class, file_path, entity)
        entity.set("processingStatus", self.STATUS_SUCCESS)
    except ProcessingException as pexc:
        entity.set("processingError", stringify(pexc))
        log.error("[%r] Failed to process: %s", entity, pexc)
    finally:
        self.finalize(entity)
def fixture(self, fixture_path):
    """Returns a fixture path and a dummy entity"""
    # clear out entities
    self.manager.entities = []
    self.manager.dataset.delete()
    cur_path = ensure_path(__file__).parent
    cur_path = cur_path.joinpath('fixtures')
    path = cur_path.joinpath(fixture_path)
    entity = self.manager.make_entity('Document')
    if path.is_file():
        checksum = self.manager.store(path)
        entity.make_id(path.name, checksum)
        entity.set('contentHash', checksum)
        entity.set('fileSize', path.stat().st_size)
        entity.set('fileName', path.name)
    else:
        entity.make_id(fixture_path)
    return path, entity
def _ingest_path(db, conn, dataset, path, languages=[]): context = {"languages": languages} job = Job.create(conn, dataset) stage = job.get_stage(OP_INGEST) manager = Manager(db, stage, context) path = ensure_path(path) if path is not None: if path.is_file(): entity = manager.make_entity("Document") checksum = manager.store(path) entity.set("contentHash", checksum) entity.make_id(checksum) entity.set("fileName", path.name) log.info("Queue: %r", entity.to_dict()) manager.queue_entity(entity) if path.is_dir(): DirectoryIngestor.crawl(manager, path) manager.close()
def ingest(path, dataset, languages=None):
    """Queue a set of files for ingest."""
    context = {'languages': languages}
    conn = get_redis()
    queue = ServiceQueue(conn, ServiceQueue.OP_INGEST, dataset)
    manager = Manager(queue, context)
    path = ensure_path(path)
    if path is not None:
        if path.is_file():
            entity = manager.make_entity('Document')
            checksum = manager.store(path)
            entity.set('contentHash', checksum)
            entity.make_id(checksum)
            entity.set('fileName', path.name)
            manager.queue_entity(entity)
        if path.is_dir():
            DirectoryIngestor.crawl(manager, path)
    manager.close()
def export_entities(export_id):
    export = Export.by_id(export_id)
    log.info("Export entities [%r]...", export)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    collections = {}
    try:
        filters = [export.meta.get("query", {"match_none": {}})]
        file_path = export_dir.joinpath("export.zip")
        with ZipFile(file_path, mode="w") as zf:
            excel_name = safe_filename(export.label, extension="xlsx")
            excel_path = export_dir.joinpath(excel_name)
            exporter = ExcelExporter(excel_path, extra=EXTRA_HEADERS)
            for idx, entity in enumerate(iter_proxies(filters=filters)):
                collection_id = entity.context.get("collection_id")
                if collection_id not in collections:
                    collections[collection_id] = get_collection(collection_id)
                collection = collections[collection_id]
                if collection is None:
                    continue
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
                # Cap the export by total archive size and by result count,
                # leaving a marker file in the zip when it is truncated.
                if file_path.stat().st_size >= settings.EXPORT_MAX_SIZE:
                    concern = "total size of the"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break
                if idx >= settings.EXPORT_MAX_RESULTS:
                    concern = "number of"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break
            exporter.finalize()
            zf.write(excel_path, arcname=excel_name)
        file_name = "Export: %s" % export.label
        file_name = safe_filename(file_name, extension="zip")
        complete_export(export_id, file_path, file_name)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
def fixture(self, fixture_path):
    """Returns a fixture path and a dummy entity"""
    # clear out entities
    self.manager.entities = []
    self.manager.dataset.delete()
    cur_path = ensure_path(__file__).parent
    cur_path = cur_path.joinpath("fixtures")
    path = cur_path.joinpath(fixture_path)
    entity = self.manager.make_entity("Document")
    if not path.exists():
        raise RuntimeError(path)
    if path.is_file():
        checksum = self.manager.store(path)
        entity.make_id(path.name, checksum)
        entity.set("contentHash", checksum)
        entity.set("fileSize", path.stat().st_size)
        entity.set("fileName", path.name)
    else:
        entity.make_id(fixture_path)
    return path, entity
def archive_file(self, file_path, content_hash=None, mime_type=None): """Store the file located at the given path on S3, based on a path made up from its SHA1 content hash.""" file_path = ensure_path(file_path) if content_hash is None: content_hash = checksum(file_path) # if content_hash is None: # return obj = self._locate_key(content_hash) if obj is not None: return content_hash path = '{}/data'.format(self._get_prefix(content_hash)) extra = {} if mime_type is not None: extra['ContentType'] = mime_type with open(file_path, 'rb') as fh: self.client.upload_fileobj(fh, self.bucket, str(path), ExtraArgs=extra) return content_hash
def archive_file(self, file_path, content_hash=None, mime_type=None): """Store the file located at the given path on Google, based on a path made up from its SHA1 content hash.""" file_path = ensure_path(file_path) if content_hash is None: content_hash = checksum(file_path) if content_hash is None: return file_path = ensure_posix_path(file_path) for attempt in service_retries(): try: # blob = self._locate_contenthash(content_hash) # if blob is not None: # return content_hash path = os.path.join(path_prefix(content_hash), "data") blob = Blob(path, self.bucket) blob.upload_from_filename(file_path, content_type=mime_type) return content_hash except FAILURES: log.exception("Store error in GS") backoff(failures=attempt)
def setUp(self):
    self.mock = mock_s3()
    self.mock.start()
    self.archive = init_archive('s3', bucket='foo')
    self.file = ensure_path(__file__)
def setUp(self):
    tempdir = ensure_path(tempfile.gettempdir())
    self.path = tempdir.joinpath('sltest')
    self.archive = init_archive('file', path=self.path)
    self.file = ensure_path(__file__)
def __init__(self, path=None):
    self.path = ensure_path(path)
    if self.path is None:
        raise ValueError('No archive path is set.')
    log.info("Archive: %s", self.path)
def ingest_upload(collection_id): """ --- post: summary: Upload a document to a collection description: Upload a document to a collection with id `collection_id` parameters: - in: path name: collection_id required: true schema: type: integer requestBody: content: multipart/form-data: schema: type: object properties: file: type: string format: binary description: The document to upload meta: $ref: '#/components/schemas/DocumentIngest' responses: '200': description: OK content: application/json: schema: properties: id: description: id of the uploaded document type: integer status: type: string type: object tags: - Ingest - Collection """ collection = get_db_collection(collection_id, request.authz.WRITE) job_id = get_session_id() sync = get_flag('sync', default=False) meta, foreign_id = _load_metadata() parent = _load_parent(collection, meta) upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.')) try: content_hash = None for storage in request.files.values(): path = safe_filename(storage.filename, default='upload') path = upload_dir.joinpath(path) storage.save(str(path)) content_hash = archive.archive_file(path) document = Document.save(collection=collection, parent=parent, foreign_id=foreign_id, content_hash=content_hash, meta=meta, uploader_id=request.authz.id) collection.touch() db.session.commit() proxy = document.to_proxy() if proxy.schema.is_a(Document.SCHEMA_FOLDER) and sync: index_proxy(collection, proxy, sync=sync) ingest_entity(collection, proxy, job_id=job_id, sync=sync) document_id = collection.ns.sign(document.id) _notify(collection, document_id) finally: shutil.rmtree(upload_dir) return jsonify({'status': 'ok', 'id': document_id}, status=201)
def store(self, file_path, mime_type=None):
    file_path = ensure_path(file_path)
    mime_type = normalize_mimetype(mime_type)
    if file_path is not None and file_path.is_file():
        return self.archive.archive_file(file_path, mime_type=mime_type)
def setUp(self): self.mock = mock_s3() self.mock.start() self.archive = init_archive("s3", bucket="foo", publication_bucket="foo") self.file = ensure_path(__file__)