def test_basic_archive(self):
    checksum_ = checksum(self.file)
    assert checksum_ is not None, checksum_
    out = self.archive.archive_file(self.file)
    assert checksum_ == out, (checksum_, out)
    out2 = self.archive.archive_file(self.file)
    assert out == out2, (out, out2)
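# The snippets in this section all lean on checksum(). A minimal sketch of such
# a helper, assuming a streamed SHA1 hex digest over the file contents; the
# name, chunk size and error handling are illustrative, not the actual code.
import hashlib


def _checksum_sketch(file_path, chunk_size=8192):
    """Return the SHA1 hex digest of the file, or None if it cannot be read."""
    digest = hashlib.sha1()
    try:
        with open(file_path, 'rb') as fh:
            for chunk in iter(lambda: fh.read(chunk_size), b''):
                digest.update(chunk)
    except OSError:
        return None
    return digest.hexdigest()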
def complete_export(export_id, file_path):
    """Attach the generated file to the export record, archive it, and
    notify the export's creator."""
    export = Export.by_id(export_id)
    file_path = ensure_path(file_path)
    export.file_name = safe_filename(file_path)
    export.file_size = file_path.stat().st_size
    export.content_hash = checksum(file_path)
    try:
        archive.archive_file(
            file_path,
            content_hash=export.content_hash,
            mime_type=export.mime_type,
        )
        export.set_status(status=Status.SUCCESS)
    except Exception:
        log.exception("Failed to upload export: %s", export)
        export.set_status(status=Status.FAILED)
    db.session.commit()
    params = {"export": export}
    role = Role.by_id(export.creator_id)
    log.info("Export [%r] complete: %s", export, export.status)
    publish(
        Events.COMPLETE_EXPORT,
        params=params,
        channels=[role],
    )
    send_export_notification(export)
def ingest_upload(collection_id):
    require(request.authz.can(collection_id, request.authz.WRITE))
    sync = get_flag('sync')
    meta, foreign_id = _load_metadata()
    parent_id = _load_parent(collection_id, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        path = None
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
        document = Document.by_keys(collection_id=collection_id,
                                    parent_id=parent_id,
                                    foreign_id=foreign_id,
                                    content_hash=content_hash)
        document.update(meta)
        document.schema = Document.SCHEMA
        if content_hash is None:
            document.schema = Document.SCHEMA_FOLDER
        ingest_document(document, path,
                        role_id=request.authz.id,
                        content_hash=content_hash)
    finally:
        shutil.rmtree(upload_dir)

    # Make sure collection counts are always accurate.
    update_document(document, shallow=True, sync=sync)
    return jsonify({'status': 'ok', 'id': stringify(document.id)}, status=201)
def ingest_upload(collection_id):
    require(request.authz.can(collection_id, request.authz.WRITE))
    sync = get_flag('sync')
    meta, foreign_id = _load_metadata()
    parent_id = _load_parent(collection_id, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        path = None
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
        document = Document.by_keys(collection_id=collection_id,
                                    parent_id=parent_id,
                                    foreign_id=foreign_id,
                                    content_hash=content_hash)
        document.update(meta)
        document.schema = Document.SCHEMA
        if content_hash is None:
            document.schema = Document.SCHEMA_FOLDER
        ingest_document(document, path,
                        role_id=request.authz.id,
                        content_hash=content_hash)
    finally:
        shutil.rmtree(upload_dir)

    if document.collection.casefile:
        # Make sure collection counts are always accurate.
        update_document(document, sync=sync)
    return jsonify({
        'status': 'ok',
        'id': stringify(document.id)
    }, status=201)
def set_filepath(self, file_path):
    file_path = ensure_path(file_path)
    file_name = safe_filename(file_path)
    file_size = file_path.stat().st_size
    self.file_name = file_name
    self.file_size = file_size
    self._file_path = file_path
    self.content_hash = checksum(file_path)
def archive_file(self, file_path, content_hash=None, mime_type=None):
    """Import the given file into the archive."""
    if content_hash is None:
        content_hash = checksum(file_path)
    if content_hash is None:
        return
    if self._locate_key(content_hash):
        return content_hash
    archive_prefix = self._get_prefix(content_hash)
    archive_path = self.path.joinpath(archive_prefix)
    archive_path.mkdir(parents=True, exist_ok=True)
    file_name = safe_filename(file_path, default='data')
    archive_path = archive_path.joinpath(file_name)
    with open(file_path, 'rb') as fin:
        with open(archive_path, 'wb') as fout:
            shutil.copyfileobj(fin, fout, BUF_SIZE)
    return content_hash
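# archive_file() above relies on _get_prefix() and _locate_key(), which are not
# shown here. A minimal sketch of a prefix helper, assuming the common scheme of
# fanning the SHA1 out into nested directories so no single directory grows too
# large; the exact layout is an assumption, not taken from the source.
import os


def _get_prefix_sketch(content_hash):
    """Split a hex digest into a nested path, e.g. 'ab/cd/ef/abcdef...'."""
    if content_hash is None or len(content_hash) < 6:
        raise ValueError("Invalid content hash: %r" % content_hash)
    return os.path.join(content_hash[:2], content_hash[2:4],
                        content_hash[4:6], content_hash)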
def archive_file(self, file_path, content_hash=None, mime_type=None):
    """Store the file located at the given path on S3, based on a path
    made up from its SHA1 content hash."""
    file_path = ensure_path(file_path)
    if content_hash is None:
        content_hash = checksum(file_path)
    # if content_hash is None:
    #     return
    obj = self._locate_key(content_hash)
    if obj is not None:
        return content_hash
    path = '{}/data'.format(self._get_prefix(content_hash))
    extra = {}
    if mime_type is not None:
        extra['ContentType'] = mime_type
    with open(file_path, 'rb') as fh:
        self.client.upload_fileobj(fh, self.bucket, str(path),
                                   ExtraArgs=extra)
    return content_hash
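# The S3 backend checks self._locate_key() to skip re-uploading content that is
# already stored. A minimal sketch of such a lookup using boto3's
# list_objects_v2, assuming the same content-hash prefix scheme as above; the
# method name and prefix layout are assumptions.
def _locate_key_sketch(self, content_hash):
    """Return the first S3 key under the content-hash prefix, or None."""
    if content_hash is None:
        return None
    res = self.client.list_objects_v2(Bucket=self.bucket,
                                      MaxKeys=1,
                                      Prefix=self._get_prefix(content_hash))
    for obj in res.get('Contents', []):
        return obj.get('Key')
    return None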
def archive_file(self, file_path, content_hash=None, mime_type=None):
    """Store the file located at the given path on Google, based on a path
    made up from its SHA1 content hash."""
    file_path = ensure_path(file_path)
    if content_hash is None:
        content_hash = checksum(file_path)
    if content_hash is None:
        return
    file_path = ensure_posix_path(file_path)
    for attempt in service_retries():
        try:
            # blob = self._locate_contenthash(content_hash)
            # if blob is not None:
            #     return content_hash
            path = os.path.join(path_prefix(content_hash), "data")
            blob = Blob(path, self.bucket)
            blob.upload_from_filename(file_path, content_type=mime_type)
            return content_hash
        except FAILURES:
            log.exception("Store error in GS")
            backoff(failures=attempt)
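# The Google Storage backend retries transient failures via service_retries()
# and backoff(). A minimal sketch of such helpers, assuming a bounded attempt
# count and a sleep that grows with consecutive failures; the real helpers may
# behave differently.
import time


def service_retries_sketch(count=5):
    """Yield attempt numbers for a bounded retry loop."""
    return range(count)


def backoff_sketch(failures=0):
    """Sleep a little longer after each consecutive failure."""
    time.sleep(failures + 1)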