def test_tiny_copy(self):
    test_bucket = infra.get_env("DSS_S3_BUCKET_TEST")
    test_src_key = infra.generate_test_key()
    src_data = os.urandom(1024)
    s3_blobstore = Config.get_blobstore_handle(Replica.aws)
    with tempfile.NamedTemporaryFile(delete=True) as fh:
        fh.write(src_data)
        fh.flush()
        fh.seek(0)
        s3_blobstore.upload_file_handle(test_bucket, test_src_key, fh)
    src_etag = s3_blobstore.get_cloud_checksum(test_bucket, test_src_key)
    test_dst_key = infra.generate_test_key()

    state = s3copyclient.copy_sfn_event(test_bucket, test_src_key, test_bucket, test_dst_key)
    execution_id = str(uuid.uuid4())
    stepfunctions.step_functions_invoke("dss-s3-copy-sfn-{stage}", execution_id, state)

    self._check_dst_key_etag(test_bucket, test_dst_key, src_etag)
def put(json_request_body: dict, replica: str, uuid: str, version: str):
    authenticated_user_email = security.get_token_email(request.token_info)

    collection_body = dict(json_request_body, owner=authenticated_user_email)
    uuid = uuid.lower()
    handle = Config.get_blobstore_handle(Replica[replica])
    collection_body["contents"] = _dedpuplicate_contents(collection_body["contents"])
    verify_collection(collection_body["contents"], Replica[replica], handle)
    collection_uuid = uuid if uuid else str(uuid4())
    collection_version = version

    # update DynamoDB; used to speed up lookup time; will not update if owner already associated w/uuid
    owner_lookup.put_collection(owner=authenticated_user_email,
                                collection_fqid=str(CollectionFQID(collection_uuid, collection_version)))

    # add the collection file to the bucket
    handle.upload_file_handle(Replica[replica].bucket,
                              CollectionFQID(collection_uuid, collection_version).to_key(),
                              io.BytesIO(json.dumps(collection_body).encode("utf-8")))
    return jsonify(dict(uuid=collection_uuid, version=collection_version)), requests.codes.created
def setUp(self, rounds=3):
    Config.set_config(BucketConfig.TEST)
    self.test_bucket = infra.get_env("DSS_GS_BUCKET_TEST")
    self.gs_blobstore = Config.get_blobstore_handle(Replica.gcp)

    test_src_keys = [infra.generate_test_key() for _ in range(rounds)]
    final_key = infra.generate_test_key()
    bucket_obj = self.gs_blobstore.gcp_client.bucket(self.test_bucket)

    self.gs_blobstore.upload_file_handle(
        self.test_bucket,
        test_src_keys[0],
        io.BytesIO(os.urandom(1024 * 1024)))

    # each round composes 16 copies of the previous object into the next key, growing the object 16x per round
    for ix in range(len(test_src_keys) - 1):
        src_blob_obj = bucket_obj.get_blob(test_src_keys[ix])
        blobs = [src_blob_obj for _ in range(16)]
        dst_blob_obj = bucket_obj.blob(test_src_keys[ix + 1])
        dst_blob_obj.content_type = "application/octet-stream"
        dst_blob_obj.compose(blobs)

    # set the storage class to nearline.
    # NOTE: compose(…) does not seem to support setting a storage class.  The canonical way of changing storage
    # class is to call update_storage_class(…), but Google's libraries do not seem to handle
    # update_storage_class(…) calls for large objects.
    final_blob_obj = bucket_obj.blob(final_key)
    final_blob_obj.storage_class = "NEARLINE"
    final_blob_src = bucket_obj.get_blob(test_src_keys[-1])
    token = None
    while True:
        result = final_blob_obj.rewrite(final_blob_src, token=token)
        if result[0] is None:
            # done!
            break
        token = result[0]

    self.src_key = final_key
def _read_file_infos(cls, replica: Replica, fqid: BundleFQID, manifest: JSON) -> List[Tuple[str, JSON]]:
    handle = Config.get_blobstore_handle(replica)
    index_files: List[Tuple[str, JSON]] = list()
    file_infos = manifest[BundleMetadata.FILES]
    assert isinstance(file_infos, list)
    for file_info in file_infos:
        if file_info[BundleFileMetadata.INDEXED]:
            file_name = file_info[BundleFileMetadata.NAME]
            content_type = file_info[BundleFileMetadata.CONTENT_TYPE]
            if content_type.startswith('application/json'):
                file_blob_key = compose_blob_key(file_info)
                try:
                    file_string = handle.get(replica.bucket, file_blob_key).decode("utf-8")
                except BlobStoreError as ex:
                    raise RuntimeError(
                        f"{ex} This bundle will not be indexed. Bundle: {fqid}, File Blob Key: "
                        f"{file_blob_key}, File Name: '{file_name}'"
                    ) from ex
                try:
                    file_json = json.loads(file_string)
                # TODO (mbaumann) Are there other JSON-related exceptions that should be checked below?
                except json.decoder.JSONDecodeError as ex:
                    logger.warning(
                        f"In bundle {fqid} the file '{file_name}' is marked for indexing yet could "
                        f"not be parsed. This file will not be indexed. Exception: {ex}"
                    )
                else:
                    logger.debug(f"Loaded file: {file_name}")
                    index_files.append((file_name, file_json))
            else:
                logger.warning(
                    f"In bundle {fqid} the file '{file_name}' is marked for indexing yet has "
                    f"content type '{content_type}' instead of the required content type "
                    f"'application/json'. This file will not be indexed.")
    return index_files
def get_tombstoned_bundles(replica: Replica, tombstone_key: str) -> typing.Iterator[str]:
    """
    Return the bundle fqid(s) associated with a versioned or unversioned tombstone, as verified on object storage.
    Note that an unversioned tombstone returns keys associated with bundles not previously tombstoned, as shown
    in the example below.

        bundles/uuid.version1
        bundles/uuid.version2
        bundles/uuid.version2.dead
        bundles/uuid.version3
        bundles/uuid.version3.dead
        bundles/uuid.dead

    For the above listing:
        `get_tombstoned_bundles(replica, bundles/uuid.version2.dead)` -> `[bundles/uuid.version2]`
        `get_tombstoned_bundles(replica, bundles/uuid.dead)` -> `[bundles/uuid.version1]`
    """
    handle = Config.get_blobstore_handle(replica)
    if DSS_VERSIONED_BUNDLE_TOMBSTONE_KEY_REGEX.match(tombstone_key):
        pfx = tombstone_key.split(f".{TOMBSTONE_SUFFIX}")[0]
        prev_key = ""
        for key in handle.list(replica.bucket, pfx):
            if key == f"{prev_key}.{TOMBSTONE_SUFFIX}":
                yield prev_key
            prev_key = key
    elif DSS_UNVERSIONED_BUNDLE_TOMBSTONE_KEY_REGEX.match(tombstone_key):
        pfx = tombstone_key.split(f".{TOMBSTONE_SUFFIX}")[0]
        prev_key = ""
        for key in handle.list(replica.bucket, pfx):
            if key != f"{prev_key}.{TOMBSTONE_SUFFIX}" and not prev_key.endswith(TOMBSTONE_SUFFIX):
                if prev_key:
                    yield prev_key
            prev_key = key
    else:
        raise ValueError(f"{tombstone_key} is not a tombstone key")
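# Usage sketch for get_tombstoned_bundles() (not from the source; the replica and tombstone key below are
# hypothetical placeholders): the generator yields the bundle keys covered by the given tombstone, which a
# caller can then, for example, remove from a search index.
#
#     for bundle_key in get_tombstoned_bundles(Replica.aws, "bundles/uuid.version2.dead"):
#         logger.info("tombstoned bundle: %s", bundle_key)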
def load(cls, replica: Replica, tombstone_id: BundleTombstoneID):
    blobstore = Config.get_blobstore_handle(replica)
    bucket_name = replica.bucket
    body = json.loads(blobstore.get(bucket_name, tombstone_id.to_key()))
    self = cls(replica, tombstone_id, body)
    return self
def dependencies_exist(source_replica: Replica, dest_replica: Replica, key: str):
    """
    Given a source replica and manifest key, checks if all dependencies of the corresponding DSS object are
    present in dest_replica:
     - Given a file manifest key, checks if blobs exist in dest_replica.
     - Given a bundle manifest key, checks if file manifests exist in dest_replica.
     - Given a collection key, checks if all collection contents exist in dest_replica.
    Returns True if all dependencies exist in dest_replica, False otherwise.
    """
    source_handle = Config.get_blobstore_handle(source_replica)
    dest_handle = Config.get_blobstore_handle(dest_replica)
    if key.endswith(TOMBSTONE_SUFFIX):
        return True
    elif key.startswith(FILE_PREFIX):
        file_id = FileFQID.from_key(key)
        file_manifest = get_json_metadata(entity_type="file",
                                          uuid=file_id.uuid,
                                          version=file_id.version,
                                          replica=source_replica,
                                          blobstore_handle=source_handle,
                                          max_metadata_size=max_syncable_metadata_size)
        blob_path = compose_blob_key(file_manifest)
        if exists(dest_replica, blob_path):
            return True
    elif key.startswith(BUNDLE_PREFIX):
        # head all file manifests
        bundle_id = BundleFQID.from_key(key)
        bundle_manifest = get_json_metadata(entity_type="bundle",
                                            uuid=bundle_id.uuid,
                                            version=bundle_id.version,
                                            replica=source_replica,
                                            blobstore_handle=source_handle,
                                            max_metadata_size=max_syncable_metadata_size)
        try:
            with ThreadPoolExecutor(max_workers=20) as e:
                futures = list()
                for file in bundle_manifest[BundleMetadata.FILES]:
                    file_uuid = file[BundleFileMetadata.UUID]
                    file_version = file[BundleFileMetadata.VERSION]
                    futures.append(e.submit(get_json_metadata,
                                            entity_type="file",
                                            uuid=file_uuid,
                                            version=file_version,
                                            replica=dest_replica,
                                            blobstore_handle=dest_handle,
                                            max_metadata_size=max_syncable_metadata_size))
                for future in as_completed(futures):
                    future.result()
            return True
        except Exception:
            pass
    elif key.startswith(COLLECTION_PREFIX):
        collection_id = CollectionFQID.from_key(key)
        collection_manifest = get_json_metadata(entity_type="collection",
                                                uuid=collection_id.uuid,
                                                version=collection_id.version,
                                                replica=source_replica,
                                                blobstore_handle=source_handle,
                                                max_metadata_size=max_syncable_metadata_size)
        try:
            verify_collection(contents=collection_manifest["contents"],
                              replica=dest_replica,
                              blobstore_handle=dest_handle)
            return True
        except Exception:
            pass
    else:
        raise NotImplementedError("Unknown prefix for key {}".format(key))
    return False
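# Usage sketch for dependencies_exist() (not from the source; the replicas, key, and helper below are
# hypothetical): a sync step could call it before replicating an object, deferring the copy until all of
# the object's dependencies are already present in the destination replica.
#
#     if dependencies_exist(Replica.aws, Replica.gcp, "bundles/uuid.version"):
#         copy_object_to_destination("bundles/uuid.version")  # hypothetical copy helper
#     else:
#         pass  # re-enqueue the key and try again later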
def _list_prefix(replica: Replica, prefix: str):
    handle = Config.get_blobstore_handle(replica)
    return [object_key for object_key in handle.list(replica.bucket, prefix)]
def get_bundle_from_bucket(uuid: str,
                           replica: Replica,
                           version: typing.Optional[str],
                           bucket: typing.Optional[str],
                           directurls: bool = False):
    uuid = uuid.lower()

    handle = Config.get_blobstore_handle(replica)
    default_bucket = replica.bucket

    # need the ability to use fixture bucket for testing
    bucket = default_bucket if bucket is None else bucket

    def tombstone_exists(uuid: str, version: typing.Optional[str]):
        return test_object_exists(handle, bucket, TombstoneID(uuid=uuid, version=version).to_key())

    # handle the following deletion cases
    # 1. the whole bundle is deleted
    # 2. the specific version of the bundle is deleted
    if tombstone_exists(uuid, None) or (version and tombstone_exists(uuid, version)):
        raise DSSException(404, "not_found", "EMPTY Cannot find file!")

    # handle the following deletion case
    # 3. no version is specified, we want the latest _non-deleted_ version
    if version is None:
        # list the files and find the one that is the most recent.
        prefix = f"bundles/{uuid}."
        object_names = handle.list(bucket, prefix)
        version = _latest_version_from_object_names(object_names)

    if version is None:
        # no matches!
        raise DSSException(404, "not_found", "Cannot find file!")

    bundle_fqid = BundleFQID(uuid=uuid, version=version)

    # retrieve the bundle metadata.
    try:
        bundle_metadata = json.loads(
            handle.get(
                bucket,
                bundle_fqid.to_key(),
            ).decode("utf-8"))
    except BlobNotFoundError:
        raise DSSException(404, "not_found", "Cannot find file!")

    filesresponse = []  # type: typing.List[dict]

    for file in bundle_metadata[BundleMetadata.FILES]:
        file_version = {
            'name': file[BundleFileMetadata.NAME],
            'content-type': file[BundleFileMetadata.CONTENT_TYPE],
            'size': file[BundleFileMetadata.SIZE],
            'uuid': file[BundleFileMetadata.UUID],
            'version': file[BundleFileMetadata.VERSION],
            'crc32c': file[BundleFileMetadata.CRC32C],
            's3_etag': file[BundleFileMetadata.S3_ETAG],
            'sha1': file[BundleFileMetadata.SHA1],
            'sha256': file[BundleFileMetadata.SHA256],
            'indexed': file[BundleFileMetadata.INDEXED],
        }
        if directurls:
            file_version['url'] = str(UrlBuilder().set(
                scheme=replica.storage_schema,
                netloc=bucket,
                path="blobs/{}.{}.{}.{}".format(
                    file[BundleFileMetadata.SHA256],
                    file[BundleFileMetadata.SHA1],
                    file[BundleFileMetadata.S3_ETAG],
                    file[BundleFileMetadata.CRC32C],
                ),
            ))
        filesresponse.append(file_version)

    return dict(
        bundle=dict(
            uuid=uuid,
            version=version,
            files=filesresponse,
            creator_uid=bundle_metadata[BundleMetadata.CREATOR_UID],
        )
    )
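# Sketch of the return shape of get_bundle_from_bucket(), as built by the code above (values are placeholders;
# the "url" entry is only present when directurls=True):
#
#     {
#         "bundle": {
#             "uuid": "...",
#             "version": "...",
#             "files": [
#                 {"name": "...", "content-type": "...", "size": 0, "uuid": "...", "version": "...",
#                  "crc32c": "...", "s3_etag": "...", "sha1": "...", "sha256": "...", "indexed": False},
#             ],
#             "creator_uid": 0,
#         }
#     }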
def __init__(self, argv: typing.List[str], args: argparse.Namespace):
    self.keys = args.keys.copy()
    self.replica = Replica[args.replica]
    self.handle = Config.get_blobstore_handle(self.replica)
    self.checkout_bucket = self.replica.checkout_bucket