def test_tiny_copy(self):
        test_bucket = infra.get_env("DSS_S3_BUCKET_TEST")
        test_src_key = infra.generate_test_key()
        src_data = os.urandom(1024)
        s3_blobstore = Config.get_blobstore_handle(Replica.aws)

        with tempfile.NamedTemporaryFile(delete=True) as fh:
            fh.write(src_data)
            fh.flush()
            fh.seek(0)
            s3_blobstore.upload_file_handle(test_bucket, test_src_key, fh)

        src_etag = s3_blobstore.get_cloud_checksum(test_bucket, test_src_key)

        test_dst_key = infra.generate_test_key()
        state = s3copyclient.copy_sfn_event(test_bucket, test_src_key,
                                            test_bucket, test_dst_key)
        execution_id = str(uuid.uuid4())
        stepfunctions.step_functions_invoke("dss-s3-copy-sfn-{stage}",
                                            execution_id, state)

        self._check_dst_key_etag(test_bucket, test_dst_key, src_etag)
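
The _check_dst_key_etag helper used on the last line is not shown on this page. A minimal sketch, assuming it simply compares the destination object's checksum against the ETag captured before the copy (the real test would also have to wait for the asynchronous step-function copy to finish), might look like this:

def _check_dst_key_etag(self, bucket: str, key: str, expected_etag: str):
    # Hypothetical helper: compare the destination object's checksum with the
    # source ETag.  Omits the polling/retry a real test needs, since the
    # step-function copy above completes asynchronously.
    s3_blobstore = Config.get_blobstore_handle(Replica.aws)
    dst_etag = s3_blobstore.get_cloud_checksum(bucket, key)
    self.assertEqual(expected_etag, dst_etag)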
Example No. 2
def put(json_request_body: dict, replica: str, uuid: str, version: str):
    authenticated_user_email = security.get_token_email(request.token_info)
    collection_body = dict(json_request_body, owner=authenticated_user_email)
    uuid = uuid.lower()
    handle = Config.get_blobstore_handle(Replica[replica])
    collection_body["contents"] = _dedpuplicate_contents(
        collection_body["contents"])
    verify_collection(collection_body["contents"], Replica[replica], handle)
    collection_uuid = uuid if uuid else str(uuid4())
    collection_version = version
    # Update DynamoDB to speed up lookups; no update happens if the owner is already associated with this uuid.
    owner_lookup.put_collection(owner=authenticated_user_email,
                                collection_fqid=str(
                                    CollectionFQID(collection_uuid,
                                                   collection_version)))
    # add the collection file to the bucket
    handle.upload_file_handle(
        Replica[replica].bucket,
        CollectionFQID(collection_uuid, collection_version).to_key(),
        io.BytesIO(json.dumps(collection_body).encode("utf-8")))
    return jsonify(dict(uuid=collection_uuid,
                        version=collection_version)), requests.codes.created
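
The _dedpuplicate_contents helper (the repository's own spelling) is not reproduced on this page. An order-preserving de-duplication along the following lines would satisfy the call above; this is only a sketch, assuming each content entry is a JSON-serializable dict:

def _dedpuplicate_contents(contents: list) -> list:
    # Drop duplicate content references while preserving their original order.
    # Entries are keyed by a canonical JSON serialization so that dicts with
    # the same fields compare equal.
    seen = set()
    deduplicated = []
    for item in contents:
        marker = json.dumps(item, sort_keys=True)
        if marker not in seen:
            seen.add(marker)
            deduplicated.append(item)
    return deduplicated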
Example No. 3
    def setUp(self, rounds=3):
        Config.set_config(BucketConfig.TEST)

        self.test_bucket = infra.get_env("DSS_GS_BUCKET_TEST")
        self.gs_blobstore = Config.get_blobstore_handle(Replica.gcp)
        test_src_keys = [infra.generate_test_key() for _ in range(rounds)]
        final_key = infra.generate_test_key()

        bucket_obj = self.gs_blobstore.gcp_client.bucket(self.test_bucket)

        self.gs_blobstore.upload_file_handle(
            self.test_bucket, test_src_keys[0],
            io.BytesIO(os.urandom(1024 * 1024)))

        for ix in range(len(test_src_keys) - 1):
            src_blob_obj = bucket_obj.get_blob(test_src_keys[ix])
            blobs = [src_blob_obj for _ in range(16)]
            dst_blob_obj = bucket_obj.blob(test_src_keys[ix + 1])

            dst_blob_obj.content_type = "application/octet-stream"
            dst_blob_obj.compose(blobs)

        # Set the storage class to NEARLINE.
        # NOTE: compose(…) does not seem to support setting a storage class.  The canonical way of changing the
        # storage class is to call update_storage_class(…), but Google's library does not seem to handle
        # update_storage_class(…) calls for large objects.
        final_blob_obj = bucket_obj.blob(final_key)
        final_blob_obj.storage_class = "NEARLINE"
        final_blob_src = bucket_obj.get_blob(test_src_keys[-1])
        token = None
        while True:
            result = final_blob_obj.rewrite(final_blob_src, token=token)
            if result[0] is None:
                # done!
                break
            token = result[0]

        self.src_key = final_key
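
The loop at the end drives google-cloud-storage's Blob.rewrite, which copies large objects in chunks and returns a (token, bytes_rewritten, total_bytes) tuple; the call must be repeated with the returned token until it comes back as None. Isolated from the test fixture, the pattern looks roughly like this:

def rewrite_until_done(dst_blob, src_blob):
    # Blob.rewrite only copies part of a large object per call; keep feeding
    # the returned token back in until the rewrite reports completion.
    token = None
    while True:
        token, bytes_rewritten, total_bytes = dst_blob.rewrite(src_blob, token=token)
        if token is None:
            break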
Example No. 4
 def _read_file_infos(cls, replica: Replica, fqid: BundleFQID,
                      manifest: JSON) -> List[Tuple[str, JSON]]:
     handle = Config.get_blobstore_handle(replica)
     index_files: List[Tuple[str, JSON]] = list()
     file_infos = manifest[BundleMetadata.FILES]
     assert isinstance(file_infos, list)
     for file_info in file_infos:
         if file_info[BundleFileMetadata.INDEXED]:
             file_name = file_info[BundleFileMetadata.NAME]
             content_type = file_info[BundleFileMetadata.CONTENT_TYPE]
             if content_type.startswith('application/json'):
                 file_blob_key = compose_blob_key(file_info)
                 try:
                     file_string = handle.get(replica.bucket,
                                              file_blob_key).decode("utf-8")
                 except BlobStoreError as ex:
                     raise RuntimeError(
                         f"{ex} This bundle will not be indexed. Bundle: {fqid}, File Blob Key: "
                         f"{file_blob_key}, File Name: '{file_name}'"
                     ) from ex
                 try:
                     file_json = json.loads(file_string)
                     # TODO (mbaumann) Are there other JSON-related exceptions that should be checked below?
                 except json.decoder.JSONDecodeError as ex:
                     logger.warning(
                         f"In bundle {fqid} the file '{file_name}' is marked for indexing yet could "
                         f"not be parsed. This file will not be indexed. Exception: {ex}"
                     )
                 else:
                     logger.debug(f"Loaded file: {file_name}")
                     index_files.append((file_name, file_json))
             else:
                 logger.warning(
                     f"In bundle {fqid} the file '{file_name}' is marked for indexing yet has "
                     f"content type '{content_type}' instead of the required content type "
                     f"'application/json'. This file will not be indexed.")
     return index_files
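
compose_blob_key maps a file manifest entry to the storage key of its backing blob. Judging from the blob path assembled in get_bundle_from_bucket further down this page (Example No. 9), the key presumably combines the four checksums; a sketch under that assumption:

def compose_blob_key(file_info: JSON) -> str:
    # Assumed layout, mirroring the "blobs/{sha256}.{sha1}.{s3_etag}.{crc32c}"
    # path built in get_bundle_from_bucket below.
    return "blobs/{}.{}.{}.{}".format(
        file_info[BundleFileMetadata.SHA256],
        file_info[BundleFileMetadata.SHA1],
        file_info[BundleFileMetadata.S3_ETAG],
        file_info[BundleFileMetadata.CRC32C],
    )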
Example No. 5
def get_tombstoned_bundles(replica: Replica,
                           tombstone_key: str) -> typing.Iterator[str]:
    """
    Return the bundle fqid(s) associated with a versioned or unversioned tombstone, as verified on object storage.
    Note that an unversioned tombstone yields only the keys of bundle versions that were not already covered by a
    versioned tombstone, as shown in the example below.

    bundles/uuid.version1
    bundles/uuid.version2
    bundles/uuid.version2.dead
    bundles/uuid.version3
    bundles/uuid.version3.dead
    bundles/uuid.dead

    For the above listing:
        `get_tombstoned_bundles(replica, bundles/uuid.version2.dead)` -> `[bundles/uuid.version2]`
        `get_tombstoned_bundles(replica, bundles/uuid.dead)` -> `[bundles/uuid.version1]`
    """
    handle = Config.get_blobstore_handle(replica)
    if DSS_VERSIONED_BUNDLE_TOMBSTONE_KEY_REGEX.match(tombstone_key):
        pfx = tombstone_key.split(f".{TOMBSTONE_SUFFIX}")[0]
        prev_key = ""
        for key in handle.list(replica.bucket, pfx):
            if key == f"{prev_key}.{TOMBSTONE_SUFFIX}":
                yield prev_key
            prev_key = key
    elif DSS_UNVERSIONED_BUNDLE_TOMBSTONE_KEY_REGEX.match(tombstone_key):
        pfx = tombstone_key.split(f".{TOMBSTONE_SUFFIX}")[0]
        prev_key = ""
        for key in handle.list(replica.bucket, pfx):
            if key != f"{prev_key}.{TOMBSTONE_SUFFIX}" and not prev_key.endswith(
                    TOMBSTONE_SUFFIX):
                if prev_key:
                    yield prev_key
            prev_key = key
    else:
        raise ValueError(f"{tombstone_key} is not a tombstone key")
Example No. 6
 def load(cls, replica: Replica, tombstone_id: BundleTombstoneID):
     blobstore = Config.get_blobstore_handle(replica)
     bucket_name = replica.bucket
     body = json.loads(blobstore.get(bucket_name, tombstone_id.to_key()))
     self = cls(replica, tombstone_id, body)
     return self
Example No. 7
def dependencies_exist(source_replica: Replica, dest_replica: Replica,
                       key: str):
    """
    Given a source replica and manifest key, checks if all dependencies of the corresponding DSS object are present in
    dest_replica:
     - Given a file manifest key, checks if blobs exist in dest_replica.
     - Given a bundle manifest key, checks if file manifests exist in dest_replica.
     - Given a collection key, checks if all collection contents exist in dest_replica.
    Returns true if all dependencies exist in dest_replica, false otherwise.
    """
    source_handle = Config.get_blobstore_handle(source_replica)
    dest_handle = Config.get_blobstore_handle(dest_replica)
    if key.endswith(TOMBSTONE_SUFFIX):
        return True
    elif key.startswith(FILE_PREFIX):
        file_id = FileFQID.from_key(key)
        file_manifest = get_json_metadata(
            entity_type="file",
            uuid=file_id.uuid,
            version=file_id.version,
            replica=source_replica,
            blobstore_handle=source_handle,
            max_metadata_size=max_syncable_metadata_size)
        blob_path = compose_blob_key(file_manifest)
        if exists(dest_replica, blob_path):
            return True
    elif key.startswith(BUNDLE_PREFIX):
        # head all file manifests
        bundle_id = BundleFQID.from_key(key)
        bundle_manifest = get_json_metadata(
            entity_type="bundle",
            uuid=bundle_id.uuid,
            version=bundle_id.version,
            replica=source_replica,
            blobstore_handle=source_handle,
            max_metadata_size=max_syncable_metadata_size)
        try:
            with ThreadPoolExecutor(max_workers=20) as e:
                futures = list()
                for file in bundle_manifest[BundleMetadata.FILES]:
                    file_uuid = file[BundleFileMetadata.UUID]
                    file_version = file[BundleFileMetadata.VERSION]
                    futures.append(
                        e.submit(get_json_metadata,
                                 entity_type="file",
                                 uuid=file_uuid,
                                 version=file_version,
                                 replica=dest_replica,
                                 blobstore_handle=dest_handle,
                                 max_metadata_size=max_syncable_metadata_size))
                for future in as_completed(futures):
                    future.result()
            return True
        except Exception:
            pass
    elif key.startswith(COLLECTION_PREFIX):
        collection_id = CollectionFQID.from_key(key)
        collection_manifest = get_json_metadata(
            entity_type="collection",
            uuid=collection_id.uuid,
            version=collection_id.version,
            replica=source_replica,
            blobstore_handle=source_handle,
            max_metadata_size=max_syncable_metadata_size)
        try:
            verify_collection(contents=collection_manifest["contents"],
                              replica=dest_replica,
                              blobstore_handle=dest_handle)
            return True
        except Exception:
            pass
    else:
        raise NotImplementedError("Unknown prefix for key {}".format(key))
    return False
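
A replication worker would typically consult this check before (re)copying an object, roughly along these lines (the key and the retry handling are illustrative):

# Illustrative usage: only sync a key once everything it references is already
# present in the destination replica; otherwise leave it for a later retry.
key = "bundles/some-uuid.some-version"
if dependencies_exist(Replica.aws, Replica.gcp, key):
    pass  # safe to copy `key` from the source to the destination replica
else:
    pass  # dependencies still missing; re-queue the key and try again later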
Example No. 8
def _list_prefix(replica: Replica, prefix: str):
    handle = Config.get_blobstore_handle(replica)
    return [object_key for object_key in handle.list(replica.bucket, prefix)]
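
This is just a convenience wrapper that materializes the lazy listing into a list, for example:

# Illustrative call: every bundle manifest key currently in the replica's bucket.
bundle_keys = _list_prefix(Replica.aws, "bundles/")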
Example No. 9
def get_bundle_from_bucket(uuid: str,
                           replica: Replica,
                           version: typing.Optional[str],
                           bucket: typing.Optional[str],
                           directurls: bool = False):
    uuid = uuid.lower()

    handle = Config.get_blobstore_handle(replica)
    default_bucket = replica.bucket

    # need the ability to use fixture bucket for testing
    bucket = default_bucket if bucket is None else bucket

    def tombstone_exists(uuid: str, version: typing.Optional[str]):
        return test_object_exists(
            handle, bucket,
            TombstoneID(uuid=uuid, version=version).to_key())

    # handle the following deletion cases
    # 1. the whole bundle is deleted
    # 2. the specific version of the bundle is deleted
    if tombstone_exists(uuid, None) or (version
                                        and tombstone_exists(uuid, version)):
        raise DSSException(404, "not_found", "EMPTY Cannot find file!")

    # handle the following deletion case
    # 3. no version is specified, we want the latest _non-deleted_ version
    if version is None:
        # list the files and find the one that is the most recent.
        prefix = f"bundles/{uuid}."
        object_names = handle.list(bucket, prefix)
        version = _latest_version_from_object_names(object_names)

    if version is None:
        # no matches!
        raise DSSException(404, "not_found", "Cannot find file!")

    bundle_fqid = BundleFQID(uuid=uuid, version=version)

    # retrieve the bundle metadata.
    try:
        bundle_metadata = json.loads(
            handle.get(
                bucket,
                bundle_fqid.to_key(),
            ).decode("utf-8"))
    except BlobNotFoundError:
        raise DSSException(404, "not_found", "Cannot find file!")

    filesresponse = []  # type: typing.List[dict]
    for file in bundle_metadata[BundleMetadata.FILES]:
        file_version = {
            'name': file[BundleFileMetadata.NAME],
            'content-type': file[BundleFileMetadata.CONTENT_TYPE],
            'size': file[BundleFileMetadata.SIZE],
            'uuid': file[BundleFileMetadata.UUID],
            'version': file[BundleFileMetadata.VERSION],
            'crc32c': file[BundleFileMetadata.CRC32C],
            's3_etag': file[BundleFileMetadata.S3_ETAG],
            'sha1': file[BundleFileMetadata.SHA1],
            'sha256': file[BundleFileMetadata.SHA256],
            'indexed': file[BundleFileMetadata.INDEXED],
        }
        if directurls:
            file_version['url'] = str(UrlBuilder().set(
                scheme=replica.storage_schema,
                netloc=bucket,
                path="blobs/{}.{}.{}.{}".format(
                    file[BundleFileMetadata.SHA256],
                    file[BundleFileMetadata.SHA1],
                    file[BundleFileMetadata.S3_ETAG],
                    file[BundleFileMetadata.CRC32C],
                ),
            ))
        filesresponse.append(file_version)

    return dict(bundle=dict(
        uuid=uuid,
        version=version,
        files=filesresponse,
        creator_uid=bundle_metadata[BundleMetadata.CREATOR_UID],
    ))
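
The _latest_version_from_object_names helper is not shown on this page. One plausible sketch, assuming keys of the form "bundles/{uuid}.{version}" with lexicographically sortable version timestamps, and ignoring any tombstone markers that appear in the listing, is:

def _latest_version_from_object_names(object_names) -> typing.Optional[str]:
    # Hypothetical helper: pick the newest version among "bundles/{uuid}.{version}"
    # keys, skipping tombstone markers.  Assumes DSS versions sort lexicographically.
    latest = None
    for name in object_names:
        if name.endswith(f".{TOMBSTONE_SUFFIX}"):
            continue
        _, _, version = name.partition(".")  # the uuid contains no dots, so this strips "bundles/{uuid}."
        if latest is None or version > latest:
            latest = version
    return latest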
Example No. 10
 def __init__(self, argv: typing.List[str], args: argparse.Namespace):
     self.keys = args.keys.copy()
     self.replica = Replica[args.replica]
     self.handle = Config.get_blobstore_handle(self.replica)
     self.checkout_bucket = self.replica.checkout_bucket