Example #1
def touch_test_file(replica: Replica, dst_bucket: str) -> bool:
    """
    Write a test file into the specified bucket.
    :param dst_bucket: the bucket to be checked.
    :param replica: the replica to execute the checkout in.
    :return: True if the bucket is writable; raises DestinationBucketNotWritableError otherwise.
    """
    randomizer = ''.join(choices(hexdigits, k=2))
    # Spreading the touch test file across a larger key range prevents hitting modification rate limits.
    test_object = f"touch/{randomizer}.txt"
    handle = Config.get_blobstore_handle(replica)

    try:
        handle.upload_file_handle(
            dst_bucket,
            test_object,
            io.BytesIO(b""))
        return True
    except Exception as ex:
        raise DestinationBucketNotWritableError(ex)
    finally:
        try:
            Config.get_blobstore_handle(replica).delete(dst_bucket, test_object)
        except Exception:
            pass
Example #2
def validate_file_dst(dst_bucket: str, dst_key: str, replica: Replica):
    try:
        Config.get_blobstore_handle(replica).get_user_metadata(
            dst_bucket, dst_key)
        return True
    except (BlobNotFoundError, BlobStoreUnknownError):
        return False
Example #3
def touch_test_file(dst_bucket: str, replica: Replica) -> bool:
    """
    Write a test file into the specified bucket.
    :param dst_bucket: the bucket to be checked.
    :param replica: the replica to execute the check in.
    :return: True if the test file could be written and deleted, False otherwise.
    """
    test_object = "touch.txt"
    handle = Config.get_blobstore_handle(replica)

    try:
        handle.upload_file_handle(dst_bucket, test_object, io.BytesIO(b""))
        Config.get_blobstore_handle(replica).delete(dst_bucket, test_object)
        return True
    except Exception:
        return False
Example #4
def get_helper(uuid: str, replica: Replica, version: str = None):
    handle = Config.get_blobstore_handle(replica)
    bucket = replica.bucket

    if version is None:
        # list the files and find the one that is the most recent.
        prefix = "files/{}.".format(uuid)
        for matching_file in handle.list(bucket, prefix):
            matching_file = matching_file[len(prefix):]
            if version is None or matching_file > version:
                version = matching_file

    if version is None:
        # no matches!
        raise DSSException(404, "not_found", "Cannot find file!")

    # retrieve the file metadata.
    try:
        file_metadata = json.loads(
            handle.get(bucket, "files/{}.{}".format(uuid,
                                                    version)).decode("utf-8"))
    except BlobNotFoundError as ex:
        raise DSSException(404, "not_found", "Cannot find file!")

    blob_path = "blobs/" + ".".join((
        file_metadata[FileMetadata.SHA256],
        file_metadata[FileMetadata.SHA1],
        file_metadata[FileMetadata.S3_ETAG],
        file_metadata[FileMetadata.CRC32C],
    ))

    if request.method == "GET":
        """
        Probabilistically return "Retry-After" header
        The retry-after interval can be relatively short now, but it sets up downstream
        libraries / users for success when we start integrating this with the checkout service.
        """
        if random.randint(0, 100) < REDIRECT_PROBABILITY_PERCENTS:
            response = redirect(request.url, code=301)
            headers = response.headers
            headers['Retry-After'] = RETRY_AFTER_INTERVAL
            return response

        response = redirect(
            handle.generate_presigned_GET_url(bucket, blob_path))
    else:
        response = make_response('', 200)

    headers = response.headers
    headers['X-DSS-BUNDLE-UUID'] = file_metadata[FileMetadata.BUNDLE_UUID]
    headers['X-DSS-CREATOR-UID'] = file_metadata[FileMetadata.CREATOR_UID]
    headers['X-DSS-VERSION'] = version
    headers['X-DSS-CONTENT-TYPE'] = file_metadata[FileMetadata.CONTENT_TYPE]
    headers['X-DSS-SIZE'] = file_metadata[FileMetadata.SIZE]
    headers['X-DSS-CRC32C'] = file_metadata[FileMetadata.CRC32C]
    headers['X-DSS-S3-ETAG'] = file_metadata[FileMetadata.S3_ETAG]
    headers['X-DSS-SHA1'] = file_metadata[FileMetadata.SHA1]
    headers['X-DSS-SHA256'] = file_metadata[FileMetadata.SHA256]

    return response
Example #5
    def _test_bundle_get_directaccess(self, replica: Replica):
        schema = replica.storage_schema

        bundle_uuid = "011c7340-9b3c-4d62-bf49-090d79daf198"
        version = "2017-06-20T214506.766634Z"

        url = str(UrlBuilder()
                  .set(path="/v1/bundles/" + bundle_uuid)
                  .add_query("replica", replica.name)
                  .add_query("version", version)
                  .add_query("directurls", "true"))

        with override_bucket_config(BucketConfig.TEST_FIXTURE):
            resp_obj = self.assertGetResponse(
                url,
                requests.codes.ok)

        url = resp_obj.json['bundle']['files'][0]['url']
        splitted = urllib.parse.urlparse(url)
        self.assertEqual(splitted.scheme, schema)
        bucket = splitted.netloc
        key = splitted.path[1:]  # ignore the / part of the path.

        handle = Config.get_blobstore_handle(replica)
        contents = handle.get(bucket, key)

        hasher = hashlib.sha1()
        hasher.update(contents)
        sha1 = hasher.hexdigest()
        self.assertEqual(sha1, "2b8b815229aa8a61e483fb4ba0588b8b6c491890")
Example #6
def record(argv: typing.List[str], args: argparse.Namespace):
    """
    Record events for `keys` into flashflood prefix `prefix`
    If `keys` is omitted, record an event for each bundle in `replica` via lambda forwarding.
    """
    replica = Replica[args.replica]
    job_id = args.job_id or f"{uuid4()}"
    cmd_template = (f"events record --job-id {job_id} "
                    f"--prefix {args.prefix} "
                    f"--replica {replica.name} "
                    f"--keys {{keys}}")

    if args.keys is None:
        start_time = datetime.now()

        def forward_keys(bundle_fqids):
            with SQSMessenger(command_queue_url) as sqsm:
                for fqid in bundle_fqids:
                    sqsm.send(cmd_template.format(keys=f"bundles/{fqid}"))

        handle = Config.get_blobstore_handle(replica)
        with ThreadPoolExecutor(max_workers=4) as e:
            for c in set(hexdigits.lower()):
                bundle_fqids = Living(handle.list_v2(replica.bucket, f"bundles/{c}"))
                e.submit(forward_keys, bundle_fqids)
        monitor_logs(logs, job_id, start_time)
    else:
        for key in args.keys:
            msg = json.dumps(dict(action="record event", job_id=job_id, replica=replica.name, key=key))
            record_event_for_bundle(Replica[args.replica], key, (args.prefix,), use_version_for_timestamp=True)
            print(msg)
Example #7
def mark_bundle_checkout_started(execution_id: str, replica: Replica,
                                 sts_bucket: str):
    handle = Config.get_blobstore_handle(replica)
    data = {_STATUS_KEY: "RUNNING"}
    handle.upload_file_handle(sts_bucket,
                              _bundle_checkout_status_key(execution_id),
                              io.BytesIO(json.dumps(data).encode("utf-8")))
Example #8
    def _walk(self) -> None:
        """
        Subclasses should not typically implement this method, which includes logic specific to calling
        self.process_item(*args) on each blob visited.
        """

        start_time = time()

        handle = Config.get_blobstore_handle(Replica[self.replica])

        blobs = handle.list_v2(
            self.bucket,
            prefix=self.work_id,
            start_after_key=self.marker,  # type: ignore  # Cannot determine type of 'marker'
            token=self.token  # type: ignore  # Cannot determine type of 'token'
        )

        for key in blobs:
            if 250 < time() - start_time:
                break
            self.process_item(key)
            self.marker = blobs.start_after_key
            self.token = blobs.token
        else:
            self._status = WalkerStatus.finished.name
Example #9
    def _test_bundle_delete(self, replica: Replica, fixtures_bucket: str, authorized: bool):
        schema = replica.storage_schema

        # prep existing bundle
        bundle_uuid = str(uuid.uuid4())
        file_uuid = str(uuid.uuid4())
        resp_obj = self.upload_file_wait(
            f"{schema}://{fixtures_bucket}/test_good_source_data/0",
            replica,
            file_uuid,
            bundle_uuid=bundle_uuid,
        )
        file_version = resp_obj.json['version']

        bundle_version = datetime_to_version_format(datetime.datetime.utcnow())
        self.put_bundle(
            replica,
            bundle_uuid,
            [(file_uuid, file_version, "LICENSE")],
            bundle_version,
        )

        handle = Config.get_blobstore_handle(replica)
        bucket = replica.bucket

        self.delete_bundle(replica, bundle_uuid, authorized=authorized)
        tombstone_exists = test_object_exists(handle, bucket, f"bundles/{bundle_uuid}.dead")
        self.assertEquals(tombstone_exists, authorized)

        self.delete_bundle(replica, bundle_uuid, bundle_version, authorized=authorized)
        tombstone_exists = test_object_exists(handle, bucket, f"bundles/{bundle_uuid}.{bundle_version}.dead")
        self.assertEquals(tombstone_exists, authorized)
Example #10
    def job_finalize(self):
        super().job_finalize()
        handle = Config.get_blobstore_handle(Replica[self.replica])
        listed_keys = handle.list(self.bucket, prefix=self.prefix)
        k_listed = sum(1 for _ in listed_keys)
        assert self.work_result == k_listed, f'Integration test failed: {self.work_result} != {k_listed}'
        logger.info(
            f"Integration test passed for {self.replica} with {k_listed} key(s) listed"
        )
Example #11
def validate_dst_bucket(dst_bucket: str,
                        replica: Replica) -> typing.Tuple[ValidationEnum, str]:
    if not Config.get_blobstore_handle(replica).check_bucket_exists(
            dst_bucket):
        return ValidationEnum.WRONG_DST_BUCKET, f"Bucket {dst_bucket} doesn't exist"
    if not touch_test_file(dst_bucket, replica):
        return ValidationEnum.WRONG_PERMISSIONS_DST_BUCKET, f"Insufficient permissions on bucket {dst_bucket}"

    return ValidationEnum.PASSED, None
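
Illustrative usage (not from the source): a caller would branch on the (ValidationEnum, message) pair returned above. The bucket name and the DSSException mapping below are assumptions for this sketch.

result, message = validate_dst_bucket("example-checkout-bucket", Replica.aws)  # hypothetical bucket name
if result != ValidationEnum.PASSED:
    # Surface the validation failure to the API client (error mapping assumed).
    raise DSSException(requests.codes.bad_request, result.name, message)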
Example #12
def put_status_succeeded(execution_id: str, dst_replica: Replica,
                         dst_bucket: str, dst_location: str):
    handle = Config.get_blobstore_handle(Replica.aws)
    data = {
        "status": 'SUCCEEDED',
        "location":
        f"{dst_replica.storage_schema}://{dst_bucket}/{dst_location}"
    }
    handle.upload_file_handle(Replica.aws.checkout_bucket,
                              status_file_name(execution_id),
                              io.BytesIO(json.dumps(data).encode("utf-8")))
Example #13
    def _walk(self) -> None:
        executor = ThreadPoolExecutor(len(DEFAULT_BACKENDS))
        # We can't use executor as context manager because we don't want shutting it down to block
        try:
            remaining_backend_time = AdjustedRemainingTime(
                actual=self._remaining_time, offset=-self.shutdown_time)
            backend = CompositeIndexBackend(
                executor=executor,
                backends=DEFAULT_BACKENDS,
                remaining_time=remaining_backend_time,
                dryrun=self.dryrun,
                notify=self.notify)
            replica = Replica[self.replica]
            indexer_cls = Indexer.for_replica(replica)
            indexer = indexer_cls(backend, remaining_backend_time)

            handle = Config.get_blobstore_handle(replica)
            if self.bucket != replica.bucket:
                logger.warning(
                    f'Indexing bucket {self.bucket} instead of default {replica.bucket}.'
                )

            blobs: PagedIter = handle.list_v2(self.bucket,
                                              prefix=f'bundles/{self.work_id}',
                                              start_after_key=self.marker,
                                              token=self.token)

            for key in blobs:
                # Timing out while recording paging info could cause an inconsistent paging state, leading to repeats
                # of large amounts of work. This can be avoided by checking for timeouts only during actual
                # re-indexing. The indexer performs this check for every item.
                self.work_result['processed'] += 1
                try:
                    indexer.index_object(key)
                except IndexerTimeout as e:
                    self.work_result['failed'] += 1
                    logger.warning(
                        f'{self.work_id} timed out during index visitation: {e}'
                    )
                    break
                except Exception:
                    self.work_result['failed'] += 1
                    logger.warning(f'Index visitation failed for {key}',
                                   exc_info=True)
                else:
                    self.work_result['indexed'] += 1
                    self.marker = blobs.start_after_key
                    self.token = blobs.token
            else:
                self._status = WalkerStatus.finished.name
        finally:
            executor.shutdown(False)
Example #14
    def _test_file_put_cached(self, replica: Replica, scheme: str,
                              test_bucket: str, test_checkout_bucket: str,
                              uploader: Uploader):
        stored_cache_criteria = os.environ.get('CHECKOUT_CACHE_CRITERIA')
        try:
            os.environ['CHECKOUT_CACHE_CRITERIA'] = '[{"type":"application/json","max_size":12314}]'
            handle = Config.get_blobstore_handle(replica)
            src_key = generate_test_key()
            src_data = b'{"status":"valid"}'
            source_url = f"{scheme}://{test_bucket}/{src_key}"
            file_uuid = str(uuid.uuid4())
            bundle_uuid = str(uuid.uuid4())
            version = datetime_to_version_format(datetime.datetime.utcnow())

            # write dummy file and upload to upload area
            with tempfile.NamedTemporaryFile(delete=True) as fh:
                fh.write(src_data)
                fh.flush()

                uploader.checksum_and_upload_file(fh.name, src_key,
                                                  "application/json")

            # upload file to DSS
            self.upload_file(source_url,
                             file_uuid,
                             bundle_uuid=bundle_uuid,
                             version=version)

            metadata = handle.get_user_metadata(test_bucket, src_key)
            dst_key = ("blobs/" + ".".join([
                metadata['hca-dss-sha256'], metadata['hca-dss-sha1'],
                metadata['hca-dss-s3_etag'], metadata['hca-dss-crc32c']
            ])).lower()

            for wait_to_upload_into_checkout_bucket in range(30):
                try:
                    # get uploaded blob key from the checkout bucket
                    file_metadata = json.loads(
                        handle.get(test_checkout_bucket,
                                   dst_key).decode("utf-8"))
                    break
                except BlobNotFoundError:
                    time.sleep(1)
            else:
                file_metadata = json.loads(
                    handle.get(test_checkout_bucket, dst_key).decode("utf-8"))
            assert file_metadata["status"] == "valid"  # the file exists in the checkout bucket
        finally:
            if stored_cache_criteria is None:
                os.environ.pop('CHECKOUT_CACHE_CRITERIA', None)
            else:
                os.environ['CHECKOUT_CACHE_CRITERIA'] = stored_cache_criteria
Example #15
def _list_checkout_bundle(
    replica: Replica,
    bundle_uuid: str,
    bundle_version: typing.Optional[str],
) -> typing.List[typing.Tuple[str, dict]]:
    """
    Lists the contents of a bundle in checkout.
    :param replica: Cloud replica
    :param bundle_uuid: Bundle UUID
    :param bundle_version: Bundle version
    :return: List of checkout bundle contents
    """
    handle = Config.get_blobstore_handle(replica)
    prefix = get_dst_bundle_prefix(bundle_uuid, bundle_version)
    return list(handle.list_v2(replica.checkout_bucket, prefix))
Example #16
def delete(uuid: str,
           replica: str,
           json_request_body: dict,
           version: str = None):
    email = request.token_info['email']

    if email not in ADMIN_USER_EMAILS:
        raise DSSException(
            requests.codes.forbidden,
            "forbidden",
            f"You can't delete bundles with these credentials!",
        )

    uuid = uuid.lower()
    version = datetime_to_version_format(
        iso8601.parse_date(version)) if version else None

    tombstone_id = TombstoneID(uuid=uuid, version=version)
    bundle_prefix = tombstone_id.to_key_prefix()
    tombstone_object_data = _create_tombstone_data(
        email=email,
        reason=json_request_body.get('reason'),
        version=version,
    )

    handle = Config.get_blobstore_handle(Replica[replica])
    bucket = Replica[replica].bucket

    if test_object_exists(handle,
                          bucket,
                          bundle_prefix,
                          test_type=ObjectTest.PREFIX):
        created, idempotent = _idempotent_save(handle, bucket,
                                               tombstone_id.to_key(),
                                               tombstone_object_data)
        if not idempotent:
            raise DSSException(
                requests.codes.conflict,
                f"bundle_tombstone_already_exists",
                f"bundle tombstone with UUID {uuid} and version {version} already exists",
            )
        status_code = requests.codes.ok
        response_body = dict()  # type: dict
    else:
        status_code = requests.codes.not_found
        response_body = dict(title="bundle not found")

    return jsonify(response_body), status_code
Example #17
def mark_bundle_checkout_successful(
    execution_id: str,
    replica: Replica,
    sts_bucket: str,
    dst_bucket: str,
    dst_location: str,
):
    handle = Config.get_blobstore_handle(replica)
    data = {
        _STATUS_KEY: 'SUCCEEDED',
        _LOCATION_KEY:
        f"{replica.storage_schema}://{dst_bucket}/{dst_location}"
    }
    handle.upload_file_handle(sts_bucket,
                              _bundle_checkout_status_key(execution_id),
                              io.BytesIO(json.dumps(data).encode("utf-8")))
Example #18
def _verify_checkout(
        replica: Replica, token: typing.Optional[str], file_metadata: dict, blob_path: str,
) -> typing.Tuple[str, bool]:
    cloud_handle = Config.get_blobstore_handle(replica)
    hca_handle = Config.get_hcablobstore_handle(replica)

    try:
        now = datetime.datetime.now(datetime.timezone.utc)
        creation_date = cloud_handle.get_creation_date(replica.checkout_bucket, blob_path)
        stale_after_date = creation_date + datetime.timedelta(days=int(os.environ['DSS_BLOB_PUBLIC_TTL_DAYS']))
        expiration_date = (creation_date
                           + datetime.timedelta(days=int(os.environ['DSS_BLOB_TTL_DAYS']))
                           - datetime.timedelta(hours=1))

        if now < expiration_date:
            if now > stale_after_date:
                start_file_checkout(replica, blob_path)
            if hca_handle.verify_blob_checksum_from_dss_metadata(replica.checkout_bucket,
                                                                 blob_path,
                                                                 file_metadata):
                return "", True
            else:
                logger.error(
                    f"Checksum verification failed for file {replica.checkout_bucket}/{blob_path}")
    except BlobNotFoundError:
        pass

    decoded_token: dict
    if token is None:
        execution_id = start_file_checkout(replica, blob_path)
        start_time = time.time()
        attempts = 0

        decoded_token = {
            CheckoutTokenKeys.EXECUTION_ID: execution_id,
            CheckoutTokenKeys.START_TIME: start_time,
            CheckoutTokenKeys.ATTEMPTS: attempts
        }
    else:
        try:
            decoded_token = json.loads(token)
            decoded_token[CheckoutTokenKeys.ATTEMPTS] += 1
        except (KeyError, ValueError) as ex:
            raise DSSException(requests.codes.bad_request, "illegal_token", "Could not understand token", ex)

    encoded_token = json.dumps(decoded_token)
    return encoded_token, False
Example #19
def build_bundle_metadata_document(replica: Replica, key: str) -> dict:
    """
    This returns a JSON document with bundle manifest and metadata files suitable for JMESPath filters.
    """
    handle = Config.get_blobstore_handle(replica)
    manifest = json.loads(handle.get(replica.bucket, key).decode("utf-8"))
    fqid = BundleFQID.from_key(key)
    bundle_info = dict(uuid=fqid.uuid, version=fqid.version)
    if key.endswith(TOMBSTONE_SUFFIX):
        return dict(event_type="TOMBSTONE",
                    bundle_info=bundle_info,
                    **manifest)
    else:
        lock = threading.Lock()
        files: dict = defaultdict(list)

        def _read_file(file_metadata):
            blob_key = "blobs/{}.{}.{}.{}".format(
                file_metadata['sha256'],
                file_metadata['sha1'],
                file_metadata['s3-etag'],
                file_metadata['crc32c'],
            )
            contents = handle.get(replica.bucket, blob_key).decode("utf-8")
            try:
                file_info = json.loads(contents)
            except json.decoder.JSONDecodeError:
                logging.info(f"{file_metadata['name']} not json decodable")
            else:
                # Modify name to avoid confusion with JMESPath syntax
                name = _dot_to_underscore_and_strip_numeric_suffix(
                    file_metadata['name'])
                with lock:
                    files[name].append(file_info)

        # TODO: Consider scaling parallelization with Lambda size
        with ThreadPoolExecutor(max_workers=4) as e:
            e.map(_read_file, [
                file_metadata for file_metadata in manifest['files']
                if file_metadata['content-type'].startswith("application/json")
            ])

        return dict(event_type="CREATE",
                    bundle_info=bundle_info,
                    manifest=manifest,
                    files=dict(files))
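
Illustrative only: the document built above is meant to be queried with JMESPath filters, as the docstring notes. A minimal sketch follows, assuming the jmespath package and a hypothetical bundle key.

import jmespath

doc = build_bundle_metadata_document(Replica.aws, "bundles/<uuid>.<version>")  # placeholder key
if doc["event_type"] == "CREATE":
    # Manifest entries carry a 'name' field, as used by _read_file above.
    file_names = jmespath.search("manifest.files[].name", doc)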
Example #20
    def walker_walk(self) -> None:
        columns = []
        for replica, bucket, key, token in zip(self.replicas, self.buckets,
                                               self.row, self.tokens):
            replica = Replica[replica]
            if bucket is None:
                bucket = replica.bucket
            elif bucket != replica.bucket:
                logger.warning(
                    f'Checking bucket {bucket} instead of default {replica.bucket} for replica {replica}.'
                )
            handle = Config.get_blobstore_handle(replica)
            column: PagedIter = handle.list_v2(bucket,
                                               prefix='bundles/' +
                                               self.work_id,
                                               token=token,
                                               start_after_key=key)
            columns.append(column)

        diff = zipalign(columns=map(iter, columns), row=self.row)
        while self.shutdown_time < self.remaining_runtime():
            try:
                row = next(diff)
            except StopIteration:
                logger.info("Finished checking replicas.")
                self._status = WalkerStatus.finished.name
                break
            else:
                for i, key in enumerate(row.norm()):
                    replica = self.replicas[i]
                    if key is None:
                        logger.warning(
                            f"Replica {replica} is missing {row.min}")
                        self.work_result['missing'][i] += 1
                    else:
                        logger.debug(f"Replica {replica} contains {key}")
                        self.work_result['present'][i] += 1
                self.row = row.values
        else:
            self.tokens = [column.token for column in columns]
            logger.debug('Not enough time left in lambda execution, exiting.')
Example #21
    def _walk(self) -> None:
        executor = ThreadPoolExecutor(len(DEFAULT_BACKENDS))
        # We can't use executor as context manager because we don't want shutting it down to block
        try:
            backend = CompositeIndexBackend(executor, DEFAULT_BACKENDS, dryrun=self.dryrun, notify=self.notify,
                                            context=self._context)
            indexer_cls = Indexer.for_replica(Replica[self.replica])
            indexer = indexer_cls(backend)

            handle = Config.get_blobstore_handle(Replica[self.replica])
            default_bucket = Replica[self.replica].bucket

            if self.bucket != default_bucket:
                logger.warning(f'Indexing bucket {self.bucket} instead of default {default_bucket}.')

            blobs = handle.list_v2(
                self.bucket,
                prefix=f'bundles/{self.work_id}',
                start_after_key=self.marker,  # type: ignore  # Cannot determine type of 'marker'
                token=self.token  # type: ignore  # Cannot determine type of 'token'
            )

            for key in blobs:
                # Timing out while recording paging info could cause an inconsistent paging state, leading to repeats
                # of large amounts of work. This can be avoided by checking for timeouts only during actual
                # re-indexing.
                timeout = self.remaining_runtime() - 10  # ten seconds of safety for letting lambda shut down
                if timeout < 10:  # don't even try to index an item with less than 10 seconds left
                    logger.warning(f'{self.work_id} timed out during reindex')
                    return
                backend._timeout = timeout
                self.process_item(indexer, key)
                self.marker = blobs.start_after_key
                self.token = blobs.token
            else:
                self._status = WalkerStatus.finished.name
        finally:
            executor.shutdown(False)
Example #22
def delete(uuid: str,
           replica: str,
           json_request_body: dict,
           version: str = None):
    email = security.get_token_email(request.token_info)

    if email not in ADMIN_USER_EMAILS:
        raise DSSForbiddenException(
            "You can't delete bundles with these credentials!")

    uuid = uuid.lower()
    tombstone_id = BundleTombstoneID(uuid=uuid, version=version)
    bundle_prefix = tombstone_id.to_key_prefix()
    tombstone_object_data = _create_tombstone_data(
        email=email,
        reason=json_request_body.get('reason'),
        version=version,
    )

    handle = Config.get_blobstore_handle(Replica[replica])
    if not test_object_exists(handle,
                              Replica[replica].bucket,
                              bundle_prefix,
                              test_type=ObjectTest.PREFIX):
        raise DSSException(404, "not_found", "Cannot find bundle!")

    created, idempotent = idempotent_save(
        handle, Replica[replica].bucket, tombstone_id.to_key(),
        json.dumps(tombstone_object_data).encode("utf-8"))
    if not idempotent:
        raise DSSException(
            requests.codes.conflict,
            f"bundle_tombstone_already_exists",
            f"bundle tombstone with UUID {uuid} and version {version} already exists",
        )

    return dict(), requests.codes.ok
Example #23
def get(
    uuid: str,
    replica: str,
    per_page: int,
    version: str = None,
    directurls: bool = False,
    presignedurls: bool = False,
    token: str = None,
    start_at: int = 0,
):
    if directurls and presignedurls:
        raise DSSException(
            requests.codes.bad_request, "only_one_urltype",
            "only enable one of `directurls` or `presignedurls`")

    _replica = Replica[replica]
    bundle_metadata = get_bundle_manifest(uuid, _replica, version)
    if bundle_metadata is None:
        raise DSSException(404, "not_found", "Cannot find bundle!")
    if version is None:
        version = bundle_metadata[BundleMetadata.VERSION]

    if directurls or presignedurls:
        try:
            token, ready = verify_checkout(_replica, uuid, version, token)
        except TokenError as ex:
            raise DSSException(requests.codes.bad_request, "illegal_token",
                               "Could not understand token", ex)
        except CheckoutError as ex:
            raise DSSException(requests.codes.server_error, "checkout_error",
                               "Could not complete checkout", ex)
        if not ready:
            builder = UrlBuilder(request.url)
            builder.replace_query("token", token)
            response = redirect(str(builder), code=requests.codes.moved)
            headers = response.headers
            headers['Retry-After'] = RETRY_AFTER_INTERVAL
            return response

    all_files = bundle_metadata[BundleMetadata.FILES]

    link = None
    if len(all_files) - start_at > per_page:
        next_url = UrlBuilder(request.url)
        next_url.replace_query("start_at", str(start_at + per_page))
        next_url.replace_query("version", version)
        next_url.replace_query("token", token)
        link = f"<{next_url}>; rel='next'"

    files = all_files[start_at:start_at + per_page]

    filesresponse = []  # type: typing.List[dict]
    for _file in files:
        file_version = {
            'name': _file[BundleFileMetadata.NAME],
            'content-type': _file[BundleFileMetadata.CONTENT_TYPE],
            'size': _file[BundleFileMetadata.SIZE],
            'uuid': _file[BundleFileMetadata.UUID],
            'version': _file[BundleFileMetadata.VERSION],
            'crc32c': _file[BundleFileMetadata.CRC32C],
            's3_etag': _file[BundleFileMetadata.S3_ETAG],
            'sha1': _file[BundleFileMetadata.SHA1],
            'sha256': _file[BundleFileMetadata.SHA256],
            'indexed': _file[BundleFileMetadata.INDEXED],
        }
        if directurls:
            file_version['url'] = str(UrlBuilder().set(
                scheme=_replica.storage_schema,
                netloc=_replica.checkout_bucket,
                path="{}/{}".format(
                    get_dst_bundle_prefix(
                        uuid, bundle_metadata[BundleMetadata.VERSION]),
                    _file[BundleFileMetadata.NAME],
                ),
            ))
        elif presignedurls:
            handle = Config.get_blobstore_handle(_replica)
            file_version['url'] = handle.generate_presigned_GET_url(
                _replica.checkout_bucket,
                "{}/{}".format(
                    get_dst_bundle_prefix(
                        uuid, bundle_metadata[BundleMetadata.VERSION]),
                    _file[BundleFileMetadata.NAME],
                ),
            )
        filesresponse.append(file_version)

    response_body = dict(bundle=dict(
        uuid=uuid,
        version=bundle_metadata[BundleMetadata.VERSION],
        files=filesresponse,
        creator_uid=bundle_metadata[BundleMetadata.CREATOR_UID],
    ))

    if link is None:
        response = make_response(jsonify(response_body), requests.codes.ok)
        response.headers['X-OpenAPI-Pagination'] = 'false'
    else:
        response = make_response(jsonify(response_body),
                                 requests.codes.partial)
        response.headers['X-OpenAPI-Pagination'] = 'true'
        response.headers['Link'] = link

    response.headers['X-OpenAPI-Paginated-Content-Key'] = 'bundle.files'
    return response
Example #24
def build_bundle_file_metadata(replica: Replica, user_supplied_files: dict):
    handle = Config.get_blobstore_handle(replica)

    time_left = nestedcontext.inject("time_left")

    # decode the list of files.
    files = [{
        'user_supplied_metadata': _file
    } for _file in user_supplied_files]

    def _get_file_metadata(_file):
        metadata_key = FileFQID(
            uuid=_file['user_supplied_metadata']['uuid'],
            version=_file['user_supplied_metadata']['version'],
        ).to_key()
        while True:
            try:
                file_metadata = handle.get(replica.bucket, metadata_key)
            except BlobNotFoundError:
                if time_left() > PUT_TIME_ALLOWANCE_SECONDS:
                    time.sleep(1)
                else:
                    break
            else:
                return json.loads(file_metadata)
        return None

    # TODO: Consider scaling parallelization with Lambda size
    with ThreadPoolExecutor(max_workers=20) as e:
        futures = {
            e.submit(_get_file_metadata, _file): _file
            for _file in files
        }
        for future in as_completed(futures):
            _file = futures[future]
            res = future.result()
            if res is not None:
                _file['file_metadata'] = res
            else:
                missing_file_user_metadata = _file['user_supplied_metadata']
                raise DSSException(
                    requests.codes.bad_request, "file_missing",
                    f"Could not find file {missing_file_user_metadata['uuid']}/{missing_file_user_metadata['version']}."
                )

    return [{
        BundleFileMetadata.NAME:
        _file['user_supplied_metadata']['name'],
        BundleFileMetadata.UUID:
        _file['user_supplied_metadata']['uuid'],
        BundleFileMetadata.VERSION:
        _file['user_supplied_metadata']['version'],
        BundleFileMetadata.CONTENT_TYPE:
        _file['file_metadata'][FileMetadata.CONTENT_TYPE],
        BundleFileMetadata.SIZE:
        _file['file_metadata'][FileMetadata.SIZE],
        BundleFileMetadata.INDEXED:
        _file['user_supplied_metadata']['indexed'],
        BundleFileMetadata.CRC32C:
        _file['file_metadata'][FileMetadata.CRC32C],
        BundleFileMetadata.S3_ETAG:
        _file['file_metadata'][FileMetadata.S3_ETAG],
        BundleFileMetadata.SHA1:
        _file['file_metadata'][FileMetadata.SHA1],
        BundleFileMetadata.SHA256:
        _file['file_metadata'][FileMetadata.SHA256],
    } for _file in files]
Example #25
def get_bundle_checkout_status(execution_id: str, replica: Replica,
                               sts_bucket: str):
    handle = Config.get_blobstore_handle(replica)
    return json.loads(
        handle.get(sts_bucket, _bundle_checkout_status_key(execution_id)))
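
Illustrative only: a polling helper built on the status object written by mark_bundle_checkout_started and mark_bundle_checkout_successful. wait_for_checkout, the timeout, and the sleep interval are assumptions, not part of the source.

import time

def wait_for_checkout(execution_id: str, replica: Replica, sts_bucket: str,
                      timeout_seconds: int = 300) -> dict:
    # Hypothetical helper: poll the status document until it leaves the "RUNNING" state.
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        status = get_bundle_checkout_status(execution_id, replica, sts_bucket)
        if status[_STATUS_KEY] != "RUNNING":
            return status
        time.sleep(5)
    raise TimeoutError(f"checkout {execution_id} still RUNNING after {timeout_seconds}s")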
Example #26
def validate_dst_bucket(replica: Replica, dst_bucket: str) -> bool:
    if not Config.get_blobstore_handle(replica).check_bucket_exists(dst_bucket):
        raise DestinationBucketNotFoundError(f"Bucket {dst_bucket} doesn't exist")
    return touch_test_file(replica, dst_bucket)
Example #27
def put(uuid: str, json_request_body: dict, version: str = None):
    class CopyMode(Enum):
        NO_COPY = auto()
        COPY_INLINE = auto()
        COPY_ASYNC = auto()

    uuid = uuid.lower()
    if version is not None:
        # convert it to date-time so we can format exactly as the system requires (with microsecond precision)
        timestamp = iso8601.parse_date(version)
    else:
        timestamp = datetime.datetime.utcnow()
    version = datetime_to_version_format(timestamp)

    source_url = json_request_body['source_url']
    cre = re.compile("^"
                     "(?P<schema>(?:s3|gs|wasb))"
                     "://"
                     "(?P<bucket>[^/]+)"
                     "/"
                     "(?P<key>.+)"
                     "$")
    mobj = cre.match(source_url)
    if mobj and mobj.group('schema') == "s3":
        replica = Replica.aws
    elif mobj and mobj.group('schema') == "gs":
        replica = Replica.gcp
    else:
        schema = mobj.group('schema')
        raise DSSException(requests.codes.bad_request, "unknown_source_schema",
                           f"source_url schema {schema} not supported")

    handle = Config.get_blobstore_handle(replica)
    hca_handle = Config.get_hcablobstore_handle(replica)
    dst_bucket = replica.bucket

    src_bucket = mobj.group('bucket')
    src_key = mobj.group('key')

    metadata = handle.get_user_metadata(src_bucket, src_key)
    size = handle.get_size(src_bucket, src_key)
    content_type = handle.get_content_type(src_bucket, src_key)

    # format all the checksums so they're lower-case.
    for metadata_spec in HCABlobStore.MANDATORY_METADATA.values():
        if metadata_spec['downcase']:
            keyname = typing.cast(str, metadata_spec['keyname'])
            metadata[keyname] = metadata[keyname].lower()

    # what's the target object name for the actual data?
    dst_key = ("blobs/" + ".".join((
        metadata['hca-dss-sha256'],
        metadata['hca-dss-sha1'],
        metadata['hca-dss-s3_etag'],
        metadata['hca-dss-crc32c'],
    ))).lower()

    # does it exist? if so, we can skip the copy part.
    copy_mode = CopyMode.COPY_INLINE
    try:
        if hca_handle.verify_blob_checksum(dst_bucket, dst_key, metadata):
            copy_mode = CopyMode.NO_COPY
    except BlobNotFoundError:
        pass

    # build the json document for the file metadata.
    file_metadata = {
        FileMetadata.FORMAT: FileMetadata.FILE_FORMAT_VERSION,
        FileMetadata.BUNDLE_UUID: json_request_body['bundle_uuid'],
        FileMetadata.CREATOR_UID: json_request_body['creator_uid'],
        FileMetadata.VERSION: version,
        FileMetadata.CONTENT_TYPE: content_type,
        FileMetadata.SIZE: size,
        FileMetadata.CRC32C: metadata['hca-dss-crc32c'],
        FileMetadata.S3_ETAG: metadata['hca-dss-s3_etag'],
        FileMetadata.SHA1: metadata['hca-dss-sha1'],
        FileMetadata.SHA256: metadata['hca-dss-sha256'],
    }
    file_metadata_json = json.dumps(file_metadata)

    if copy_mode != CopyMode.NO_COPY and size > ASYNC_COPY_THRESHOLD:
        copy_mode = CopyMode.COPY_ASYNC

    if copy_mode == CopyMode.COPY_ASYNC:
        if replica == Replica.aws:
            state = s3copyclient.copy_write_metadata_sfn_event(
                src_bucket,
                src_key,
                dst_bucket,
                dst_key,
                uuid,
                version,
                file_metadata_json,
            )
            state_machine_name_template = "dss-s3-copy-write-metadata-sfn-{stage}"
        elif replica == Replica.gcp:
            state = gscopyclient.copy_write_metadata_sfn_event(
                src_bucket,
                src_key,
                dst_bucket,
                dst_key,
                uuid,
                version,
                file_metadata_json,
            )
            state_machine_name_template = "dss-gs-copy-write-metadata-sfn-{stage}"
        else:
            raise ValueError("Unhandled replica")

        execution_id = str(uuid4())
        stepfunctions.step_functions_invoke(state_machine_name_template,
                                            execution_id, state)
        return jsonify(dict(task_id=execution_id,
                            version=version)), requests.codes.accepted
    elif copy_mode == CopyMode.COPY_INLINE:
        handle.copy(src_bucket, src_key, dst_bucket, dst_key)

        # verify the copy was done correctly.
        assert hca_handle.verify_blob_checksum(dst_bucket, dst_key, metadata)

    try:
        write_file_metadata(handle, dst_bucket, uuid, version,
                            file_metadata_json)
        status_code = requests.codes.created
    except BlobAlreadyExistsError:
        # fetch the file metadata, compare it to what we have.
        existing_file_metadata = json.loads(
            handle.get(dst_bucket,
                       "files/{}.{}".format(uuid, version)).decode("utf-8"))
        if existing_file_metadata != file_metadata:
            raise DSSException(
                requests.codes.conflict, "file_already_exists",
                f"file with UUID {uuid} and version {version} already exists")
        status_code = requests.codes.ok

    return jsonify(dict(version=version)), status_code
Example #28
    def _test_bundle_put(self, replica: Replica, fixtures_bucket: str):
        schema = replica.storage_schema

        bundle_uuid = str(uuid.uuid4())
        file_uuid = str(uuid.uuid4())
        missing_file_uuid = str(uuid.uuid4())
        resp_obj = self.upload_file_wait(
            f"{schema}://{fixtures_bucket}/test_good_source_data/0",
            replica,
            file_uuid,
            bundle_uuid=bundle_uuid,
        )
        file_version = resp_obj.json['version']

        # first bundle.
        bundle_version = datetime_to_version_format(datetime.datetime.utcnow())
        self.put_bundle(
            replica,
            bundle_uuid,
            [(file_uuid, file_version, "LICENSE")],
            bundle_version,
        )

        # should be able to do this twice (i.e., same payload, same UUIDs)
        self.put_bundle(
            replica,
            bundle_uuid,
            [(file_uuid, file_version, "LICENSE")],
            bundle_version,
            requests.codes.ok,
        )

        # should *NOT* be able to do this twice with different payload.
        self.put_bundle(
            replica,
            bundle_uuid,
            [(file_uuid, file_version, "LICENSE1")],
            bundle_version,
            requests.codes.conflict,
        )

        # should *NOT* be able to upload a bundle with a missing file, but we should get requests.codes.conflict.
        with nestedcontext.bind(time_left=lambda: 0):
            resp_obj = self.put_bundle(
                replica,
                bundle_uuid,
                [
                    (file_uuid, file_version, "LICENSE0"),
                    (missing_file_uuid, file_version, "LICENSE1"),
                ],
                expected_code=requests.codes.conflict,
            )
            self.assertEqual(resp_obj.json['code'], "file_missing")

        # should *NOT* be able to upload a bundle containing a file with an incorrect bundle_uuid
        # but we should get requests.codes.conflict
        with nestedcontext.bind(time_left=lambda: 0):
            resp_obj = self.put_bundle(
                replica,
                str(uuid.uuid4()),  # uploading new bundle with old file
                [(file_uuid, file_version, "LICENSE")],
                datetime_to_version_format(datetime.datetime.utcnow()),
                expected_code=requests.codes.conflict,
            )
            self.assertEqual(resp_obj.json['code'], "incorrect_file_bundle_uuid")

        # upload a file, but delete the file metadata; put it back after a delay.
        self.upload_file_wait(
            f"{schema}://{fixtures_bucket}/test_good_source_data/0",
            replica,
            missing_file_uuid,
            file_version,
            bundle_uuid=bundle_uuid
        )
        handle = Config.get_blobstore_handle(replica)
        bucket = replica.bucket
        file_metadata = handle.get(bucket, f"files/{missing_file_uuid}.{file_version}")
        handle.delete(bucket, f"files/{missing_file_uuid}.{file_version}")

        class UploadThread(threading.Thread):
            def run(innerself):
                time.sleep(5)
                data_fh = io.BytesIO(file_metadata)
                handle.upload_file_handle(bucket, f"files/{missing_file_uuid}.{file_version}", data_fh)

        # start the upload (on a delay...)
        upload_thread = UploadThread()
        upload_thread.start()

        # this should at first fail to find one of the files, but the UploadThread will eventually upload the file
        # metadata.  since we give the upload bundle process ample time to spin, it should eventually find the file
        # metadata and succeed.
        with nestedcontext.bind(time_left=lambda: sys.maxsize):
            self.put_bundle(
                replica,
                bundle_uuid,
                [
                    (file_uuid, file_version, "LICENSE0"),
                    (missing_file_uuid, file_version, "LICENSE1"),
                ],
                expected_code=requests.codes.created,
            )
Example #29
    def _test_file_get_checkout(self, replica: Replica, scheme: str,
                                test_bucket: str, uploader: Uploader):
        handle = Config.get_blobstore_handle(replica)
        src_key = generate_test_key()
        src_data = os.urandom(1024)
        source_url = f"{scheme}://{test_bucket}/{src_key}"
        file_uuid = str(uuid.uuid4())
        bundle_uuid = str(uuid.uuid4())
        version = datetime_to_version_format(datetime.datetime.utcnow())

        # write dummy file and upload to upload area
        with tempfile.NamedTemporaryFile(delete=True) as fh:
            fh.write(src_data)
            fh.flush()

            uploader.checksum_and_upload_file(fh.name, src_key, "text/plain")

        # upload file to DSS
        self.upload_file(source_url,
                         file_uuid,
                         bundle_uuid=bundle_uuid,
                         version=version)
        url = str(UrlBuilder().set(path="/v1/files/" + file_uuid).add_query(
            "replica", replica.name).add_query("version", version))

        # get uploaded blob key
        file_metadata = json.loads(
            handle.get(test_bucket,
                       f"files/{file_uuid}.{version}").decode("utf-8"))
        file_key = compose_blob_key(file_metadata)

        @eventually(20, 1)
        def test_checkout():
            # assert 302 and verify checksum on checkout completion
            api_get = self.assertGetResponse(url,
                                             requests.codes.found,
                                             headers=get_auth_header(),
                                             redirect_follow_retries=0)
            file_get = requests.get(api_get.response.headers['Location'])
            self.assertTrue(file_get.ok)
            self.assertEquals(file_get.content, src_data)

        with self.subTest(
                f"{replica}: Initiates checkout and returns 301 for GET on 'uncheckedout' file."
        ):
            # assert 301 redirect on first GET
            self.assertGetResponse(url,
                                   requests.codes.moved,
                                   headers=get_auth_header(),
                                   redirect_follow_retries=0)
            test_checkout()

        with self.subTest(
                f"{replica}: Initiates checkout and returns 301 for GET on nearly expired checkout file."
        ):
            now = datetime.datetime.now(datetime.timezone.utc)
            creation_date_fn = (
                "cloud_blobstore.s3.S3BlobStore.get_creation_date"
                if replica.name == "aws" else
                "cloud_blobstore.gs.GSBlobStore.get_creation_date")
            with mock.patch(creation_date_fn) as mock_creation_date:
                blob_ttl_days = int(os.environ['DSS_BLOB_TTL_DAYS'])
                mock_creation_date.return_value = now - datetime.timedelta(
                    days=blob_ttl_days, hours=1, minutes=5)
                self.assertGetResponse(url,
                                       requests.codes.moved,
                                       headers=get_auth_header(),
                                       redirect_follow_retries=0)
            test_checkout()

        with self.subTest(
                f"{replica}: Initiates checkout and returns 302 immediately for GET on stale checkout file."
        ):
            now = datetime.datetime.now(datetime.timezone.utc)
            creation_date = handle.get_creation_date(replica.checkout_bucket,
                                                     file_key)
            creation_date_fn = (
                "cloud_blobstore.s3.S3BlobStore.get_creation_date"
                if replica.name == "aws" else
                "cloud_blobstore.gs.GSBlobStore.get_creation_date")
            with mock.patch(creation_date_fn) as mock_creation_date:
                # assert 302 found on stale file and that last modified refreshes
                blob_ttl_days = int(os.environ['DSS_BLOB_PUBLIC_TTL_DAYS'])
                mock_creation_date.return_value = now - datetime.timedelta(
                    days=blob_ttl_days + 1)
                self.assertGetResponse(url,
                                       requests.codes.found,
                                       headers=get_auth_header(),
                                       redirect_follow_retries=0)
                self.assertTrue(
                    creation_date > handle.get_creation_date(
                        replica.checkout_bucket, file_key),
                    f'\ncurr_creation_date: {creation_date}'
                    f'\nprev_creation_date: {handle.get_creation_date(replica.checkout_bucket, file_key)}'
                )

        handle.delete(test_bucket, f"files/{file_uuid}.{version}")
        handle.delete(replica.checkout_bucket, file_key)
Example #30
    def _test_file_get_direct(self, replica: Replica):
        """
        Verify that the direct URL option works for GET on a file.
        """
        file_uuid = "ce55fd51-7833-469b-be0b-5da88ebebfcd"
        handle = Config.get_blobstore_handle(replica)

        direct_url_req = str(
            UrlBuilder().set(path="/v1/files/" + file_uuid).add_query(
                "replica", replica.name).add_query("directurl", "True"))
        presigned_url_req = str(UrlBuilder().set(
            path="/v1/files/" + file_uuid).add_query("replica", replica.name))
        with override_bucket_config(BucketConfig.TEST_FIXTURE):
            native_resp_obj = self.assertGetResponse(
                direct_url_req,
                requests.codes.found,
                headers=get_auth_header(),
                redirect_follow_retries=FILE_GET_RETRY_COUNT,
                min_retry_interval_header=RETRY_AFTER_INTERVAL,
                override_retry_interval=1,
            )
            resp_obj = self.assertGetResponse(
                presigned_url_req,
                requests.codes.found,
                headers=get_auth_header(),
                redirect_follow_retries=FILE_GET_RETRY_COUNT,
                min_retry_interval_header=RETRY_AFTER_INTERVAL,
                override_retry_interval=1,
            )

            verify_headers = [
                'X-DSS-VERSION', 'X-DSS-CREATOR-UID', 'X-DSS-S3-ETAG',
                'X-DSS-SHA256', 'X-DSS-SHA1', 'X-DSS-CRC32C'
            ]
            native_headers_verify = {
                k: v
                for k, v in native_resp_obj.response.headers.items()
                if k in verify_headers
            }
            presigned_headers_verify = {
                k: v
                for k, v in resp_obj.response.headers.items()
                if k in verify_headers
            }
            self.assertDictEqual(native_headers_verify,
                                 presigned_headers_verify)

            with self.subTest(
                    'Retry-After headers are not included in a successful response.'
            ):
                self.assertEqual(
                    native_resp_obj.response.headers.get('Retry-After'), None)

            self.assertTrue(native_resp_obj.response.headers['Location'].split(
                '//')[0].startswith(replica.storage_schema))
            self.assertTrue(native_resp_obj.response.headers['Location'].split(
                '//')[1].startswith(replica.checkout_bucket))
            blob_path = native_resp_obj.response.headers['Location'].split(
                '/blobs/')[1]
            native_size = handle.get_size(replica.checkout_bucket,
                                          f'blobs/{blob_path}')
            self.assertGreater(native_size, 0)
            self.assertEqual(native_size,
                             int(resp_obj.response.headers['X-DSS-SIZE']))