def touch_test_file(replica: Replica, dst_bucket: str) -> bool:
    """
    Write a test file into the specified bucket.
    :param dst_bucket: the bucket to be checked.
    :param replica: the replica to execute the checkout in.
    :return: True if able to write; otherwise raise DestinationBucketNotWritableError.
    """
    randomizer = ''.join(choices(hexdigits, k=2))
    # Spreading the touch test file across a larger key range prevents hitting modification rate limits.
    test_object = f"touch/{randomizer}.txt"
    handle = Config.get_blobstore_handle(replica)
    try:
        handle.upload_file_handle(
            dst_bucket,
            test_object,
            io.BytesIO(b""))
        return True
    except Exception as ex:
        raise DestinationBucketNotWritableError(ex)
    finally:
        try:
            Config.get_blobstore_handle(replica).delete(dst_bucket, test_object)
        except Exception:
            pass
def validate_file_dst(dst_bucket: str, dst_key: str, replica: Replica):
    try:
        Config.get_blobstore_handle(replica).get_user_metadata(dst_bucket, dst_key)
        return True
    except (BlobNotFoundError, BlobStoreUnknownError):
        return False
def touch_test_file(dst_bucket: str, replica: Replica) -> bool:
    """
    Write a test file into the specified bucket.
    :param dst_bucket: the bucket to be checked.
    :param replica: the replica to execute the check in.
    :return: True if the test file could be written and deleted, False otherwise.
    """
    test_object = "touch.txt"
    handle = Config.get_blobstore_handle(replica)
    try:
        handle.upload_file_handle(dst_bucket, test_object, io.BytesIO(b""))
        Config.get_blobstore_handle(replica).delete(dst_bucket, test_object)
        return True
    except Exception:
        return False
def get_helper(uuid: str, replica: Replica, version: str = None):
    handle = Config.get_blobstore_handle(replica)
    bucket = replica.bucket

    if version is None:
        # list the files and find the one that is the most recent.
        prefix = "files/{}.".format(uuid)
        for matching_file in handle.list(bucket, prefix):
            matching_file = matching_file[len(prefix):]
            if version is None or matching_file > version:
                version = matching_file

    if version is None:
        # no matches!
        raise DSSException(404, "not_found", "Cannot find file!")

    # retrieve the file metadata.
    try:
        file_metadata = json.loads(
            handle.get(bucket, "files/{}.{}".format(uuid, version)).decode("utf-8"))
    except BlobNotFoundError:
        raise DSSException(404, "not_found", "Cannot find file!")

    blob_path = "blobs/" + ".".join((
        file_metadata[FileMetadata.SHA256],
        file_metadata[FileMetadata.SHA1],
        file_metadata[FileMetadata.S3_ETAG],
        file_metadata[FileMetadata.CRC32C],
    ))

    if request.method == "GET":
        # Probabilistically return a "Retry-After" header.
        # The retry-after interval can be relatively short now, but it sets up downstream libraries / users
        # for success when we start integrating this with the checkout service.
        if random.randint(0, 100) < REDIRECT_PROBABILITY_PERCENTS:
            response = redirect(request.url, code=301)
            headers = response.headers
            headers['Retry-After'] = RETRY_AFTER_INTERVAL
            return response

        response = redirect(handle.generate_presigned_GET_url(bucket, blob_path))
    else:
        response = make_response('', 200)

    headers = response.headers
    headers['X-DSS-BUNDLE-UUID'] = file_metadata[FileMetadata.BUNDLE_UUID]
    headers['X-DSS-CREATOR-UID'] = file_metadata[FileMetadata.CREATOR_UID]
    headers['X-DSS-VERSION'] = version
    headers['X-DSS-CONTENT-TYPE'] = file_metadata[FileMetadata.CONTENT_TYPE]
    headers['X-DSS-SIZE'] = file_metadata[FileMetadata.SIZE]
    headers['X-DSS-CRC32C'] = file_metadata[FileMetadata.CRC32C]
    headers['X-DSS-S3-ETAG'] = file_metadata[FileMetadata.S3_ETAG]
    headers['X-DSS-SHA1'] = file_metadata[FileMetadata.SHA1]
    headers['X-DSS-SHA256'] = file_metadata[FileMetadata.SHA256]

    return response
def _test_bundle_get_directaccess(self, replica: Replica):
    schema = replica.storage_schema

    bundle_uuid = "011c7340-9b3c-4d62-bf49-090d79daf198"
    version = "2017-06-20T214506.766634Z"

    url = str(UrlBuilder()
              .set(path="/v1/bundles/" + bundle_uuid)
              .add_query("replica", replica.name)
              .add_query("version", version)
              .add_query("directurls", "true"))

    with override_bucket_config(BucketConfig.TEST_FIXTURE):
        resp_obj = self.assertGetResponse(url, requests.codes.ok)

    url = resp_obj.json['bundle']['files'][0]['url']
    splitted = urllib.parse.urlparse(url)
    self.assertEqual(splitted.scheme, schema)
    bucket = splitted.netloc
    key = splitted.path[1:]  # ignore the / part of the path.

    handle = Config.get_blobstore_handle(replica)
    contents = handle.get(bucket, key)

    hasher = hashlib.sha1()
    hasher.update(contents)
    sha1 = hasher.hexdigest()
    self.assertEqual(sha1, "2b8b815229aa8a61e483fb4ba0588b8b6c491890")
def record(argv: typing.List[str], args: argparse.Namespace):
    """
    Record events for `keys` into flashflood prefix `prefix`.
    If `keys` is omitted, record an event for each bundle in `replica` via lambda forwarding.
    """
    replica = Replica[args.replica]
    job_id = args.job_id or f"{uuid4()}"
    cmd_template = (f"events record --job-id {job_id} "
                    f"--prefix {args.prefix} "
                    f"--replica {replica.name} "
                    f"--keys {{keys}}")

    if args.keys is None:
        start_time = datetime.now()

        def forward_keys(bundle_fqids):
            with SQSMessenger(command_queue_url) as sqsm:
                for fqid in bundle_fqids:
                    sqsm.send(cmd_template.format(keys=f"bundles/{fqid}"))

        handle = Config.get_blobstore_handle(replica)
        with ThreadPoolExecutor(max_workers=4) as e:
            for c in set(hexdigits.lower()):
                bundle_fqids = Living(handle.list_v2(replica.bucket, f"bundles/{c}"))
                e.submit(forward_keys, bundle_fqids)
        monitor_logs(logs, job_id, start_time)
    else:
        for key in args.keys:
            msg = json.dumps(dict(action="record event", job_id=job_id, replica=replica.name, key=key))
            record_event_for_bundle(Replica[args.replica], key, (args.prefix,), use_version_for_timestamp=True)
            print(msg)
def mark_bundle_checkout_started(execution_id: str, replica: Replica, sts_bucket: str):
    handle = Config.get_blobstore_handle(replica)
    data = {_STATUS_KEY: "RUNNING"}
    handle.upload_file_handle(sts_bucket,
                              _bundle_checkout_status_key(execution_id),
                              io.BytesIO(json.dumps(data).encode("utf-8")))
def _walk(self) -> None:
    """
    Subclasses should not typically implement this method, which includes logic specific to calling
    self.process_item(*args) on each blob visited.
    """
    start_time = time()
    handle = Config.get_blobstore_handle(Replica[self.replica])
    blobs = handle.list_v2(
        self.bucket,
        prefix=self.work_id,
        start_after_key=self.marker,  # type: ignore  # Cannot determine type of 'marker'
        token=self.token  # type: ignore  # Cannot determine type of 'token'
    )
    for key in blobs:
        if 250 < time() - start_time:
            break
        self.process_item(key)
        self.marker = blobs.start_after_key
        self.token = blobs.token
    else:
        self._status = WalkerStatus.finished.name
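# Illustrative, standalone sketch (not from the DSS codebase) of the for/else pattern used by
# _walk above: the else branch runs only if the loop was never broken by the time budget, so the
# walker records "finished" only after visiting every listed key; otherwise it keeps the paging
# marker so a later invocation can resume. All names below are hypothetical.
from time import time as _now

def walk_with_budget(keys, budget_seconds=0.25):
    start = _now()
    marker = None
    finished = False
    for key in keys:
        if budget_seconds < _now() - start:
            break  # out of time; resume later from `marker`
        # ... process_item(key) would go here ...
        marker = key  # record paging progress after each processed item
    else:
        finished = True  # reached only when the loop ran to completion
    return marker, finished

print(walk_with_budget([f"bundles/{i:02x}" for i in range(4)]))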
def _test_bundle_delete(self, replica: Replica, fixtures_bucket: str, authorized: bool):
    schema = replica.storage_schema

    # prep existing bundle
    bundle_uuid = str(uuid.uuid4())
    file_uuid = str(uuid.uuid4())
    resp_obj = self.upload_file_wait(
        f"{schema}://{fixtures_bucket}/test_good_source_data/0",
        replica,
        file_uuid,
        bundle_uuid=bundle_uuid,
    )
    file_version = resp_obj.json['version']

    bundle_version = datetime_to_version_format(datetime.datetime.utcnow())
    self.put_bundle(
        replica,
        bundle_uuid,
        [(file_uuid, file_version, "LICENSE")],
        bundle_version,
    )

    handle = Config.get_blobstore_handle(replica)
    bucket = replica.bucket

    self.delete_bundle(replica, bundle_uuid, authorized=authorized)
    tombstone_exists = test_object_exists(handle, bucket, f"bundles/{bundle_uuid}.dead")
    self.assertEqual(tombstone_exists, authorized)

    self.delete_bundle(replica, bundle_uuid, bundle_version, authorized=authorized)
    tombstone_exists = test_object_exists(handle, bucket, f"bundles/{bundle_uuid}.{bundle_version}.dead")
    self.assertEqual(tombstone_exists, authorized)
def job_finalize(self):
    super().job_finalize()
    handle = Config.get_blobstore_handle(Replica[self.replica])
    listed_keys = handle.list(self.bucket, prefix=self.prefix)
    k_listed = sum(1 for _ in listed_keys)
    assert self.work_result == k_listed, f'Integration test failed: {self.work_result} != {k_listed}'
    logger.info(f"Integration test passed for {self.replica} with {k_listed} key(s) listed")
def validate_dst_bucket(dst_bucket: str, replica: Replica) -> typing.Tuple[ValidationEnum, str]:
    if not Config.get_blobstore_handle(replica).check_bucket_exists(dst_bucket):
        return ValidationEnum.WRONG_DST_BUCKET, f"Bucket {dst_bucket} doesn't exist"
    if not touch_test_file(dst_bucket, replica):
        return ValidationEnum.WRONG_PERMISSIONS_DST_BUCKET, f"Insufficient permissions on bucket {dst_bucket}"
    return ValidationEnum.PASSED, None
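# Hypothetical caller for validate_dst_bucket above, showing how the (ValidationEnum, message)
# tuple might be turned into an API error; the helper name and error wiring are assumptions, not
# part of the DSS codebase.
def ensure_dst_bucket_valid(dst_bucket: str, replica: Replica) -> None:
    code, message = validate_dst_bucket(dst_bucket, replica)
    if code != ValidationEnum.PASSED:
        raise DSSException(requests.codes.bad_request, code.name, message)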
def put_status_succeeded(execution_id: str, dst_replica: Replica, dst_bucket: str, dst_location: str):
    handle = Config.get_blobstore_handle(Replica.aws)
    data = {
        "status": 'SUCCEEDED',
        "location": f"{dst_replica.storage_schema}://{dst_bucket}/{dst_location}"
    }
    handle.upload_file_handle(Replica.aws.checkout_bucket,
                              status_file_name(execution_id),
                              io.BytesIO(json.dumps(data).encode("utf-8")))
def _walk(self) -> None:
    executor = ThreadPoolExecutor(len(DEFAULT_BACKENDS))
    # We can't use executor as context manager because we don't want shutting it down to block
    try:
        remaining_backend_time = AdjustedRemainingTime(actual=self._remaining_time,
                                                       offset=-self.shutdown_time)
        backend = CompositeIndexBackend(executor=executor,
                                        backends=DEFAULT_BACKENDS,
                                        remaining_time=remaining_backend_time,
                                        dryrun=self.dryrun,
                                        notify=self.notify)
        replica = Replica[self.replica]
        indexer_cls = Indexer.for_replica(replica)
        indexer = indexer_cls(backend, remaining_backend_time)
        handle = Config.get_blobstore_handle(replica)
        if self.bucket != replica.bucket:
            logger.warning(f'Indexing bucket {self.bucket} instead of default {replica.bucket}.')

        blobs: PagedIter = handle.list_v2(self.bucket,
                                          prefix=f'bundles/{self.work_id}',
                                          start_after_key=self.marker,
                                          token=self.token)

        for key in blobs:
            # Timing out while recording paging info could cause an inconsistent paging state, leading to repeats
            # of large amounts of work. This can be avoided by checking for timeouts only during actual
            # re-indexing. The indexer performs this check for every item.
            self.work_result['processed'] += 1
            try:
                indexer.index_object(key)
            except IndexerTimeout as e:
                self.work_result['failed'] += 1
                logger.warning(f'{self.work_id} timed out during index visitation: {e}')
                break
            except Exception:
                self.work_result['failed'] += 1
                logger.warning(f'Index visitation failed for {key}', exc_info=True)
            else:
                self.work_result['indexed'] += 1
            self.marker = blobs.start_after_key
            self.token = blobs.token
        else:
            self._status = WalkerStatus.finished.name
    finally:
        executor.shutdown(False)
def _test_file_put_cached(self, replica: Replica, scheme: str, test_bucket: str, test_checkout_bucket: str,
                          uploader: Uploader):
    stored_cache_criteria = os.environ.get('CHECKOUT_CACHE_CRITERIA')
    try:
        os.environ['CHECKOUT_CACHE_CRITERIA'] = '[{"type":"application/json","max_size":12314}]'
        handle = Config.get_blobstore_handle(replica)
        src_key = generate_test_key()
        src_data = b'{"status":"valid"}'
        source_url = f"{scheme}://{test_bucket}/{src_key}"
        file_uuid = str(uuid.uuid4())
        bundle_uuid = str(uuid.uuid4())
        version = datetime_to_version_format(datetime.datetime.utcnow())

        # write dummy file and upload to upload area
        with tempfile.NamedTemporaryFile(delete=True) as fh:
            fh.write(src_data)
            fh.flush()
            uploader.checksum_and_upload_file(fh.name, src_key, "application/json")

        # upload file to DSS
        self.upload_file(source_url, file_uuid, bundle_uuid=bundle_uuid, version=version)

        metadata = handle.get_user_metadata(test_bucket, src_key)
        dst_key = ("blobs/" + ".".join([metadata['hca-dss-sha256'],
                                        metadata['hca-dss-sha1'],
                                        metadata['hca-dss-s3_etag'],
                                        metadata['hca-dss-crc32c']])).lower()

        for wait_to_upload_into_checkout_bucket in range(30):
            try:
                # get uploaded blob key from the checkout bucket
                file_metadata = json.loads(handle.get(test_checkout_bucket, dst_key).decode("utf-8"))
                break
            except BlobNotFoundError:
                time.sleep(1)
        else:
            file_metadata = json.loads(handle.get(test_checkout_bucket, dst_key).decode("utf-8"))
        assert file_metadata["status"] == "valid"  # the file exists in the checkout bucket
    finally:
        os.environ['CHECKOUT_CACHE_CRITERIA'] = stored_cache_criteria
def _list_checkout_bundle(
        replica: Replica,
        bundle_uuid: str,
        bundle_version: typing.Optional[str],
) -> typing.List[typing.Tuple[str, dict]]:
    """
    Lists the contents of a bundle in checkout.
    :param replica: Cloud replica
    :param bundle_uuid: Bundle UUID
    :param bundle_version: Bundle version
    :return: List of checkout bundle contents
    """
    handle = Config.get_blobstore_handle(replica)
    prefix = get_dst_bundle_prefix(bundle_uuid, bundle_version)
    return list(handle.list_v2(replica.checkout_bucket, prefix))
def delete(uuid: str, replica: str, json_request_body: dict, version: str = None):
    email = request.token_info['email']
    if email not in ADMIN_USER_EMAILS:
        raise DSSException(
            requests.codes.forbidden,
            "forbidden",
            "You can't delete bundles with these credentials!",
        )

    uuid = uuid.lower()
    version = datetime_to_version_format(iso8601.parse_date(version)) if version else None

    tombstone_id = TombstoneID(uuid=uuid, version=version)
    bundle_prefix = tombstone_id.to_key_prefix()
    tombstone_object_data = _create_tombstone_data(
        email=email,
        reason=json_request_body.get('reason'),
        version=version,
    )

    handle = Config.get_blobstore_handle(Replica[replica])
    bucket = Replica[replica].bucket
    if test_object_exists(handle, bucket, bundle_prefix, test_type=ObjectTest.PREFIX):
        created, idempotent = _idempotent_save(handle, bucket, tombstone_id.to_key(), tombstone_object_data)
        if not idempotent:
            raise DSSException(
                requests.codes.conflict,
                "bundle_tombstone_already_exists",
                f"bundle tombstone with UUID {uuid} and version {version} already exists",
            )
        status_code = requests.codes.ok
        response_body = dict()  # type: dict
    else:
        status_code = requests.codes.not_found
        response_body = dict(title="bundle not found")

    return jsonify(response_body), status_code
def mark_bundle_checkout_successful(
        execution_id: str,
        replica: Replica,
        sts_bucket: str,
        dst_bucket: str,
        dst_location: str,
):
    handle = Config.get_blobstore_handle(replica)
    data = {
        _STATUS_KEY: 'SUCCEEDED',
        _LOCATION_KEY: f"{replica.storage_schema}://{dst_bucket}/{dst_location}"
    }
    handle.upload_file_handle(sts_bucket,
                              _bundle_checkout_status_key(execution_id),
                              io.BytesIO(json.dumps(data).encode("utf-8")))
def _verify_checkout(
        replica: Replica,
        token: typing.Optional[str],
        file_metadata: dict,
        blob_path: str,
) -> typing.Tuple[str, bool]:
    cloud_handle = Config.get_blobstore_handle(replica)
    hca_handle = Config.get_hcablobstore_handle(replica)
    try:
        now = datetime.datetime.now(datetime.timezone.utc)
        creation_date = cloud_handle.get_creation_date(replica.checkout_bucket, blob_path)
        stale_after_date = creation_date + datetime.timedelta(days=int(os.environ['DSS_BLOB_PUBLIC_TTL_DAYS']))
        expiration_date = (creation_date
                           + datetime.timedelta(days=int(os.environ['DSS_BLOB_TTL_DAYS']))
                           - datetime.timedelta(hours=1))

        if now < expiration_date:
            if now > stale_after_date:
                start_file_checkout(replica, blob_path)
            if hca_handle.verify_blob_checksum_from_dss_metadata(replica.checkout_bucket, blob_path, file_metadata):
                return "", True
            else:
                logger.error(f"Checksum verification failed for file {replica.checkout_bucket}/{blob_path}")
    except BlobNotFoundError:
        pass

    decoded_token: dict
    if token is None:
        execution_id = start_file_checkout(replica, blob_path)
        start_time = time.time()
        attempts = 0

        decoded_token = {
            CheckoutTokenKeys.EXECUTION_ID: execution_id,
            CheckoutTokenKeys.START_TIME: start_time,
            CheckoutTokenKeys.ATTEMPTS: attempts
        }
    else:
        try:
            decoded_token = json.loads(token)
            decoded_token[CheckoutTokenKeys.ATTEMPTS] += 1
        except (KeyError, ValueError) as ex:
            raise DSSException(requests.codes.bad_request, "illegal_token", "Could not understand token", ex)

    encoded_token = json.dumps(decoded_token)
    return encoded_token, False
def build_bundle_metadata_document(replica: Replica, key: str) -> dict:
    """
    This returns a JSON document with bundle manifest and metadata files suitable for JMESPath filters.
    """
    handle = Config.get_blobstore_handle(replica)
    manifest = json.loads(handle.get(replica.bucket, key).decode("utf-8"))
    fqid = BundleFQID.from_key(key)
    bundle_info = dict(uuid=fqid.uuid, version=fqid.version)
    if key.endswith(TOMBSTONE_SUFFIX):
        return dict(event_type="TOMBSTONE", bundle_info=bundle_info, **manifest)
    else:
        lock = threading.Lock()
        files: dict = defaultdict(list)

        def _read_file(file_metadata):
            blob_key = "blobs/{}.{}.{}.{}".format(
                file_metadata['sha256'],
                file_metadata['sha1'],
                file_metadata['s3-etag'],
                file_metadata['crc32c'],
            )
            contents = handle.get(replica.bucket, blob_key).decode("utf-8")
            try:
                file_info = json.loads(contents)
            except json.decoder.JSONDecodeError:
                logging.info(f"{file_metadata['name']} not json decodable")
            else:
                # Modify name to avoid confusion with JMESPath syntax
                name = _dot_to_underscore_and_strip_numeric_suffix(file_metadata['name'])
                with lock:
                    files[name].append(file_info)

        # TODO: Consider scaling parallelization with Lambda size
        with ThreadPoolExecutor(max_workers=4) as e:
            e.map(_read_file, [file_metadata for file_metadata in manifest['files']
                               if file_metadata['content-type'].startswith("application/json")])

        return dict(event_type="CREATE", bundle_info=bundle_info, manifest=manifest, files=dict(files))
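# A minimal sketch of querying the kind of document build_bundle_metadata_document returns with a
# JMESPath expression. Assumptions: the jmespath package is installed, and the document shape and
# file name shown here are illustrative only, not taken from a real bundle.
import jmespath

doc = {
    "event_type": "CREATE",
    "bundle_info": {"uuid": "0123", "version": "2019-01-01T000000.000000Z"},
    "manifest": {"files": []},
    "files": {"donor_organism_json": [{"biomaterial_core": {"ncbi_taxon_id": [9606]}}]},
}
print(jmespath.search("files.donor_organism_json[].biomaterial_core.ncbi_taxon_id[]", doc))  # [9606]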
def walker_walk(self) -> None:
    columns = []
    for replica, bucket, key, token in zip(self.replicas, self.buckets, self.row, self.tokens):
        replica = Replica[replica]
        if bucket is None:
            bucket = replica.bucket
        elif bucket != replica.bucket:
            logger.warning(f'Checking bucket {bucket} instead of default {replica.bucket} for replica {replica}.')
        handle = Config.get_blobstore_handle(replica)
        column: PagedIter = handle.list_v2(bucket,
                                           prefix='bundles/' + self.work_id,
                                           token=token,
                                           start_after_key=key)
        columns.append(column)

    diff = zipalign(columns=map(iter, columns), row=self.row)
    while self.shutdown_time < self.remaining_runtime():
        try:
            row = next(diff)
        except StopIteration:
            logger.info("Finished checking replicas.")
            self._status = WalkerStatus.finished.name
            break
        else:
            for i, key in enumerate(row.norm()):
                replica = self.replicas[i]
                if key is None:
                    logger.warning(f"Replica {replica} is missing {row.min}")
                    self.work_result['missing'][i] += 1
                else:
                    logger.debug(f"Replica {replica} contains {key}")
                    self.work_result['present'][i] += 1
            self.row = row.values
    else:
        self.tokens = [column.token for column in columns]
        logger.debug('Not enough time left in lambda execution, exiting.')
def _walk(self) -> None:
    executor = ThreadPoolExecutor(len(DEFAULT_BACKENDS))
    # We can't use executor as context manager because we don't want shutting it down to block
    try:
        backend = CompositeIndexBackend(executor,
                                        DEFAULT_BACKENDS,
                                        dryrun=self.dryrun,
                                        notify=self.notify,
                                        context=self._context)
        indexer_cls = Indexer.for_replica(Replica[self.replica])
        indexer = indexer_cls(backend)
        handle = Config.get_blobstore_handle(Replica[self.replica])
        default_bucket = Replica[self.replica].bucket
        if self.bucket != default_bucket:
            logger.warning(f'Indexing bucket {self.bucket} instead of default {default_bucket}.')
        blobs = handle.list_v2(
            self.bucket,
            prefix=f'bundles/{self.work_id}',
            start_after_key=self.marker,  # type: ignore  # Cannot determine type of 'marker'
            token=self.token  # type: ignore  # Cannot determine type of 'token'
        )
        for key in blobs:
            # Timing out while recording paging info could cause an inconsistent paging state, leading to repeats
            # of large amounts of work. This can be avoided by checking for timeouts only during actual
            # re-indexing.
            timeout = self.remaining_runtime() - 10  # ten seconds of safety for letting lambda shut down
            if timeout < 10:
                # don't even try to index an item with less than 10 seconds left
                logger.warning(f'{self.work_id} timed out during reindex')
                return
            backend._timeout = timeout
            self.process_item(indexer, key)
            self.marker = blobs.start_after_key
            self.token = blobs.token
        else:
            self._status = WalkerStatus.finished.name
    finally:
        executor.shutdown(False)
def delete(uuid: str, replica: str, json_request_body: dict, version: str = None):
    email = security.get_token_email(request.token_info)
    if email not in ADMIN_USER_EMAILS:
        raise DSSForbiddenException("You can't delete bundles with these credentials!")

    uuid = uuid.lower()
    tombstone_id = BundleTombstoneID(uuid=uuid, version=version)
    bundle_prefix = tombstone_id.to_key_prefix()
    tombstone_object_data = _create_tombstone_data(
        email=email,
        reason=json_request_body.get('reason'),
        version=version,
    )

    handle = Config.get_blobstore_handle(Replica[replica])
    if not test_object_exists(handle, Replica[replica].bucket, bundle_prefix, test_type=ObjectTest.PREFIX):
        raise DSSException(404, "not_found", "Cannot find bundle!")

    created, idempotent = idempotent_save(
        handle,
        Replica[replica].bucket,
        tombstone_id.to_key(),
        json.dumps(tombstone_object_data).encode("utf-8"),
    )
    if not idempotent:
        raise DSSException(
            requests.codes.conflict,
            "bundle_tombstone_already_exists",
            f"bundle tombstone with UUID {uuid} and version {version} already exists",
        )

    return dict(), requests.codes.ok
def get(
        uuid: str,
        replica: str,
        per_page: int,
        version: str = None,
        directurls: bool = False,
        presignedurls: bool = False,
        token: str = None,
        start_at: int = 0,
):
    if directurls and presignedurls:
        raise DSSException(
            requests.codes.bad_request,
            "only_one_urltype",
            "only enable one of `directurls` or `presignedurls`")

    _replica = Replica[replica]
    bundle_metadata = get_bundle_manifest(uuid, _replica, version)
    if bundle_metadata is None:
        raise DSSException(404, "not_found", "Cannot find bundle!")
    if version is None:
        version = bundle_metadata[BundleMetadata.VERSION]

    if directurls or presignedurls:
        try:
            token, ready = verify_checkout(_replica, uuid, version, token)
        except TokenError as ex:
            raise DSSException(requests.codes.bad_request, "illegal_token", "Could not understand token", ex)
        except CheckoutError as ex:
            raise DSSException(requests.codes.server_error, "checkout_error", "Could not complete checkout", ex)
        if not ready:
            builder = UrlBuilder(request.url)
            builder.replace_query("token", token)
            response = redirect(str(builder), code=requests.codes.moved)
            headers = response.headers
            headers['Retry-After'] = RETRY_AFTER_INTERVAL
            return response

    all_files = bundle_metadata[BundleMetadata.FILES]

    link = None
    if len(all_files) - start_at > per_page:
        next_url = UrlBuilder(request.url)
        next_url.replace_query("start_at", str(start_at + per_page))
        next_url.replace_query("version", version)
        next_url.replace_query("token", token)
        link = f"<{next_url}>; rel='next'"

    files = all_files[start_at:start_at + per_page]

    filesresponse = []  # type: typing.List[dict]
    for _file in files:
        file_version = {
            'name': _file[BundleFileMetadata.NAME],
            'content-type': _file[BundleFileMetadata.CONTENT_TYPE],
            'size': _file[BundleFileMetadata.SIZE],
            'uuid': _file[BundleFileMetadata.UUID],
            'version': _file[BundleFileMetadata.VERSION],
            'crc32c': _file[BundleFileMetadata.CRC32C],
            's3_etag': _file[BundleFileMetadata.S3_ETAG],
            'sha1': _file[BundleFileMetadata.SHA1],
            'sha256': _file[BundleFileMetadata.SHA256],
            'indexed': _file[BundleFileMetadata.INDEXED],
        }
        if directurls:
            file_version['url'] = str(UrlBuilder().set(
                scheme=_replica.storage_schema,
                netloc=_replica.checkout_bucket,
                path="{}/{}".format(
                    get_dst_bundle_prefix(uuid, bundle_metadata[BundleMetadata.VERSION]),
                    _file[BundleFileMetadata.NAME],
                ),
            ))
        elif presignedurls:
            handle = Config.get_blobstore_handle(_replica)
            file_version['url'] = handle.generate_presigned_GET_url(
                _replica.checkout_bucket,
                "{}/{}".format(
                    get_dst_bundle_prefix(uuid, bundle_metadata[BundleMetadata.VERSION]),
                    _file[BundleFileMetadata.NAME],
                ),
            )
        filesresponse.append(file_version)

    response_body = dict(bundle=dict(
        uuid=uuid,
        version=bundle_metadata[BundleMetadata.VERSION],
        files=filesresponse,
        creator_uid=bundle_metadata[BundleMetadata.CREATOR_UID],
    ))

    if link is None:
        response = make_response(jsonify(response_body), requests.codes.ok)
        response.headers['X-OpenAPI-Pagination'] = 'false'
    else:
        response = make_response(jsonify(response_body), requests.codes.partial)
        response.headers['X-OpenAPI-Pagination'] = 'true'
        response.headers['Link'] = link

    response.headers['X-OpenAPI-Paginated-Content-Key'] = 'bundle.files'
    return response
def build_bundle_file_metadata(replica: Replica, user_supplied_files: dict):
    handle = Config.get_blobstore_handle(replica)

    time_left = nestedcontext.inject("time_left")

    # decode the list of files.
    files = [{'user_supplied_metadata': _file} for _file in user_supplied_files]

    def _get_file_metadata(_file):
        metadata_key = FileFQID(
            uuid=_file['user_supplied_metadata']['uuid'],
            version=_file['user_supplied_metadata']['version'],
        ).to_key()
        while True:
            try:
                file_metadata = handle.get(replica.bucket, metadata_key)
            except BlobNotFoundError:
                if time_left() > PUT_TIME_ALLOWANCE_SECONDS:
                    time.sleep(1)
                else:
                    break
            else:
                return json.loads(file_metadata)
        return None

    # TODO: Consider scaling parallelization with Lambda size
    with ThreadPoolExecutor(max_workers=20) as e:
        futures = {e.submit(_get_file_metadata, _file): _file for _file in files}
        for future in as_completed(futures):
            _file = futures[future]
            res = future.result()
            if res is not None:
                _file['file_metadata'] = res
            else:
                missing_file_user_metadata = _file['user_supplied_metadata']
                raise DSSException(
                    requests.codes.bad_request,
                    "file_missing",
                    f"Could not find file {missing_file_user_metadata['uuid']}/{missing_file_user_metadata['version']}."
                )

    return [
        {
            BundleFileMetadata.NAME: _file['user_supplied_metadata']['name'],
            BundleFileMetadata.UUID: _file['user_supplied_metadata']['uuid'],
            BundleFileMetadata.VERSION: _file['user_supplied_metadata']['version'],
            BundleFileMetadata.CONTENT_TYPE: _file['file_metadata'][FileMetadata.CONTENT_TYPE],
            BundleFileMetadata.SIZE: _file['file_metadata'][FileMetadata.SIZE],
            BundleFileMetadata.INDEXED: _file['user_supplied_metadata']['indexed'],
            BundleFileMetadata.CRC32C: _file['file_metadata'][FileMetadata.CRC32C],
            BundleFileMetadata.S3_ETAG: _file['file_metadata'][FileMetadata.S3_ETAG],
            BundleFileMetadata.SHA1: _file['file_metadata'][FileMetadata.SHA1],
            BundleFileMetadata.SHA256: _file['file_metadata'][FileMetadata.SHA256],
        }
        for _file in files
    ]
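# Standalone sketch (illustrative names only) of the futures-to-item mapping used above: one task
# per file, with as_completed() used to recover which input each finished future belongs to, so a
# missing item can be reported precisely.
from concurrent.futures import ThreadPoolExecutor, as_completed

def _lookup(item):
    return item * 2  # stand-in for handle.get(replica.bucket, metadata_key)

items = [1, 2, 3]
with ThreadPoolExecutor(max_workers=3) as pool:
    futures = {pool.submit(_lookup, item): item for item in items}
    for future in as_completed(futures):
        print(futures[future], "->", future.result())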
def get_bundle_checkout_status(execution_id: str, replica: Replica, sts_bucket: str):
    handle = Config.get_blobstore_handle(replica)
    return json.loads(handle.get(sts_bucket, _bundle_checkout_status_key(execution_id)))
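# Hypothetical polling loop built on the checkout status helpers above. Assumptions: _STATUS_KEY
# maps to "status" (as written by mark_bundle_checkout_started/mark_bundle_checkout_successful),
# a "FAILED" status exists alongside "SUCCEEDED", and the timeout and sleep interval are illustrative.
import time

def wait_for_bundle_checkout(execution_id: str, replica: Replica, sts_bucket: str, timeout: int = 300) -> dict:
    deadline = time.time() + timeout
    while time.time() < deadline:
        status = get_bundle_checkout_status(execution_id, replica, sts_bucket)
        if status.get("status") in ("SUCCEEDED", "FAILED"):
            return status
        time.sleep(5)
    raise TimeoutError(f"checkout {execution_id} did not finish within {timeout}s")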
def validate_dst_bucket(replica: Replica, dst_bucket: str) -> bool:
    if not Config.get_blobstore_handle(replica).check_bucket_exists(dst_bucket):
        raise DestinationBucketNotFoundError(f"Bucket {dst_bucket} doesn't exist")
    return touch_test_file(replica, dst_bucket)
def put(uuid: str, json_request_body: dict, version: str = None):
    class CopyMode(Enum):
        NO_COPY = auto()
        COPY_INLINE = auto()
        COPY_ASYNC = auto()

    uuid = uuid.lower()
    if version is not None:
        # convert it to date-time so we can format exactly as the system requires (with microsecond precision)
        timestamp = iso8601.parse_date(version)
    else:
        timestamp = datetime.datetime.utcnow()
    version = datetime_to_version_format(timestamp)

    source_url = json_request_body['source_url']
    cre = re.compile(
        "^"
        "(?P<schema>(?:s3|gs|wasb))"
        "://"
        "(?P<bucket>[^/]+)"
        "/"
        "(?P<key>.+)"
        "$")
    mobj = cre.match(source_url)
    if mobj and mobj.group('schema') == "s3":
        replica = Replica.aws
    elif mobj and mobj.group('schema') == "gs":
        replica = Replica.gcp
    else:
        schema = mobj.group('schema')
        raise DSSException(
            requests.codes.bad_request,
            "unknown_source_schema",
            f"source_url schema {schema} not supported")

    handle = Config.get_blobstore_handle(replica)
    hca_handle = Config.get_hcablobstore_handle(replica)
    dst_bucket = replica.bucket

    src_bucket = mobj.group('bucket')
    src_key = mobj.group('key')

    metadata = handle.get_user_metadata(src_bucket, src_key)
    size = handle.get_size(src_bucket, src_key)
    content_type = handle.get_content_type(src_bucket, src_key)

    # format all the checksums so they're lower-case.
    for metadata_spec in HCABlobStore.MANDATORY_METADATA.values():
        if metadata_spec['downcase']:
            keyname = typing.cast(str, metadata_spec['keyname'])
            metadata[keyname] = metadata[keyname].lower()

    # what's the target object name for the actual data?
    dst_key = ("blobs/" + ".".join((
        metadata['hca-dss-sha256'],
        metadata['hca-dss-sha1'],
        metadata['hca-dss-s3_etag'],
        metadata['hca-dss-crc32c'],
    ))).lower()

    # does it exist? if so, we can skip the copy part.
    copy_mode = CopyMode.COPY_INLINE
    try:
        if hca_handle.verify_blob_checksum(dst_bucket, dst_key, metadata):
            copy_mode = CopyMode.NO_COPY
    except BlobNotFoundError:
        pass

    # build the json document for the file metadata.
    file_metadata = {
        FileMetadata.FORMAT: FileMetadata.FILE_FORMAT_VERSION,
        FileMetadata.BUNDLE_UUID: json_request_body['bundle_uuid'],
        FileMetadata.CREATOR_UID: json_request_body['creator_uid'],
        FileMetadata.VERSION: version,
        FileMetadata.CONTENT_TYPE: content_type,
        FileMetadata.SIZE: size,
        FileMetadata.CRC32C: metadata['hca-dss-crc32c'],
        FileMetadata.S3_ETAG: metadata['hca-dss-s3_etag'],
        FileMetadata.SHA1: metadata['hca-dss-sha1'],
        FileMetadata.SHA256: metadata['hca-dss-sha256'],
    }
    file_metadata_json = json.dumps(file_metadata)

    if copy_mode != CopyMode.NO_COPY and size > ASYNC_COPY_THRESHOLD:
        copy_mode = CopyMode.COPY_ASYNC

    if copy_mode == CopyMode.COPY_ASYNC:
        if replica == Replica.aws:
            state = s3copyclient.copy_write_metadata_sfn_event(
                src_bucket, src_key,
                dst_bucket, dst_key,
                uuid, version,
                file_metadata_json,
            )
            state_machine_name_template = "dss-s3-copy-write-metadata-sfn-{stage}"
        elif replica == Replica.gcp:
            state = gscopyclient.copy_write_metadata_sfn_event(
                src_bucket, src_key,
                dst_bucket, dst_key,
                uuid, version,
                file_metadata_json,
            )
            state_machine_name_template = "dss-gs-copy-write-metadata-sfn-{stage}"
        else:
            raise ValueError("Unhandled replica")

        execution_id = str(uuid4())
        stepfunctions.step_functions_invoke(state_machine_name_template, execution_id, state)
        return jsonify(dict(task_id=execution_id, version=version)), requests.codes.accepted
    elif copy_mode == CopyMode.COPY_INLINE:
        handle.copy(src_bucket, src_key, dst_bucket, dst_key)

        # verify the copy was done correctly.
        assert hca_handle.verify_blob_checksum(dst_bucket, dst_key, metadata)

    try:
        write_file_metadata(handle, dst_bucket, uuid, version, file_metadata_json)
        status_code = requests.codes.created
    except BlobAlreadyExistsError:
        # fetch the file metadata, compare it to what we have.
        existing_file_metadata = json.loads(
            handle.get(dst_bucket, "files/{}.{}".format(uuid, version)).decode("utf-8"))
        if existing_file_metadata != file_metadata:
            raise DSSException(
                requests.codes.conflict,
                "file_already_exists",
                f"file with UUID {uuid} and version {version} already exists")
        status_code = requests.codes.ok

    return jsonify(dict(version=version)), status_code
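# Quick standalone check of the source_url pattern compiled in put() above; the URL and bucket
# name are illustrative only.
import re

cre = re.compile(r"^(?P<schema>(?:s3|gs|wasb))://(?P<bucket>[^/]+)/(?P<key>.+)$")
mobj = cre.match("s3://org-example-upload/prefix/data.fastq.gz")
print(mobj.group('schema'), mobj.group('bucket'), mobj.group('key'))
# -> s3 org-example-upload prefix/data.fastq.gz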
def _test_bundle_put(self, replica: Replica, fixtures_bucket: str):
    schema = replica.storage_schema
    bundle_uuid = str(uuid.uuid4())
    file_uuid = str(uuid.uuid4())
    missing_file_uuid = str(uuid.uuid4())
    resp_obj = self.upload_file_wait(
        f"{schema}://{fixtures_bucket}/test_good_source_data/0",
        replica,
        file_uuid,
        bundle_uuid=bundle_uuid,
    )
    file_version = resp_obj.json['version']

    # first bundle.
    bundle_version = datetime_to_version_format(datetime.datetime.utcnow())
    self.put_bundle(
        replica,
        bundle_uuid,
        [(file_uuid, file_version, "LICENSE")],
        bundle_version,
    )

    # should be able to do this twice (i.e., same payload, same UUIDs)
    self.put_bundle(
        replica,
        bundle_uuid,
        [(file_uuid, file_version, "LICENSE")],
        bundle_version,
        requests.codes.ok,
    )

    # should *NOT* be able to do this twice with different payload.
    self.put_bundle(
        replica,
        bundle_uuid,
        [(file_uuid, file_version, "LICENSE1")],
        bundle_version,
        requests.codes.conflict,
    )

    # should *NOT* be able to upload a bundle with a missing file, but we should get requests.codes.conflict.
    with nestedcontext.bind(time_left=lambda: 0):
        resp_obj = self.put_bundle(
            replica,
            bundle_uuid,
            [
                (file_uuid, file_version, "LICENSE0"),
                (missing_file_uuid, file_version, "LICENSE1"),
            ],
            expected_code=requests.codes.conflict,
        )
        self.assertEqual(resp_obj.json['code'], "file_missing")

    # should *NOT* be able to upload a bundle containing a file with an incorrect bundle_uuid
    # but we should get requests.codes.conflict
    with nestedcontext.bind(time_left=lambda: 0):
        resp_obj = self.put_bundle(
            replica,
            str(uuid.uuid4()),  # uploading new bundle with old file
            [(file_uuid, file_version, "LICENSE")],
            datetime_to_version_format(datetime.datetime.utcnow()),
            expected_code=requests.codes.conflict,
        )
        self.assertEqual(resp_obj.json['code'], "incorrect_file_bundle_uuid")

    # uploads a file, but delete the file metadata. put it back after a delay.
    self.upload_file_wait(
        f"{schema}://{fixtures_bucket}/test_good_source_data/0",
        replica,
        missing_file_uuid,
        file_version,
        bundle_uuid=bundle_uuid,
    )
    handle = Config.get_blobstore_handle(replica)
    bucket = replica.bucket
    file_metadata = handle.get(bucket, f"files/{missing_file_uuid}.{file_version}")
    handle.delete(bucket, f"files/{missing_file_uuid}.{file_version}")

    class UploadThread(threading.Thread):
        def run(innerself):
            time.sleep(5)
            data_fh = io.BytesIO(file_metadata)
            handle.upload_file_handle(bucket, f"files/{missing_file_uuid}.{file_version}", data_fh)

    # start the upload (on a delay...)
    upload_thread = UploadThread()
    upload_thread.start()

    # this should at first fail to find one of the files, but the UploadThread will eventually upload the file
    # metadata. since we give the upload bundle process ample time to spin, it should eventually find the file
    # metadata and succeed.
    with nestedcontext.bind(time_left=lambda: sys.maxsize):
        self.put_bundle(
            replica,
            bundle_uuid,
            [
                (file_uuid, file_version, "LICENSE0"),
                (missing_file_uuid, file_version, "LICENSE1"),
            ],
            expected_code=requests.codes.created,
        )
def _test_file_get_checkout(self, replica: Replica, scheme: str, test_bucket: str, uploader: Uploader):
    handle = Config.get_blobstore_handle(replica)
    src_key = generate_test_key()
    src_data = os.urandom(1024)
    source_url = f"{scheme}://{test_bucket}/{src_key}"
    file_uuid = str(uuid.uuid4())
    bundle_uuid = str(uuid.uuid4())
    version = datetime_to_version_format(datetime.datetime.utcnow())

    # write dummy file and upload to upload area
    with tempfile.NamedTemporaryFile(delete=True) as fh:
        fh.write(src_data)
        fh.flush()
        uploader.checksum_and_upload_file(fh.name, src_key, "text/plain")

    # upload file to DSS
    self.upload_file(source_url, file_uuid, bundle_uuid=bundle_uuid, version=version)
    url = str(UrlBuilder()
              .set(path="/v1/files/" + file_uuid)
              .add_query("replica", replica.name)
              .add_query("version", version))

    # get uploaded blob key
    file_metadata = json.loads(handle.get(test_bucket, f"files/{file_uuid}.{version}").decode("utf-8"))
    file_key = compose_blob_key(file_metadata)

    @eventually(20, 1)
    def test_checkout():
        # assert 302 and verify checksum on checkout completion
        api_get = self.assertGetResponse(url,
                                         requests.codes.found,
                                         headers=get_auth_header(),
                                         redirect_follow_retries=0)
        file_get = requests.get(api_get.response.headers['Location'])
        self.assertTrue(file_get.ok)
        self.assertEqual(file_get.content, src_data)

    with self.subTest(f"{replica}: Initiates checkout and returns 301 for GET on 'uncheckedout' file."):
        # assert 301 redirect on first GET
        self.assertGetResponse(url,
                               requests.codes.moved,
                               headers=get_auth_header(),
                               redirect_follow_retries=0)
        test_checkout()

    with self.subTest(f"{replica}: Initiates checkout and returns 301 for GET on nearly expired checkout file."):
        now = datetime.datetime.now(datetime.timezone.utc)
        creation_date_fn = ("cloud_blobstore.s3.S3BlobStore.get_creation_date"
                            if replica.name == "aws"
                            else "cloud_blobstore.gs.GSBlobStore.get_creation_date")
        with mock.patch(creation_date_fn) as mock_creation_date:
            blob_ttl_days = int(os.environ['DSS_BLOB_TTL_DAYS'])
            mock_creation_date.return_value = now - datetime.timedelta(days=blob_ttl_days, hours=1, minutes=5)
            self.assertGetResponse(url,
                                   requests.codes.moved,
                                   headers=get_auth_header(),
                                   redirect_follow_retries=0)
        test_checkout()

    with self.subTest(f"{replica}: Initiates checkout and returns 302 immediately for GET on stale checkout file."):
        now = datetime.datetime.now(datetime.timezone.utc)
        creation_date = handle.get_creation_date(replica.checkout_bucket, file_key)
        creation_date_fn = ("cloud_blobstore.s3.S3BlobStore.get_creation_date"
                            if replica.name == "aws"
                            else "cloud_blobstore.gs.GSBlobStore.get_creation_date")
        with mock.patch(creation_date_fn) as mock_creation_date:
            # assert 302 found on stale file and that last modified refreshes
            blob_ttl_days = int(os.environ['DSS_BLOB_PUBLIC_TTL_DAYS'])
            mock_creation_date.return_value = now - datetime.timedelta(days=blob_ttl_days + 1)
            self.assertGetResponse(url,
                                   requests.codes.found,
                                   headers=get_auth_header(),
                                   redirect_follow_retries=0)
        self.assertTrue(creation_date > handle.get_creation_date(replica.checkout_bucket, file_key),
                        f'\ncurr_creation_date: {creation_date}'
                        f'\nprev_creation_date: {handle.get_creation_date(replica.checkout_bucket, file_key)}')

    handle.delete(test_bucket, f"files/{file_uuid}.{version}")
    handle.delete(replica.checkout_bucket, file_key)
def _test_file_get_direct(self, replica: Replica):
    """
    Verify that the direct URL option works for GET /files.
    """
    file_uuid = "ce55fd51-7833-469b-be0b-5da88ebebfcd"
    handle = Config.get_blobstore_handle(replica)

    direct_url_req = str(UrlBuilder()
                         .set(path="/v1/files/" + file_uuid)
                         .add_query("replica", replica.name)
                         .add_query("directurl", "True"))
    presigned_url_req = str(UrlBuilder()
                            .set(path="/v1/files/" + file_uuid)
                            .add_query("replica", replica.name))
    with override_bucket_config(BucketConfig.TEST_FIXTURE):
        native_resp_obj = self.assertGetResponse(
            direct_url_req,
            requests.codes.found,
            headers=get_auth_header(),
            redirect_follow_retries=FILE_GET_RETRY_COUNT,
            min_retry_interval_header=RETRY_AFTER_INTERVAL,
            override_retry_interval=1,
        )
        resp_obj = self.assertGetResponse(
            presigned_url_req,
            requests.codes.found,
            headers=get_auth_header(),
            redirect_follow_retries=FILE_GET_RETRY_COUNT,
            min_retry_interval_header=RETRY_AFTER_INTERVAL,
            override_retry_interval=1,
        )

    verify_headers = ['X-DSS-VERSION', 'X-DSS-CREATOR-UID', 'X-DSS-S3-ETAG', 'X-DSS-SHA256', 'X-DSS-SHA1',
                      'X-DSS-CRC32C']
    native_headers_verify = {k: v for k, v in native_resp_obj.response.headers.items() if k in verify_headers}
    presigned_headers_verify = {k: v for k, v in resp_obj.response.headers.items() if k in verify_headers}
    self.assertDictEqual(native_headers_verify, presigned_headers_verify)

    with self.subTest('Retry-After headers are not included in a successful response.'):
        self.assertEqual(native_resp_obj.response.headers.get('Retry-After'), None)

    self.assertTrue(native_resp_obj.response.headers['Location'].split('//')[0].startswith(replica.storage_schema))
    self.assertTrue(native_resp_obj.response.headers['Location'].split('//')[1].startswith(replica.checkout_bucket))

    blob_path = native_resp_obj.response.headers['Location'].split('/blobs/')[1]
    native_size = handle.get_size(replica.checkout_bucket, f'blobs/{blob_path}')
    self.assertGreater(native_size, 0)
    self.assertEqual(native_size, int(resp_obj.response.headers['X-DSS-SIZE']))