def _test_file_get_invalid_token(self, replica: Replica, scheme: str, test_bucket: str, uploader: Uploader):
    src_key = generate_test_key()
    src_data = os.urandom(1024)
    with tempfile.NamedTemporaryFile(delete=True) as fh:
        fh.write(src_data)
        fh.flush()
        uploader.checksum_and_upload_file(fh.name, src_key, "text/plain")

    source_url = f"{scheme}://{test_bucket}/{src_key}"
    file_uuid = str(uuid.uuid4())
    bundle_uuid = str(uuid.uuid4())
    version = datetime_to_version_format(datetime.datetime.utcnow())

    # upload the file so we can GET it with a malformed checkout token
    self.upload_file(source_url, file_uuid, bundle_uuid=bundle_uuid, version=version)

    url = str(UrlBuilder()
              .set(path="/v1/files/" + file_uuid)
              .add_query("replica", replica.name)
              .add_query("version", version)
              .add_query("token", "{}"))

    @eventually(30, 0.1)
    def try_get():
        self.assertGetResponse(url, requests.codes.bad_request, headers=get_auth_header())
    try_get()
def upload_file(self, contents):
    s3_test_bucket = get_env("DSS_S3_BUCKET_TEST")
    src_key = generate_test_key()
    s3 = boto3.resource('s3')
    with io.BytesIO(json.dumps(contents).encode()) as fh, ChecksummingSink() as sink:
        sink.write(fh.read())
        sums = sink.get_checksums()
        metadata = {
            'hca-dss-crc32c': sums['crc32c'].lower(),
            'hca-dss-s3_etag': sums['s3_etag'].lower(),
            'hca-dss-sha1': sums['sha1'].lower(),
            'hca-dss-sha256': sums['sha256'].lower(),
        }
        fh.seek(0)
        # TODO: consider switching to unmanaged uploader (putobject w/blob)
        s3.Bucket(s3_test_bucket).Object(src_key).upload_fileobj(fh, ExtraArgs={"Metadata": metadata})

    source_url = f"s3://{s3_test_bucket}/{src_key}"
    file_uuid = str(uuid4())
    version = datetime_to_version_format(datetime.utcnow())
    urlbuilder = UrlBuilder().set(path='/v1/files/' + file_uuid)
    urlbuilder.add_query("version", version)

    resp_obj = self.assertPutResponse(str(urlbuilder),
                                      requests.codes.created,
                                      json_request_body=dict(creator_uid=0, source_url=source_url))
    return file_uuid, resp_obj.json["version"]
def _test_bundle_delete(self, replica: Replica, fixtures_bucket: str, authorized: bool):
    schema = replica.storage_schema

    # prep existing bundle
    bundle_uuid = str(uuid.uuid4())
    file_uuid = str(uuid.uuid4())
    resp_obj = self.upload_file_wait(
        f"{schema}://{fixtures_bucket}/test_good_source_data/0",
        replica,
        file_uuid,
        bundle_uuid=bundle_uuid,
    )
    file_version = resp_obj.json['version']

    bundle_version = datetime_to_version_format(datetime.datetime.utcnow())
    self.put_bundle(
        replica,
        bundle_uuid,
        [(file_uuid, file_version, "LICENSE")],
        bundle_version,
    )

    handle = Config.get_blobstore_handle(replica)
    bucket = replica.bucket

    self.delete_bundle(replica, bundle_uuid, authorized=authorized)
    tombstone_exists = test_object_exists(handle, bucket, f"bundles/{bundle_uuid}.dead")
    self.assertEqual(tombstone_exists, authorized)

    self.delete_bundle(replica, bundle_uuid, bundle_version, authorized=authorized)
    tombstone_exists = test_object_exists(handle, bucket, f"bundles/{bundle_uuid}.{bundle_version}.dead")
    self.assertEqual(tombstone_exists, authorized)
def _upload_bundle(self, replica, uuid=None):
    if replica == Replica.aws:
        test_fixtures_bucket = get_env('DSS_S3_BUCKET_TEST_FIXTURES')
    else:
        test_fixtures_bucket = get_env('DSS_GS_BUCKET_TEST_FIXTURES')
    bundle_uuid = uuid if uuid else str(uuid4())
    file_uuid_1 = str(uuid4())
    file_uuid_2 = str(uuid4())
    filenames = ["file_1", "file_2"]
    resp_obj_1 = self.upload_file_wait(
        f"{replica.storage_schema}://{test_fixtures_bucket}/test_good_source_data/0",
        replica,
        file_uuid_1,
        bundle_uuid=bundle_uuid,
    )
    resp_obj_2 = self.upload_file_wait(
        f"{replica.storage_schema}://{test_fixtures_bucket}/test_good_source_data/1",
        replica,
        file_uuid_2,
        bundle_uuid=bundle_uuid,
    )
    file_version_1 = resp_obj_1.json['version']
    file_version_2 = resp_obj_2.json['version']
    bundle_version = datetime_to_version_format(datetime.datetime.utcnow())
    self.put_bundle(
        replica,
        bundle_uuid,
        [(file_uuid_1, file_version_1, filenames[0]),
         (file_uuid_2, file_version_2, filenames[1])],
        bundle_version,
    )
    return bundle_uuid, bundle_version
def _test_file_put(self, replica: Replica, scheme: str, test_bucket: str, uploader: Uploader):
    src_key = generate_test_key()
    src_data = os.urandom(1024)
    with tempfile.NamedTemporaryFile(delete=True) as fh:
        fh.write(src_data)
        fh.flush()
        uploader.checksum_and_upload_file(fh.name, src_key, "text/plain")

    source_url = f"{scheme}://{test_bucket}/{src_key}"
    file_uuid = str(uuid.uuid4())
    bundle_uuid = str(uuid.uuid4())
    version = datetime_to_version_format(datetime.datetime.utcnow())

    # should be able to do this twice (i.e., same payload, different UUIDs)
    self.upload_file(source_url, file_uuid, bundle_uuid=bundle_uuid, version=version)
    self.upload_file(source_url, str(uuid.uuid4()))

    # should be able to do this twice (i.e., same payload, same UUIDs)
    self.upload_file(source_url, file_uuid, bundle_uuid=bundle_uuid, version=version,
                     expected_code=requests.codes.ok)

    # should *NOT* be able to do this twice (i.e., different payload, same UUIDs)
    self.upload_file(source_url, file_uuid, version=version, expected_code=requests.codes.conflict)
def _put(self,
         contents: typing.List,
         authorized: bool = True,
         uuid: typing.Optional[str] = None,
         version: typing.Optional[str] = None,
         replica: str = 'aws') -> typing.Tuple[str, str]:
    uuid = str(uuid4()) if uuid is None else uuid
    version = datetime_to_version_format(datetime.now()) if version is None else version

    params = dict()
    if uuid != 'missing':
        params['uuid'] = uuid
    if version != 'missing':
        params['version'] = version
    if replica != 'missing':
        params['replica'] = replica

    res = self.app.put("/v1/collections",
                       headers=get_auth_header(authorized=authorized),
                       params=params,
                       json=dict(name="n", description="d", details={}, contents=contents))
    return res.json()["uuid"], res.json()["version"]
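# Hypothetical usage of _put from a test case (values assumed): the sentinel
# string 'missing' drops that query parameter entirely, which lets tests
# exercise requests that omit uuid, version, or replica.
#
#     uuid_, version = self._put(contents=[])       # normal creation
#     self._put(contents=[], version='missing')     # PUT with no version param
#     self._put(contents=[], replica='missing')     # PUT with no replica param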
def patch(uuid: str, json_request_body: dict, replica: str, version: str):
    bundle = get_bundle_manifest(uuid, Replica[replica], version)
    if bundle is None:
        raise DSSException(404, "not_found", "Could not find bundle for UUID {}".format(uuid))

    remove_files_set = {bundle_file_id_metadata(f) for f in json_request_body.get("remove_files", [])}
    bundle['files'] = [f for f in bundle['files'] if bundle_file_id_metadata(f) not in remove_files_set]
    add_files = json_request_body.get("add_files", [])
    bundle['files'].extend(build_bundle_file_metadata(Replica[replica], add_files))
    detect_filename_collisions(bundle['files'])

    timestamp = datetime.datetime.utcnow()
    new_bundle_version = datetime_to_version_format(timestamp)
    bundle['version'] = new_bundle_version
    _save_bundle(Replica[replica], uuid, new_bundle_version, bundle)
    return jsonify(dict(uuid=uuid, version=new_bundle_version)), requests.codes.ok
def __init__(self,
             handle: BlobStore,
             path: str,
             bucket: str,
             replica: Replica = Replica.aws,
             bundle_uuid: str = None) -> None:
    self.path = path
    self.uuid = bundle_uuid if bundle_uuid else str(uuid.uuid4())
    self.version = datetime_to_version_format(datetime.datetime.utcnow())
    self.handle = handle
    self.bucket = bucket
    self.files = self.enumerate_bundle_files(replica)
def upload_file_wait(
        self: typing.Any,
        source_url: str,
        replica: Replica,
        file_uuid: str = None,
        file_version: str = None,
        bundle_uuid: str = None,
        timeout_seconds: int = 120,
        expect_async: typing.Optional[bool] = None,
) -> DSSAssertResponse:
    """
    Upload a file.  If the request is handled asynchronously, wait until the
    file has landed in the data store.
    """
    file_uuid = str(uuid.uuid4()) if file_uuid is None else file_uuid
    bundle_uuid = str(uuid.uuid4()) if bundle_uuid is None else bundle_uuid
    if expect_async is True:
        expected_codes = requests.codes.accepted
    elif expect_async is False:
        expected_codes = requests.codes.created
    else:
        expected_codes = requests.codes.created, requests.codes.accepted

    if file_version is None:
        timestamp = datetime.datetime.utcnow()
        file_version = datetime_to_version_format(timestamp)
    url = UrlBuilder().set(path=f"/v1/files/{file_uuid}")
    url.add_query("version", file_version)

    resp_obj = self.assertPutResponse(
        str(url),
        expected_codes,
        json_request_body=dict(
            bundle_uuid=bundle_uuid,
            creator_uid=0,
            source_url=source_url,
        ),
    )

    if resp_obj.response.status_code == requests.codes.accepted:
        # poll the HEAD /files endpoint until we succeed.
        start_time = time.time()
        timeout_time = start_time + timeout_seconds
        while time.time() < timeout_time:
            try:
                self.assertHeadResponse(f"/v1/files/{file_uuid}?replica={replica.name}",
                                        requests.codes.ok)
                break
            except AssertionError:
                pass
            time.sleep(1)
        else:
            self.fail("Could not find the output file")

    return resp_obj
def enumerate(replica: str,
              prefix: typing.Optional[str] = None,
              token: typing.Optional[str] = None,
              per_page: int = PerPageBounds.per_page_max,
              search_after: typing.Optional[str] = None):
    """
    :param replica: replica name to enumerate against
    :param prefix: uuid prefix used to filter enumeration
    :param token: used to page searches; should not be set by the user.
    :param per_page: max items per page to show, 10 <= per_page <= 500
    :param search_after: used to page searches; should not be set by the user.
    """
    if prefix:
        search_prefix = f'{BUNDLE_PREFIX}/{prefix.lower()}'
    else:
        search_prefix = f'{BUNDLE_PREFIX}/'

    api_domain_name = f'https://{os.environ.get("API_DOMAIN_NAME")}'
    payload = dict(dss_api=api_domain_name,
                   object='list',
                   per_page=per_page,
                   search_prefix=search_prefix,
                   event_timestamp=datetime_to_version_format(datetime.datetime.utcnow()))  # type: typing.Any

    kwargs = dict(replica=Replica[replica].name, prefix=search_prefix, per_page=per_page)
    if search_after:
        kwargs['search_after'] = search_after
    if token:
        kwargs['token'] = token

    payload.update(enumerate_available_bundles(**kwargs))  # type: ignore
    if payload['token'] is None:
        payload['token'] = ""

    if payload['page_count'] < per_page:
        # enumeration is complete
        payload.update(dict(has_more=False))
        del payload['token']
        del payload['search_after']
        response = make_response(jsonify(payload), requests.codes.ok)
        response.headers['X-OpenAPI-Pagination'] = 'false'
    else:
        next_url = UrlBuilder(request.url)
        next_url.replace_query("search_after", payload['search_after'])
        next_url.replace_query("token", payload['token'])
        link = f"<{next_url}>; rel='next'"
        payload.update(dict(has_more=True, token=payload['token'], link=f'{next_url}'))
        response = make_response(jsonify(payload), requests.codes.partial)
        response.headers['Link'] = link
        response.headers['X-OpenAPI-Pagination'] = 'true'

    response.headers['X-OpenAPI-Paginated-Content-Key'] = 'bundles'
    return response
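# Illustrative paged response from enumerate (URL and values are made up; the
# headers and payload keys come from the handler above):
#
#   HTTP/1.1 206 Partial Content
#   Link: <https://dss.example.org/...?search_after=...&token=...>; rel='next'
#   X-OpenAPI-Pagination: true
#   X-OpenAPI-Paginated-Content-Key: bundles
#
#   {"bundles": [...], "has_more": true, "token": "...", "search_after": "...", ...}
#
# The final page instead returns 200 with has_more=false and with the token
# and search_after fields removed.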
def put_bundles_response(self, path, replica, expected_code):
    """
    Uploads a file from fixtures to the DSS, then adds it to a bundle under
    the name `path`.  Asserts that the expected codes were received at each
    point.
    """
    fixtures_bucket = self.get_test_fixture_bucket(replica.name)

    # source a file to upload
    file_version = datetime_to_version_format(datetime.datetime.utcnow())
    bundle_version = datetime_to_version_format(datetime.datetime.utcnow())
    bundle_uuid = str(uuid.uuid4())
    file_uuid = str(uuid.uuid4())
    storage_schema = 's3' if replica.name == 'aws' else 'gs'

    # upload a file from test fixtures
    self.upload_file_wait(f"{storage_schema}://{fixtures_bucket}/test_good_source_data/0",
                          replica,
                          file_uuid,
                          file_version=file_version,
                          bundle_uuid=bundle_uuid)

    # add that file to a bundle
    builder = UrlBuilder().set(path="/v1/bundles/" + bundle_uuid)
    builder.add_query("replica", replica.name)
    builder.add_query("version", bundle_version)
    url = str(builder)
    self.assertPutResponse(url,
                           expected_code,
                           json_request_body=dict(
                               files=[dict(uuid=file_uuid,
                                           version=file_version,
                                           name=path,
                                           indexed=False)],
                               creator_uid=0,
                           ),
                           headers=get_auth_header())
def _upload_bundle(app, replica, uuid=None):
    files = list()
    test_fixtures_bucket = os.environ['DSS_GS_BUCKET_TEST_FIXTURES']
    for i in range(2):
        file_name = f"file_{i}"
        file_uuid, file_version = str(uuid4()), datetime_to_version_format(datetime.utcnow())
        source_url = f"{replica.storage_schema}://{test_fixtures_bucket}/test_good_source_data/0"
        resp = app.put(f"/v1/files/{file_uuid}?version={file_version}",
                       headers={**get_auth_header(), 'Content-Type': "application/json"},
                       json=dict(creator_uid=0, source_url=source_url))
        resp.raise_for_status()
        files.append((file_uuid, file_version, file_name))

    # honor a caller-supplied bundle UUID, otherwise generate one
    bundle_uuid = uuid if uuid else str(uuid4())
    bundle_version = datetime_to_version_format(datetime.utcnow())
    json_request_body = dict(creator_uid=0,
                             files=[dict(uuid=file_uuid, version=file_version, name=file_name, indexed=False)
                                    for file_uuid, file_version, file_name in files])
    resp = app.put(f"/v1/bundles/{bundle_uuid}?replica={replica.name}&version={bundle_version}",
                   headers={**get_auth_header(), 'Content-Type': "application/json"},
                   json=json_request_body)
    resp.raise_for_status()

    # sanity-check that the bundle is now readable
    resp = app.get(f"/v1/bundles/{bundle_uuid}?replica={replica.name}&version={bundle_version}")
    return bundle_uuid, bundle_version
class Iterator:
    keys = [BundleFQID(uuid=uuid.uuid4(),
                       version=datetime_to_version_format(datetime.datetime.utcnow())).to_key()
            for i in range(10)]

    def __init__(self, *args, **kwargs):
        self.start_after_key = None
        self.token = 'frank'

    def __iter__(self):
        for key in self.keys:
            self.start_after_key = key
            yield self.start_after_key
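# Hypothetical usage sketch for the stub Iterator above: a full pass yields
# every key and leaves start_after_key at the resume point, the way paged
# listing code would consume it.
def _demo_iterator_usage():
    it = Iterator()
    consumed = list(it)
    assert consumed == Iterator.keys
    assert it.start_after_key == Iterator.keys[-1]  # resume point after a full pass
    assert it.token == 'frank'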
def _test_file_put_cached(self, replica: Replica, scheme: str, test_bucket: str,
                          test_checkout_bucket: str, uploader: Uploader):
    stored_cache_criteria = os.environ.get('CHECKOUT_CACHE_CRITERIA')
    try:
        os.environ['CHECKOUT_CACHE_CRITERIA'] = '[{"type":"application/json","max_size":12314}]'
        handle = Config.get_blobstore_handle(replica)
        src_key = generate_test_key()
        src_data = b'{"status":"valid"}'
        source_url = f"{scheme}://{test_bucket}/{src_key}"
        file_uuid = str(uuid.uuid4())
        bundle_uuid = str(uuid.uuid4())
        version = datetime_to_version_format(datetime.datetime.utcnow())

        # write dummy file and upload to upload area
        with tempfile.NamedTemporaryFile(delete=True) as fh:
            fh.write(src_data)
            fh.flush()
            uploader.checksum_and_upload_file(fh.name, src_key, "application/json")

        # upload file to DSS
        self.upload_file(source_url, file_uuid, bundle_uuid=bundle_uuid, version=version)

        metadata = handle.get_user_metadata(test_bucket, src_key)
        dst_key = ("blobs/" + ".".join([
            metadata['hca-dss-sha256'],
            metadata['hca-dss-sha1'],
            metadata['hca-dss-s3_etag'],
            metadata['hca-dss-crc32c'],
        ])).lower()

        # wait for the blob to land in the checkout bucket
        for wait_to_upload_into_checkout_bucket in range(30):
            try:
                file_metadata = json.loads(handle.get(test_checkout_bucket, dst_key).decode("utf-8"))
                break
            except BlobNotFoundError:
                time.sleep(1)
        else:
            # final attempt; raises BlobNotFoundError if the checkout never completed
            file_metadata = json.loads(handle.get(test_checkout_bucket, dst_key).decode("utf-8"))

        self.assertEqual(file_metadata["status"], "valid")  # the file exists in the checkout bucket
    finally:
        # restore (or clear) the original setting; assigning None would raise TypeError
        if stored_cache_criteria is None:
            del os.environ['CHECKOUT_CACHE_CRITERIA']
        else:
            os.environ['CHECKOUT_CACHE_CRITERIA'] = stored_cache_criteria
def test_put_invalid_fragment(self):
    """PUT an invalid fragment reference."""
    uuid = str(uuid4())
    self.addCleanup(self._delete_collection, uuid)
    res = self.app.put("/v1/collections",
                       headers=get_auth_header(authorized=True),
                       params=dict(uuid=uuid,
                                   version=datetime_to_version_format(datetime.now()),
                                   replica="aws"),
                       json=dict(name="n", description="d", details={},
                                 contents=[self.invalid_ptr] * 128))
    self.assertEqual(res.status_code, requests.codes.unprocessable_entity)
def delete(uuid: str, replica: str, json_request_body: dict, version: str = None):
    email = request.token_info['email']
    if email not in ADMIN_USER_EMAILS:
        raise DSSException(
            requests.codes.forbidden,
            "forbidden",
            "You can't delete bundles with these credentials!",
        )

    uuid = uuid.lower()
    version = datetime_to_version_format(iso8601.parse_date(version)) if version else None

    tombstone_id = TombstoneID(uuid=uuid, version=version)
    bundle_prefix = tombstone_id.to_key_prefix()
    tombstone_object_data = _create_tombstone_data(
        email=email,
        reason=json_request_body.get('reason'),
        version=version,
    )

    handle = Config.get_blobstore_handle(Replica[replica])
    bucket = Replica[replica].bucket

    if test_object_exists(handle, bucket, bundle_prefix, test_type=ObjectTest.PREFIX):
        created, idempotent = _idempotent_save(handle, bucket, tombstone_id.to_key(), tombstone_object_data)
        if not idempotent:
            raise DSSException(
                requests.codes.conflict,
                "bundle_tombstone_already_exists",
                f"bundle tombstone with UUID {uuid} and version {version} already exists",
            )
        status_code = requests.codes.ok
        response_body = dict()  # type: dict
    else:
        status_code = requests.codes.not_found
        response_body = dict(title="bundle not found")

    return jsonify(response_body), status_code
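# For reference, TombstoneID.to_key() yields the tombstone object names seen
# in the tests elsewhere in this codebase: "bundles/{uuid}.{version}.dead"
# when a version is given, and "bundles/{uuid}.dead" when the whole bundle is
# tombstoned; to_key_prefix() is the corresponding prefix used for the
# existence check above.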
def test_502_get_bundle_HAS_retry_after_response(self):
    """The mock seems resistant to multiple calls, so it is only used for one endpoint."""
    with mock.patch('dss.api.bundles.get', side_effect=DSSException(502, 'bad_gateway', "Bad Gateway")):
        self.app = ThreadedLocalServer()
        self.app.start()
        uuid = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
        version = datetime_to_version_format(datetime.datetime.utcnow())
        url = str(UrlBuilder()
                  .set(path=f"/v1/bundles/{uuid}")
                  .add_query("version", version)
                  .add_query("replica", 'aws'))
        r = self.assertGetResponse(url, 502, headers=get_auth_header())
        self.assertEqual(int(r.response.headers['Retry-After']), 10)
        self.app.shutdown()
def upload_file(app, contents, replica):
    src_key = generate_test_key()
    encoded = json.dumps(contents).encode()
    chunk_size = get_s3_multipart_chunk_size(len(encoded))
    with io.BytesIO(encoded) as fh, ChecksummingSink(write_chunk_size=chunk_size) as sink:
        sink.write(fh.read())
        sums = sink.get_checksums()
        metadata = {
            'hca-dss-crc32c': sums['crc32c'].lower(),
            'hca-dss-s3_etag': sums['s3_etag'].lower(),
            'hca-dss-sha1': sums['sha1'].lower(),
            'hca-dss-sha256': sums['sha256'].lower(),
        }
        fh.seek(0)
        if replica == 'gcp':
            gs_test_bucket = get_env("DSS_GS_BUCKET_TEST")
            gcp_client = gs_storage.Client.from_service_account_json(
                os.getenv("GOOGLE_APPLICATION_CREDENTIALS"))
            gs_bucket = gcp_client.bucket(gs_test_bucket)
            blob = gs_bucket.blob(src_key)
            blob.upload_from_file(fh, content_type="application/json")
            blob.metadata = metadata
            blob.patch()
            source_url = f"gs://{gs_test_bucket}/{src_key}"
        if replica == 'aws':
            # TODO: consider switching to unmanaged uploader (putobject w/blob)
            s3_test_bucket = get_env("DSS_S3_BUCKET_TEST")
            s3 = boto3.resource('s3')
            s3.Bucket(s3_test_bucket).Object(src_key).upload_fileobj(fh, ExtraArgs={"Metadata": metadata})
            source_url = f"s3://{s3_test_bucket}/{src_key}"

    file_uuid = str(uuid4())
    version = datetime_to_version_format(datetime.utcnow())
    urlbuilder = UrlBuilder().set(path='/v1/files/' + file_uuid)
    urlbuilder.add_query("version", version)

    resp_obj = app.put(str(urlbuilder),
                       json=dict(creator_uid=0, source_url=source_url),
                       headers=get_auth_header())
    resp_obj.raise_for_status()
    return file_uuid, resp_obj.json()["version"]
def test_file_put_large_incorrect_s3_etag(self) -> None:
    bucket = self.s3_test_bucket
    src_key = generate_test_key()
    src_data = os.urandom(ASYNC_COPY_THRESHOLD + 1)

    # upload file with incompatible s3 part size
    self._upload_file_to_mock_ingest(S3Uploader, bucket, src_key, src_data,
                                     s3_part_size=6 * 1024 * 1024)

    file_uuid = str(uuid.uuid4())
    timestamp = datetime.datetime.utcnow()
    file_version = datetime_to_version_format(timestamp)
    url = UrlBuilder().set(path=f"/v1/files/{file_uuid}")
    url.add_query("version", file_version)
    source_url = f"s3://{bucket}/{src_key}"

    # put file into DSS, starting an async copy which will fail
    expected_codes = (requests.codes.accepted,)
    self.assertPutResponse(str(url),
                           expected_codes,
                           json_request_body=dict(
                               file_uuid=file_uuid,
                               creator_uid=0,
                               source_url=source_url,
                           ),
                           headers=get_auth_header())

    # should eventually get unprocessable after the async copy fails
    @eventually(120, 1)
    def tryHead():
        self.assertHeadResponse(f"/v1/files/{file_uuid}?replica=aws&version={file_version}",
                                requests.codes.unprocessable)
    tryHead()

    # should get unprocessable on GCP too
    self.assertHeadResponse(f"/v1/files/{file_uuid}?replica=gcp&version={file_version}",
                            requests.codes.unprocessable)
def patch(uuid: str, json_request_body: dict, replica: str, version: str):
    authenticated_user_email = security.get_token_email(request.token_info)

    uuid = uuid.lower()
    owner = get_impl(uuid=uuid, replica=replica)["owner"]
    if owner != authenticated_user_email:
        raise DSSException(requests.codes.forbidden, "forbidden", "Collection access denied")

    handle = Config.get_blobstore_handle(Replica[replica])
    try:
        cur_collection_blob = handle.get(Replica[replica].bucket,
                                         CollectionFQID(uuid, version).to_key())
    except BlobNotFoundError:
        raise DSSException(404, "not_found", "Could not find collection for UUID {}".format(uuid))
    collection = json.loads(cur_collection_blob)

    for field in "name", "description", "details":
        if field in json_request_body:
            collection[field] = json_request_body[field]

    remove_contents_set = set(map(hashabledict, json_request_body.get("remove_contents", [])))
    collection["contents"] = [i for i in collection["contents"]
                              if hashabledict(i) not in remove_contents_set]

    verify_collection(json_request_body.get("add_contents", []), Replica[replica], handle)
    collection["contents"].extend(json_request_body.get("add_contents", []))
    collection["contents"] = _dedpuplicate_contents(collection["contents"])

    timestamp = datetime.datetime.utcnow()
    new_collection_version = datetime_to_version_format(timestamp)
    handle.upload_file_handle(Replica[replica].bucket,
                              CollectionFQID(uuid, new_collection_version).to_key(),
                              io.BytesIO(json.dumps(collection).encode("utf-8")))
    return jsonify(dict(uuid=uuid, version=new_collection_version)), requests.codes.ok
def test_file_put_bad_checksum(self):
    src_key = generate_test_key()
    uploader = S3Uploader(tempfile.gettempdir(), self.s3_test_bucket)
    with tempfile.NamedTemporaryFile(delete=True) as fh:
        fh.write(os.urandom(1024))
        fh.flush()
        uploader.upload_file(fh.name, src_key, 'text/plain', metadata_keys=self.bad_checksums)

    source_url = f's3://{self.s3_test_bucket}/{src_key}'
    file_uuid = str(uuid.uuid4())
    bundle_uuid = str(uuid.uuid4())
    version = datetime_to_version_format(datetime.datetime.utcnow())

    # catch the AssertionError raised when the upload returns 422 instead of 201
    with self.assertRaises(AssertionError):
        r = self.upload_file(source_url, file_uuid, bundle_uuid=bundle_uuid, version=version)
        self.assertEqual(r.json['code'], 'invalid_checksum')
def is_DSS_VERSION(val):
    """
    Verifies that `val` complies with the expected format.  For more info on
    connexion custom type formats, see
    https://connexion.readthedocs.io/en/latest/cookbook.html#custom-type-format.
    :param val: the value to verify
    :return: the verified value
    """
    from iso8601 import iso8601

    # convert to a datetime so we can format exactly as the system requires (with microsecond precision)
    try:
        timestamp = iso8601.parse_date(val)
    except iso8601.ParseError:
        raise DSSException(requests.codes.bad_request,
                           "illegal_version",
                           "version should be an RFC3339 compliant timestamp")
    timestamp = datetime_to_version_format(timestamp)
    if timestamp != val:
        raise DSSException(requests.codes.bad_request,
                           "illegal_version",
                           "version should be a DSS_VERSION with the format 'YYYY-MM-DDTHHmmSS.zzzzzzZ'")
    return val
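# Quick self-check for is_DSS_VERSION (illustrative values; assumes the
# 'YYYY-MM-DDTHHmmSS.zzzzzzZ' format named in the error message above, i.e.
# RFC3339 with the colons stripped from the time portion):
def _demo_is_dss_version():
    assert is_DSS_VERSION("2019-01-02T030405.678901Z") == "2019-01-02T030405.678901Z"
    for bad in ("2019-01-02T03:04:05.678901Z",  # valid RFC3339, but not a DSS_VERSION
                "not-a-date"):                  # not RFC3339 at all
        try:
            is_DSS_VERSION(bad)
        except DSSException:
            pass
        else:
            raise AssertionError(f"{bad} should have been rejected")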
def test_regex_patterns(self):
    version = datetime_to_version_format(datetime.datetime.utcnow())
    key = f"bundles/{uuid4()}.{version}"
    tombstone_key_with_version = key + ".dead"
    tombstone_key_without_version = f"bundles/{uuid4()}.dead"

    self.assertIsNone(notify_v2._versioned_tombstone_key_regex.match(key))
    self.assertIsNone(notify_v2._unversioned_tombstone_key_regex.match(key))
    self.assertIsNotNone(notify_v2._versioned_tombstone_key_regex.match(tombstone_key_with_version))
    self.assertIsNone(notify_v2._versioned_tombstone_key_regex.match(tombstone_key_without_version))
    self.assertIsNone(notify_v2._unversioned_tombstone_key_regex.match(tombstone_key_with_version))
    self.assertIsNotNone(notify_v2._unversioned_tombstone_key_regex.match(tombstone_key_without_version))
def get_version():
    return datetime_to_version_format(datetime.datetime.utcnow())
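# A minimal sketch of datetime_to_version_format, assuming it matches the
# 'YYYY-MM-DDTHHmmSS.zzzzzzZ' format named in is_DSS_VERSION; the real helper
# lives elsewhere in the repo.
def _datetime_to_version_format_sketch(timestamp: datetime.datetime) -> str:
    return timestamp.strftime("%Y-%m-%dT%H%M%S.%fZ")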
def put(uuid: str, replica: str, json_request_body: dict, version: str = None):
    uuid = uuid.lower()
    if version is not None:
        # convert it to a datetime so we can format exactly as the system requires (with microsecond precision)
        timestamp = iso8601.parse_date(version)
    else:
        timestamp = datetime.datetime.utcnow()
    version = datetime_to_version_format(timestamp)

    handle = Config.get_blobstore_handle(Replica[replica])
    bucket = Replica[replica].bucket

    # what's the target object name for the bundle manifest?
    bundle_manifest_key = BundleFQID(uuid=uuid, version=version).to_key()

    # decode the list of files.
    files = [{'user_supplied_metadata': file} for file in json_request_body['files']]

    time_left = nestedcontext.inject("time_left")

    while True:  # each time through the outer while-loop, we try to gather up all the file metadata.
        for file in files:
            user_supplied_metadata = file['user_supplied_metadata']
            metadata_key = FileFQID(
                uuid=user_supplied_metadata['uuid'],
                version=user_supplied_metadata['version'],
            ).to_key()
            if 'file_metadata' not in file:
                try:
                    file_metadata = handle.get(bucket, metadata_key)
                except BlobNotFoundError:
                    continue
                file['file_metadata'] = json.loads(file_metadata)
                if uuid != file['file_metadata']['bundle_uuid']:
                    raise DSSException(
                        requests.codes.conflict,
                        "incorrect_file_bundle_uuid",
                        f"File bundle_uuid {file['file_metadata']['bundle_uuid']} does not equal bundle uuid {uuid}"
                    )

        # check to see if any file metadata is still not yet loaded.
        for file in files:
            if 'file_metadata' not in file:
                missing_file_user_metadata = file['user_supplied_metadata']
                break
        else:
            break

        # if there's still time left, sleep and retry; otherwise give up.
        if time_left() > PUT_TIME_ALLOWANCE_SECONDS:
            time.sleep(1)
            continue

        raise DSSException(
            requests.codes.conflict,
            "file_missing",
            f"Could not find file {missing_file_user_metadata['uuid']}/{missing_file_user_metadata['version']}."
        )

    # build a manifest consisting of all the files.
    bundle_metadata = {
        BundleMetadata.FORMAT: BundleMetadata.FILE_FORMAT_VERSION,
        BundleMetadata.VERSION: version,
        BundleMetadata.FILES: [
            {
                BundleFileMetadata.NAME: file['user_supplied_metadata']['name'],
                BundleFileMetadata.UUID: file['user_supplied_metadata']['uuid'],
                BundleFileMetadata.VERSION: file['user_supplied_metadata']['version'],
                BundleFileMetadata.CONTENT_TYPE: file['file_metadata'][FileMetadata.CONTENT_TYPE],
                BundleFileMetadata.SIZE: file['file_metadata'][FileMetadata.SIZE],
                BundleFileMetadata.INDEXED: file['user_supplied_metadata']['indexed'],
                BundleFileMetadata.CRC32C: file['file_metadata'][FileMetadata.CRC32C],
                BundleFileMetadata.S3_ETAG: file['file_metadata'][FileMetadata.S3_ETAG],
                BundleFileMetadata.SHA1: file['file_metadata'][FileMetadata.SHA1],
                BundleFileMetadata.SHA256: file['file_metadata'][FileMetadata.SHA256],
            }
            for file in files
        ],
        BundleMetadata.CREATOR_UID: json_request_body['creator_uid'],
    }

    created, idempotent = _idempotent_save(
        handle,
        bucket,
        bundle_manifest_key,
        bundle_metadata,
    )
    if not idempotent:
        raise DSSException(
            requests.codes.conflict,
            "bundle_already_exists",
            f"bundle with UUID {uuid} and version {version} already exists"
        )
    status_code = requests.codes.created if created else requests.codes.ok

    return jsonify(dict(version=version)), status_code
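# Illustrative request for this handler (field names come from the code
# above; the UUIDs and versions are made up):
#
#   PUT /v1/bundles/{uuid}?replica=aws&version=2019-01-02T030405.678901Z
#   {
#       "creator_uid": 0,
#       "files": [
#           {"uuid": "<file-uuid>", "version": "2019-01-02T030405.678901Z",
#            "name": "LICENSE", "indexed": false}
#       ]
#   }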
def _test_bundle_put(self, replica: Replica, fixtures_bucket: str):
    schema = replica.storage_schema
    bundle_uuid = str(uuid.uuid4())
    file_uuid = str(uuid.uuid4())
    missing_file_uuid = str(uuid.uuid4())
    resp_obj = self.upload_file_wait(
        f"{schema}://{fixtures_bucket}/test_good_source_data/0",
        replica,
        file_uuid,
        bundle_uuid=bundle_uuid,
    )
    file_version = resp_obj.json['version']

    # first bundle.
    bundle_version = datetime_to_version_format(datetime.datetime.utcnow())
    self.put_bundle(
        replica,
        bundle_uuid,
        [(file_uuid, file_version, "LICENSE")],
        bundle_version,
    )

    # should be able to do this twice (i.e., same payload, same UUIDs)
    self.put_bundle(
        replica,
        bundle_uuid,
        [(file_uuid, file_version, "LICENSE")],
        bundle_version,
        requests.codes.ok,
    )

    # should *NOT* be able to do this twice with a different payload.
    self.put_bundle(
        replica,
        bundle_uuid,
        [(file_uuid, file_version, "LICENSE1")],
        bundle_version,
        requests.codes.conflict,
    )

    # should *NOT* be able to upload a bundle with a missing file; we should get requests.codes.conflict.
    with nestedcontext.bind(time_left=lambda: 0):
        resp_obj = self.put_bundle(
            replica,
            bundle_uuid,
            [
                (file_uuid, file_version, "LICENSE0"),
                (missing_file_uuid, file_version, "LICENSE1"),
            ],
            expected_code=requests.codes.conflict,
        )
        self.assertEqual(resp_obj.json['code'], "file_missing")

    # should *NOT* be able to upload a bundle containing a file with an incorrect bundle_uuid;
    # we should get requests.codes.conflict.
    with nestedcontext.bind(time_left=lambda: 0):
        resp_obj = self.put_bundle(
            replica,
            str(uuid.uuid4()),  # uploading a new bundle with an old file
            [(file_uuid, file_version, "LICENSE")],
            datetime_to_version_format(datetime.datetime.utcnow()),
            expected_code=requests.codes.conflict,
        )
        self.assertEqual(resp_obj.json['code'], "incorrect_file_bundle_uuid")

    # upload a file, but delete its file metadata.  put the metadata back after a delay.
    self.upload_file_wait(
        f"{schema}://{fixtures_bucket}/test_good_source_data/0",
        replica,
        missing_file_uuid,
        file_version,
        bundle_uuid=bundle_uuid,
    )
    handle = Config.get_blobstore_handle(replica)
    bucket = replica.bucket
    file_metadata = handle.get(bucket, f"files/{missing_file_uuid}.{file_version}")
    handle.delete(bucket, f"files/{missing_file_uuid}.{file_version}")

    class UploadThread(threading.Thread):
        def run(innerself):
            time.sleep(5)
            data_fh = io.BytesIO(file_metadata)
            handle.upload_file_handle(bucket, f"files/{missing_file_uuid}.{file_version}", data_fh)

    # start the upload (on a delay...)
    upload_thread = UploadThread()
    upload_thread.start()

    # this should at first fail to find one of the files, but the UploadThread will eventually upload the
    # file metadata.  since we give the upload bundle process ample time to spin, it should eventually find
    # the file metadata and succeed.
    with nestedcontext.bind(time_left=lambda: sys.maxsize):
        self.put_bundle(
            replica,
            bundle_uuid,
            [
                (file_uuid, file_version, "LICENSE0"),
                (missing_file_uuid, file_version, "LICENSE1"),
            ],
            expected_code=requests.codes.created,
        )
def test_notify_or_queue(self):
    replica = Replica.aws
    metadata_document = dict()
    subscription = {
        SubscriptionData.REPLICA: replica,
        SubscriptionData.OWNER: "bob",
        SubscriptionData.UUID: str(uuid4()),
    }

    with self.subTest("Should attempt to notify immediately"):
        with mock.patch("dss.events.handlers.notify_v2.notify") as mock_notify:
            with mock.patch.object(SQSMessenger, "send") as mock_send:
                md = dict(**metadata_document, **dict(event_type="CREATE"))
                key = f"bundles/{uuid4()}.{datetime_to_version_format(datetime.datetime.now())}"
                notify_or_queue(replica, subscription, md, key)
                mock_notify.assert_called()
                mock_send.assert_not_called()
                keys = [a[0][2] for a in mock_notify.call_args_list]
                self.assertIn(key, keys)

    with self.subTest("Should queue when notify fails"):
        with mock.patch("dss.events.handlers.notify_v2.notify") as mock_notify:
            mock_notify.return_value = False
            with mock.patch.object(SQSMessenger, "send", mock_send):
                md = dict(**metadata_document, **dict(event_type="CREATE"))
                key = f"bundles/{uuid4()}.{datetime_to_version_format(datetime.datetime.now())}"
                notify_or_queue(Replica.aws, subscription, md, key)
                mock_notify.assert_called()
                mock_send.assert_called()
                keys = [json.loads(a[0][0])['key'] for a in mock_send.call_args_list]
                self.assertIn(key, keys)

    with self.subTest("notify_or_queue should attempt to notify immediately for versioned tombstone"):
        with mock.patch("dss.events.handlers.notify_v2.notify") as mock_notify:
            with mock.patch("dss.events.handlers.notify_v2._list_prefix") as mock_list_prefix:
                md = dict(**metadata_document, **dict(event_type="TOMBSTONE"))
                bundle_uuid = str(uuid4())
                bundle_version = datetime_to_version_format(datetime.datetime.utcnow())
                key = f"bundles/{bundle_uuid}.{bundle_version}.dead"
                mock_list_prefix.return_value = [key]
                notify_or_queue(Replica.aws, subscription, md, key)
                mock_notify.assert_called_with(subscription, md, key)
                keys = [a[0][2] for a in mock_notify.call_args_list]
                self.assertIn(key, keys)

    with self.subTest("notify_or_queue should queue notifications for unversioned tombstone"):
        md = dict(**metadata_document, **dict(event_type="TOMBSTONE"))
        bundle_uuid = str(uuid4())
        bundle_version_1 = datetime_to_version_format(datetime.datetime.utcnow())
        bundle_version_2 = datetime_to_version_format(datetime.datetime.utcnow())
        bundle_key_1 = f"bundles/{bundle_uuid}.{bundle_version_1}"
        bundle_key_2 = f"bundles/{bundle_uuid}.{bundle_version_2}"
        unversioned_tombstone_key = f"bundles/{bundle_uuid}.dead"
        with mock.patch.object(SQSMessenger, "send") as mock_send:
            with mock.patch("dss.events.handlers.notify_v2._list_prefix") as mock_list_prefix:
                mock_list_prefix.return_value = [
                    unversioned_tombstone_key,
                    bundle_key_1,
                    bundle_key_2,
                ]
                notify_or_queue(Replica.aws, subscription, md, unversioned_tombstone_key)
                keys = [json.loads(a[0][0])['key'] for a in mock_send.call_args_list]
                self.assertIn(bundle_key_1, keys)
                self.assertIn(bundle_key_2, keys)

    with self.subTest("notify_or_queue should not re-queue tombstoned versions of unversioned tombstones"):
        md = dict(**metadata_document, **dict(event_type="TOMBSTONE"))
        bundle_uuid = str(uuid4())
        bundle_version_1 = datetime_to_version_format(datetime.datetime.utcnow())
        bundle_version_2 = datetime_to_version_format(datetime.datetime.utcnow())
        bundle_key_1 = f"bundles/{bundle_uuid}.{bundle_version_1}"
        bundle_key_2 = f"bundles/{bundle_uuid}.{bundle_version_2}.dead"
        unversioned_tombstone_key = f"bundles/{bundle_uuid}.dead"
        with mock.patch.object(SQSMessenger, "send") as mock_send:
            with mock.patch("dss.events.handlers.notify_v2._list_prefix") as mock_list_prefix:
                mock_list_prefix.return_value = [
                    unversioned_tombstone_key,
                    bundle_key_1,
                    bundle_key_2,
                ]
                notify_or_queue(Replica.aws, subscription, md, unversioned_tombstone_key)
                mock_send.assert_called_once()
                keys = [json.loads(a[0][0])['key'] for a in mock_send.call_args_list]
                self.assertIn(bundle_key_1, keys)
                self.assertNotIn(bundle_key_2, keys)
def _test_file_get_checkout(self, replica: Replica, scheme: str, test_bucket: str, uploader: Uploader):
    handle = Config.get_blobstore_handle(replica)
    src_key = generate_test_key()
    src_data = os.urandom(1024)
    source_url = f"{scheme}://{test_bucket}/{src_key}"
    file_uuid = str(uuid.uuid4())
    bundle_uuid = str(uuid.uuid4())
    version = datetime_to_version_format(datetime.datetime.utcnow())

    # write dummy file and upload to upload area
    with tempfile.NamedTemporaryFile(delete=True) as fh:
        fh.write(src_data)
        fh.flush()
        uploader.checksum_and_upload_file(fh.name, src_key, "text/plain")

    # upload file to DSS
    self.upload_file(source_url, file_uuid, bundle_uuid=bundle_uuid, version=version)
    url = str(UrlBuilder()
              .set(path="/v1/files/" + file_uuid)
              .add_query("replica", replica.name)
              .add_query("version", version))

    # get uploaded blob key
    file_metadata = json.loads(handle.get(test_bucket, f"files/{file_uuid}.{version}").decode("utf-8"))
    file_key = compose_blob_key(file_metadata)

    @eventually(20, 1)
    def test_checkout():
        # assert 302 and verify the checksum on checkout completion
        api_get = self.assertGetResponse(url,
                                         requests.codes.found,
                                         headers=get_auth_header(),
                                         redirect_follow_retries=0)
        file_get = requests.get(api_get.response.headers['Location'])
        self.assertTrue(file_get.ok)
        self.assertEqual(file_get.content, src_data)

    with self.subTest(f"{replica}: Initiates checkout and returns 301 for GET on 'uncheckedout' file."):
        # assert 301 redirect on first GET
        self.assertGetResponse(url,
                               requests.codes.moved,
                               headers=get_auth_header(),
                               redirect_follow_retries=0)
        test_checkout()

    with self.subTest(f"{replica}: Initiates checkout and returns 301 for GET on nearly expired checkout file."):
        now = datetime.datetime.now(datetime.timezone.utc)
        creation_date_fn = ("cloud_blobstore.s3.S3BlobStore.get_creation_date"
                            if replica.name == "aws"
                            else "cloud_blobstore.gs.GSBlobStore.get_creation_date")
        with mock.patch(creation_date_fn) as mock_creation_date:
            blob_ttl_days = int(os.environ['DSS_BLOB_TTL_DAYS'])
            mock_creation_date.return_value = now - datetime.timedelta(days=blob_ttl_days, hours=1, minutes=5)
            self.assertGetResponse(url,
                                   requests.codes.moved,
                                   headers=get_auth_header(),
                                   redirect_follow_retries=0)
        test_checkout()

    with self.subTest(f"{replica}: Initiates checkout and returns 302 immediately for GET on stale checkout file."):
        now = datetime.datetime.now(datetime.timezone.utc)
        creation_date = handle.get_creation_date(replica.checkout_bucket, file_key)
        creation_date_fn = ("cloud_blobstore.s3.S3BlobStore.get_creation_date"
                            if replica.name == "aws"
                            else "cloud_blobstore.gs.GSBlobStore.get_creation_date")
        with mock.patch(creation_date_fn) as mock_creation_date:
            # assert 302 found on stale file and that the last modified date refreshes
            blob_ttl_days = int(os.environ['DSS_BLOB_PUBLIC_TTL_DAYS'])
            mock_creation_date.return_value = now - datetime.timedelta(days=blob_ttl_days + 1)
            self.assertGetResponse(url,
                                   requests.codes.found,
                                   headers=get_auth_header(),
                                   redirect_follow_retries=0)
            self.assertTrue(creation_date > handle.get_creation_date(replica.checkout_bucket, file_key),
                            f'\ncurr_creation_date: {creation_date}'
                            f'\nprev_creation_date: {handle.get_creation_date(replica.checkout_bucket, file_key)}')

    handle.delete(test_bucket, f"files/{file_uuid}.{version}")
    handle.delete(replica.checkout_bucket, file_key)
def _test_file_put(self, replica: Replica, scheme: str, test_bucket: str, uploader: Uploader):
    src_key = generate_test_key()
    src_data = os.urandom(1024)
    with tempfile.NamedTemporaryFile(delete=True) as fh:
        fh.write(src_data)
        fh.flush()
        uploader.checksum_and_upload_file(fh.name, src_key, "text/plain")

    source_url = f"{scheme}://{test_bucket}/{src_key}"
    file_uuid = str(uuid.uuid4())
    bundle_uuid = str(uuid.uuid4())
    version = datetime_to_version_format(datetime.datetime.utcnow())

    self._test_put_auth_errors(scheme, test_bucket)

    with self.subTest(f"{replica}: Created returned when uploading a file with a unique payload and FQID"):
        self.upload_file(source_url, file_uuid, bundle_uuid=bundle_uuid, version=version)

    with self.subTest(f"{replica}: Created returned when uploading a file with the same payload and a different FQID"):
        self.upload_file(source_url, str(uuid.uuid4()))

    with self.subTest(f"{replica}: OK returned when uploading a file with the same payload, UUID, and version"):
        self.upload_file(source_url, file_uuid, bundle_uuid=bundle_uuid, version=version,
                         expected_code=requests.codes.ok)

    with self.subTest(f"{replica}: Conflict returned when uploading a file with a different payload and the same FQID"):
        src_key_temp = generate_test_key()
        src_data_temp = os.urandom(128)
        with tempfile.NamedTemporaryFile(delete=True) as fh:
            fh.write(src_data_temp)
            fh.flush()
            uploader.checksum_and_upload_file(fh.name, src_key_temp, "text/plain")
        source_url_temp = f"{scheme}://{test_bucket}/{src_key_temp}"
        self.upload_file(source_url_temp, file_uuid, version=version,
                         expected_code=requests.codes.conflict)

    with self.subTest(f"{replica}: bad_request returned when uploading a file with an empty version"):
        self.upload_file(source_url, file_uuid, version='', expected_code=requests.codes.bad_request)

    invalid_version = 'ABCD'
    with self.subTest(f"{replica}: bad_request returned "
                      f"when uploading a file with invalid version {invalid_version}"):
        self.upload_file(source_url, file_uuid, version=invalid_version,
                         expected_code=requests.codes.bad_request)

    with self.subTest(f"{replica}: bad_request returned when uploading a file without a version"):
        self.upload_file(source_url, file_uuid, version='missing',
                         expected_code=requests.codes.bad_request)

    invalid_uuids = ['ABCD', '1234']
    for invalid_uuid in invalid_uuids:
        with self.subTest(f"{replica}: bad_request returned "
                          f"when uploading a file with invalid UUID {invalid_uuid}"):
            self.upload_file(source_url, invalid_uuid, expected_code=requests.codes.bad_request)

    with self.subTest(f"{replica}: forbidden returned when uploading a file without a UUID"):
        self.upload_file(source_url, '', expected_code=requests.codes.forbidden)
def put(uuid: str, json_request_body: dict, version: str = None):
    class CopyMode(Enum):
        NO_COPY = auto()
        COPY_INLINE = auto()
        COPY_ASYNC = auto()

    uuid = uuid.lower()
    if version is not None:
        # convert it to a datetime so we can format exactly as the system requires (with microsecond precision)
        timestamp = iso8601.parse_date(version)
    else:
        timestamp = datetime.datetime.utcnow()
    version = datetime_to_version_format(timestamp)

    source_url = json_request_body['source_url']
    cre = re.compile(
        "^"
        "(?P<schema>(?:s3|gs|wasb))"
        "://"
        "(?P<bucket>[^/]+)"
        "/"
        "(?P<key>.+)"
        "$")
    mobj = cre.match(source_url)
    if mobj and mobj.group('schema') == "s3":
        replica = Replica.aws
    elif mobj and mobj.group('schema') == "gs":
        replica = Replica.gcp
    else:
        # guard against source_url strings that don't match the pattern at all
        schema = mobj.group('schema') if mobj else source_url
        raise DSSException(requests.codes.bad_request,
                           "unknown_source_schema",
                           f"source_url schema {schema} not supported")

    handle = Config.get_blobstore_handle(replica)
    hca_handle = Config.get_hcablobstore_handle(replica)
    dst_bucket = replica.bucket
    src_bucket = mobj.group('bucket')
    src_key = mobj.group('key')

    metadata = handle.get_user_metadata(src_bucket, src_key)
    size = handle.get_size(src_bucket, src_key)
    content_type = handle.get_content_type(src_bucket, src_key)

    # format all the checksums so they're lower-case.
    for metadata_spec in HCABlobStore.MANDATORY_METADATA.values():
        if metadata_spec['downcase']:
            keyname = typing.cast(str, metadata_spec['keyname'])
            metadata[keyname] = metadata[keyname].lower()

    # what's the target object name for the actual data?
    dst_key = ("blobs/" + ".".join((
        metadata['hca-dss-sha256'],
        metadata['hca-dss-sha1'],
        metadata['hca-dss-s3_etag'],
        metadata['hca-dss-crc32c'],
    ))).lower()

    # does it exist? if so, we can skip the copy part.
    copy_mode = CopyMode.COPY_INLINE
    try:
        if hca_handle.verify_blob_checksum(dst_bucket, dst_key, metadata):
            copy_mode = CopyMode.NO_COPY
    except BlobNotFoundError:
        pass

    # build the json document for the file metadata.
    file_metadata = {
        FileMetadata.FORMAT: FileMetadata.FILE_FORMAT_VERSION,
        FileMetadata.BUNDLE_UUID: json_request_body['bundle_uuid'],
        FileMetadata.CREATOR_UID: json_request_body['creator_uid'],
        FileMetadata.VERSION: version,
        FileMetadata.CONTENT_TYPE: content_type,
        FileMetadata.SIZE: size,
        FileMetadata.CRC32C: metadata['hca-dss-crc32c'],
        FileMetadata.S3_ETAG: metadata['hca-dss-s3_etag'],
        FileMetadata.SHA1: metadata['hca-dss-sha1'],
        FileMetadata.SHA256: metadata['hca-dss-sha256'],
    }
    file_metadata_json = json.dumps(file_metadata)

    if copy_mode != CopyMode.NO_COPY and size > ASYNC_COPY_THRESHOLD:
        copy_mode = CopyMode.COPY_ASYNC

    if copy_mode == CopyMode.COPY_ASYNC:
        if replica == Replica.aws:
            state = s3copyclient.copy_write_metadata_sfn_event(
                src_bucket, src_key,
                dst_bucket, dst_key,
                uuid, version,
                file_metadata_json,
            )
            state_machine_name_template = "dss-s3-copy-write-metadata-sfn-{stage}"
        elif replica == Replica.gcp:
            state = gscopyclient.copy_write_metadata_sfn_event(
                src_bucket, src_key,
                dst_bucket, dst_key,
                uuid, version,
                file_metadata_json,
            )
            state_machine_name_template = "dss-gs-copy-write-metadata-sfn-{stage}"
        else:
            raise ValueError("Unhandled replica")

        execution_id = str(uuid4())
        stepfunctions.step_functions_invoke(state_machine_name_template, execution_id, state)
        return jsonify(dict(task_id=execution_id, version=version)), requests.codes.accepted
    elif copy_mode == CopyMode.COPY_INLINE:
        handle.copy(src_bucket, src_key, dst_bucket, dst_key)

        # verify the copy was done correctly.
        assert hca_handle.verify_blob_checksum(dst_bucket, dst_key, metadata)

    try:
        write_file_metadata(handle, dst_bucket, uuid, version, file_metadata_json)
        status_code = requests.codes.created
    except BlobAlreadyExistsError:
        # fetch the file metadata, compare it to what we have.
        existing_file_metadata = json.loads(
            handle.get(dst_bucket, "files/{}.{}".format(uuid, version)).decode("utf-8"))
        if existing_file_metadata != file_metadata:
            raise DSSException(
                requests.codes.conflict,
                "file_already_exists",
                f"file with UUID {uuid} and version {version} already exists")
        status_code = requests.codes.ok

    return jsonify(dict(version=version)), status_code
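# For reference, the dedupe scheme above stores file content at
#   blobs/<sha256>.<sha1>.<s3_etag>.<crc32c>   (all lower-case)
# so a second PUT of byte-identical content maps to the same blob key and is
# resolved via verify_blob_checksum (the NO_COPY path) without another copy.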