def test_assay_upload_ingestion_success(clean_db, monkeypatch, caplog):
    """Check that the ingestion success method works as expected"""
    caplog.set_level(logging.DEBUG)

    new_user = Users.create(PROFILE)
    trial = TrialMetadata.create(TRIAL_ID, METADATA)
    assay_upload = UploadJobs.create(
        upload_type="ihc",
        uploader_email=EMAIL,
        gcs_file_map={},
        metadata={PROTOCOL_ID_FIELD_NAME: TRIAL_ID},
        gcs_xlsx_uri="",
        commit=False,
    )
    clean_db.commit()

    # Ensure that success can't be declared from a starting state
    with pytest.raises(Exception, match="current status"):
        assay_upload.ingestion_success(trial)

    # Update assay_upload status to simulate a completed but not ingested upload
    assay_upload.status = UploadJobStatus.UPLOAD_COMPLETED.value
    assay_upload.ingestion_success(trial)

    # Check that status was updated and email wasn't sent by default
    db_record = UploadJobs.find_by_id(assay_upload.id)
    assert db_record.status == UploadJobStatus.MERGE_COMPLETED.value
    assert "Would send email with subject '[UPLOAD SUCCESS]" not in caplog.text

    # Check that email gets sent when specified
    assay_upload.ingestion_success(trial, send_email=True)
    assert "Would send email with subject '[UPLOAD SUCCESS]" in caplog.text
def test_merge_extra_metadata(cidc_api, clean_db, monkeypatch):
    """Ensure merging of extra metadata follows the expected execution flow"""
    user_id = setup_trial_and_user(cidc_api, monkeypatch)
    with cidc_api.app_context():
        user = Users.find_by_id(user_id)
    make_cimac_biofx_user(user_id, cidc_api)

    with cidc_api.app_context():
        assay_upload = UploadJobs.create(
            upload_type="assay_with_extra_md",
            uploader_email=user.email,
            gcs_file_map={},
            metadata={
                PROTOCOL_ID_FIELD_NAME: trial_id,
                "whatever": {
                    "hierarchy": [
                        {"we just need a": "uuid-1", "to be able": "to merge"},
                        {"and": "uuid-2"},
                    ]
                },
            },
            gcs_xlsx_uri="",
            commit=False,
        )
        assay_upload.id = 137
        assay_upload.insert()

        custom_extra_md_parse = MagicMock()
        custom_extra_md_parse.side_effect = lambda f: {"extra_md": f.read().decode()}
        monkeypatch.setattr(
            "cidc_schemas.prism.merger.EXTRA_METADATA_PARSERS",
            {"assay_with_extra_md": custom_extra_md_parse},
        )

        form_data = {
            "job_id": 137,
            "uuid-1": (io.BytesIO(b"fake file 1"), "fname1"),
            "uuid-2": (io.BytesIO(b"fake file 2"), "fname2"),
        }

        client = cidc_api.test_client()
        res = client.post("/ingestion/extra-assay-metadata", data=form_data)
        assert res.status_code == 200
        assert custom_extra_md_parse.call_count == 2

        fetched_jobs = UploadJobs.list()
        assert 1 == len(fetched_jobs)
        au = fetched_jobs[0]
        assert "extra_md" in au.metadata_patch["whatever"]["hierarchy"][0]
        assert "extra_md" in au.metadata_patch["whatever"]["hierarchy"][1]
@contextmanager
def saved_failure_status(job: UploadJobs, session):
    """Save an upload failure to the database before raising an exception."""
    try:
        yield
    except Exception as e:
        job.status = UploadJobStatus.MERGE_FAILED.value
        job.status_details = str(e)
        session.commit()
        raise e
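# Usage sketch for the context manager above (illustrative only): any exception
# raised inside the block is persisted to the job row as MERGE_FAILED, with the
# exception text saved in status_details, before being re-raised to the caller.
# Assumes an UploadJobs record `job` and an open SQLAlchemy `session` in scope:
#
#     with saved_failure_status(job, session):
#         trial = TrialMetadata.patch_assays(trial_id, metadata_patch, session=session)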
def test_create_upload_job(db):
    """Try to create an upload job"""
    new_user = Users.create(PROFILE)

    gcs_file_uris = ["my/first/wes/blob1", "my/first/wes/blob2"]
    metadata_json_patch = {"foo": "bar"}

    # Create a fresh upload job
    new_job = UploadJobs.create(
        "dummy_assay", EMAIL, gcs_file_uris, metadata_json_patch
    )
    job = UploadJobs.find_by_id(new_job.id)
    assert_same_elements(new_job.gcs_file_uris, job.gcs_file_uris)
    assert job.status == "started"
def test_upload_job_no_file_map(clean_db):
    """Try to create an upload job with no file map (e.g., a manifest upload)"""
    new_user = Users.create(PROFILE)

    metadata_patch = {PROTOCOL_ID_FIELD_NAME: TRIAL_ID}
    gcs_xlsx_uri = "xlsx/assays/wes/12:0:1.5123095"

    TrialMetadata.create(TRIAL_ID, METADATA)

    new_job = UploadJobs.create(
        prism.SUPPORTED_MANIFESTS[0], EMAIL, None, metadata_patch, gcs_xlsx_uri
    )
    assert list(new_job.upload_uris_with_data_uris_with_uuids()) == []

    job = UploadJobs.find_by_id_and_email(new_job.id, PROFILE["email"])
    assert list(job.upload_uris_with_data_uris_with_uuids()) == []
def test_assay_or_analysis_preconditions(monkeypatch):
    """Ensure derive_files_from_assay_or_analysis_upload blocks derivation under the expected conditions."""
    find_by_id = MagicMock()
    find_by_id.return_value = None
    monkeypatch.setattr(upload_postprocessing.UploadJobs, "find_by_id", find_by_id)

    with pytest.raises(Exception, match="No upload record with id"):
        upload_postprocessing.derive_files_from_assay_or_analysis_upload(event, None)

    find_by_id.return_value = upload_job = UploadJobs(trial_id="foo")
    upload_job._set_status_no_validation(UploadJobStatus.MERGE_FAILED.value)

    with pytest.raises(Exception, match="status is merge-failed"):
        upload_postprocessing.derive_files_from_assay_or_analysis_upload(event, None)

    upload_job._set_status_no_validation(UploadJobStatus.MERGE_COMPLETED.value)

    # Ensure that file derivation happens so long as the upload record exists
    _derive_files = MagicMock()
    monkeypatch.setattr(upload_postprocessing, "_derive_files_from_upload", _derive_files)

    upload_postprocessing.derive_files_from_assay_or_analysis_upload(event, None)
    _derive_files.assert_called()
def derive_files_from_assay_or_analysis_upload(event: dict, context: BackgroundContext):
    """
    Generate derivative files from an assay or analysis upload.
    """
    upload_id = extract_pubsub_data(event)

    with sqlalchemy_session() as session:
        upload_record: UploadJobs = UploadJobs.find_by_id(upload_id, session=session)

        if not upload_record:
            raise Exception(f"No upload record with id {upload_id} found.")

        if UploadJobStatus(upload_record.status) != UploadJobStatus.MERGE_COMPLETED:
            raise Exception(
                f"Cannot perform postprocessing on upload {upload_id}: status is {upload_record.status}"
            )

        print(f"Received completed assay/analysis upload {upload_id} for postprocessing.")

        # Run the file derivation
        _derive_files_from_upload(upload_record.trial_id, upload_record.upload_type, session)
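# For reference: a background Cloud Function receives its Pub/Sub payload as a
# base64-encoded "data" field on the event dict. A minimal sketch of the decoding
# that extract_pubsub_data is assumed to perform (not the production helper):
#
#     import base64
#
#     def _decode_pubsub_data(event: dict) -> str:
#         return base64.b64decode(event["data"]).decode("utf-8")
#
#     # e.g. _decode_pubsub_data({"data": base64.b64encode(b"42").decode()}) == "42"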
def test_poll_upload_merge_status(cidc_api, clean_db, monkeypatch):
    """
    Check poll_upload_merge_status endpoint behavior
    """
    user_id = setup_trial_and_user(cidc_api, monkeypatch)
    with cidc_api.app_context():
        user = Users.find_by_id(user_id)
    make_cimac_biofx_user(user_id, cidc_api)

    metadata = {PROTOCOL_ID_FIELD_NAME: trial_id}

    with cidc_api.app_context():
        other_user = Users(email="*****@*****.**")
        other_user.insert()
        upload_job = UploadJobs.create(
            upload_type="wes",
            uploader_email=user.email,
            gcs_file_map={},
            metadata=metadata,
            gcs_xlsx_uri="",
        )
        upload_job.insert()
        upload_job_id = upload_job.id

    client = cidc_api.test_client()

    # Upload not found
    res = client.get(f"/ingestion/poll_upload_merge_status/12345?token={upload_job.token}")
    assert res.status_code == 404

    upload_job_url = (
        f"/ingestion/poll_upload_merge_status/{upload_job_id}?token={upload_job.token}"
    )

    # Upload not-yet-ready
    res = client.get(upload_job_url)
    assert res.status_code == 200
    assert "retry_in" in res.json and res.json["retry_in"] == 5
    assert "status" not in res.json

    test_details = "A human-friendly reason for this "
    for status in [
        UploadJobStatus.MERGE_COMPLETED.value,
        UploadJobStatus.MERGE_FAILED.value,
    ]:
        # Simulate cloud function merge status update
        with cidc_api.app_context():
            upload_job._set_status_no_validation(status)
            upload_job.status_details = test_details
            upload_job.update()

        # Upload ready
        res = client.get(upload_job_url)
        assert res.status_code == 200
        assert "retry_in" not in res.json
        assert "status" in res.json and res.json["status"] == status
        assert (
            "status_details" in res.json and res.json["status_details"] == test_details
        )
def test_assay_upload_merge_extra_metadata(clean_db, monkeypatch):
    """Try to merge extra metadata into an assay upload's metadata patch"""
    new_user = Users.create(PROFILE)
    TrialMetadata.create(TRIAL_ID, METADATA)
    assay_upload = UploadJobs.create(
        upload_type="assay_with_extra_md",
        uploader_email=EMAIL,
        gcs_file_map={},
        metadata={
            PROTOCOL_ID_FIELD_NAME: TRIAL_ID,
            "whatever": {
                "hierarchy": [
                    {"we just need a": "uuid-1", "to be able": "to merge"},
                    {"and": "uuid-2"},
                ]
            },
        },
        gcs_xlsx_uri="",
        commit=False,
    )
    assay_upload.id = 111
    clean_db.commit()

    custom_extra_md_parse = MagicMock()
    custom_extra_md_parse.side_effect = lambda f: {"extra": f.read().decode()}
    monkeypatch.setattr(
        "cidc_schemas.prism.merger.EXTRA_METADATA_PARSERS",
        {"assay_with_extra_md": custom_extra_md_parse},
    )

    UploadJobs.merge_extra_metadata(
        111,
        {
            "uuid-1": io.BytesIO(b"within extra md file 1"),
            "uuid-2": io.BytesIO(b"within extra md file 2"),
        },
        session=clean_db,
    )

    assert 1 == clean_db.query(UploadJobs).count()
    au = clean_db.query(UploadJobs).first()
    assert "extra" in au.metadata_patch["whatever"]["hierarchy"][0]
    assert "extra" in au.metadata_patch["whatever"]["hierarchy"][1]
def test_requires_upload_token_auth(cidc_api, clean_db, monkeypatch):
    """Check that the requires_upload_token_auth decorator works as expected"""
    user_id = setup_trial_and_user(cidc_api, monkeypatch)
    job_id = setup_upload_jobs(cidc_api)[0]
    with cidc_api.app_context():
        job = UploadJobs.find_by_id(job_id)

    test_route = "/foobarfoo"

    @requires_upload_token_auth
    def endpoint(*args, **kwargs):
        assert "upload_job" in kwargs
        return "ok", 200

    query_route = f"{test_route}/{job_id}"
    nonexistent_job_id = "9999999"

    # User must provide `token` query param
    with cidc_api.test_request_context(query_route):
        with pytest.raises(UnprocessableEntity) as e:
            endpoint(upload_job=job_id)
        assert e._excinfo[1].data["messages"]["query"]["token"] == [
            "Missing data for required field."
        ]

    # User must provide correct `token` query param
    with cidc_api.test_request_context(f"{query_route}?token={'bad token'}"):
        with pytest.raises(Unauthorized, match="upload_job token authentication failed"):
            endpoint(upload_job=job_id)

    with cidc_api.test_request_context(f"{query_route}?token={job.token}"):
        assert endpoint(upload_job=job_id) == ("ok", 200)

    # User whose id token authentication succeeds gets a 404 if the relevant job doesn't exist
    with cidc_api.test_request_context(f"{test_route}/{nonexistent_job_id}?token={job.token}"):
        with pytest.raises(NotFound):
            endpoint(upload_job=nonexistent_job_id)

    monkeypatch.setattr(
        "cidc_api.resources.upload_jobs.authenticate_and_get_user",
        lambda *args, **kwargs: None,
    )

    # User whose id token authentication fails can still successfully authenticate
    # using an upload token.
    with cidc_api.test_request_context(f"{query_route}?token={job.token}"):
        assert endpoint(upload_job=job_id) == ("ok", 200)

    # User whose id token authentication fails gets a 401 if the relevant job doesn't exist
    with cidc_api.test_request_context(f"{test_route}/{nonexistent_job_id}?token={job.token}"):
        with pytest.raises(Unauthorized, match="upload_job token authentication failed"):
            endpoint(upload_job=nonexistent_job_id)
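# The contract exercised above, roughly: requires_upload_token_auth looks up the
# UploadJobs record named by the `upload_job` route argument, returns 422 when the
# `token` query param is missing, 401 when the token doesn't match job.token (or
# when the job is missing for an unauthenticated caller), 404 when the job is
# missing for a caller whose id token did authenticate, and passes the job through
# to the view on success. A simplified sketch of the token comparison only, an
# assumption rather than the production decorator:
#
#     def _check_upload_token(job: UploadJobs, token: str) -> None:
#         if token != job.token:
#             raise Unauthorized("upload_job token authentication failed")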
def setup_db_records(cidc_api):
    extra = {"_etag": ETAG}
    with cidc_api.app_context():
        Users(**users["json"], **extra).insert(compute_etag=False)
        TrialMetadata(**trial_metadata["json"], **extra).insert(compute_etag=False)
        DownloadableFiles(**downloadable_files["json"], **extra).insert(
            compute_etag=False
        )
        Permissions(**permissions["json"], **extra).insert(compute_etag=False)
        UploadJobs(**upload_jobs["json"], **extra).insert(compute_etag=False)
def test_new_upload_alert(monkeypatch):
    vals = {"id": 1, "trial_id": "foo", "uploader_email": "*****@*****.**"}

    gen_confs = MagicMock()
    gen_confs.side_effect = (
        lambda ct, patch, template_type, bucket: {"attach.file": "content"}
        if "wes" in template_type
        else {}
    )
    monkeypatch.setattr(
        "cidc_api.shared.emails.generate_analysis_configs_from_upload_patch", gen_confs
    )

    for upload, full_ct, expected_att in [
        (
            UploadJobs(**vals, upload_type="wes_bam", metadata_patch={"assays": {"wes": []}}),
            {"assays": {"wes": []}},
            [
                {
                    "content": "Y29udGVudA==",  # "content" base64 encoded
                    "filename": "attach.file",
                    "type": "application/yaml",
                }
            ],
        ),
        (UploadJobs(**vals, upload_type="pbmc", metadata_patch={}), {}, None),
    ]:
        email = new_upload_alert(upload, full_ct)
        assert "UPLOAD SUCCESS" in email["subject"]
        assert email["to_emails"] == [CIDC_MAILING_LIST]
        for val in vals.values():
            assert str(val) in email["html_content"]
        assert gen_confs.called
        assert email.get("attachments") == expected_att
def test_create_assay_upload(clean_db):
    """Try to create an assay upload"""
    new_user = Users.create(PROFILE)

    gcs_file_map = {
        "my/first/wes/blob1/2019-08-30T15:51:38.450978": "test-uuid-1",
        "my/first/wes/blob2/2019-08-30T15:51:38.450978": "test-uuid-2",
    }
    metadata_patch = {PROTOCOL_ID_FIELD_NAME: TRIAL_ID}
    gcs_xlsx_uri = "xlsx/assays/wes/12:0:1.5123095"

    # Should fail, since trial doesn't exist yet
    with pytest.raises(IntegrityError):
        UploadJobs.create("wes_bam", EMAIL, gcs_file_map, metadata_patch, gcs_xlsx_uri)
    clean_db.rollback()

    TrialMetadata.create(TRIAL_ID, METADATA)

    new_job = UploadJobs.create(
        "wes_bam", EMAIL, gcs_file_map, metadata_patch, gcs_xlsx_uri
    )
    job = UploadJobs.find_by_id_and_email(new_job.id, PROFILE["email"])

    assert len(new_job.gcs_file_map) == len(job.gcs_file_map)
    assert set(new_job.gcs_file_map) == set(job.gcs_file_map)
    assert job.status == "started"

    assert list(job.upload_uris_with_data_uris_with_uuids()) == [
        (
            "my/first/wes/blob1/2019-08-30T15:51:38.450978",
            "my/first/wes/blob1",
            "test-uuid-1",
        ),
        (
            "my/first/wes/blob2/2019-08-30T15:51:38.450978",
            "my/first/wes/blob2",
            "test-uuid-2",
        ),
    ]
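# The expected tuples above imply that each data URI is its upload URI with the
# trailing upload-timestamp path segment stripped, paired with the uuid from
# gcs_file_map. A sketch of that transformation, inferred from the asserted
# values rather than taken from the model implementation:
#
#     def _upload_uri_to_data_uri(upload_uri: str) -> str:
#         return upload_uri.rsplit("/", 1)[0]
#
#     # _upload_uri_to_data_uri("my/first/wes/blob1/2019-08-30T15:51:38.450978")
#     # == "my/first/wes/blob1"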
def setup_upload_jobs(cidc_api) -> Tuple[int, int]:
    """
    Insert two uploads into the database created by different users
    and return their IDs.
    """
    with cidc_api.app_context():
        other_user = Users(email="*****@*****.**")
        other_user.insert()

        job1 = UploadJobs(
            uploader_email=user_email,
            trial_id=trial_id,
            status=UploadJobStatus.STARTED.value,
            metadata_patch={"test": {"upload_placeholder": "baz"}, "test2": "foo"},
            upload_type="",
            gcs_xlsx_uri="",
            gcs_file_map={"bip": "baz"},
            multifile=False,
        )
        job2 = UploadJobs(
            uploader_email=other_user.email,
            trial_id=trial_id,
            status=UploadJobStatus.STARTED.value,
            metadata_patch={"array": [{"upload_placeholder": "baz"}, {"test2": "foo"}]},
            upload_type="",
            gcs_xlsx_uri="",
            gcs_file_map={"bip": "baz"},
            multifile=False,
        )

        job1.insert()
        job2.insert()

        return job1.id, job2.id
def derive_files_from_manifest_upload(event: dict, context: BackgroundContext):
    """
    Generate derivative files from a manifest upload.
    """
    upload_id = extract_pubsub_data(event)

    with sqlalchemy_session() as session:
        upload_record: UploadJobs = UploadJobs.find_by_id(upload_id, session=session)
        if not upload_record:
            raise Exception(f"No manifest upload record found with id {upload_id}.")

        print(f"Received completed manifest upload {upload_id} for postprocessing.")

        # Run the file derivation
        _derive_files_from_upload(upload_record.trial_id, upload_record.upload_type, session)
def test_upload_olink(cidc_api, clean_db, monkeypatch):
    """Ensure the upload endpoint follows the expected execution flow"""
    user_id = setup_trial_and_user(cidc_api, monkeypatch)
    with cidc_api.app_context():
        user = Users.find_by_id(user_id)
    make_cimac_biofx_user(user_id, cidc_api)

    client = cidc_api.test_client()

    mocks = UploadMocks(
        monkeypatch,
        prismify_file_entries=[
            finfo(lp, url, "uuid" + str(i), "npx" in url, False)
            for i, (lp, url) in enumerate(OLINK_TESTDATA)
        ],
    )

    # No permission to upload yet
    res = client.post(ASSAY_UPLOAD, data=form_data("olink.xlsx", io.BytesIO(b"1234"), "olink"))
    assert res.status_code == 401
    assert "not authorized to upload olink data" in str(res.json["_error"]["message"])

    mocks.clear_all()

    # Give permission and retry
    grant_upload_permission(user_id, "olink", cidc_api)

    res = client.post(ASSAY_UPLOAD, data=form_data("olink.xlsx", io.BytesIO(b"1234"), "olink"))
    assert res.status_code == 200

    assert "url_mapping" in res.json
    url_mapping = res.json["url_mapping"]

    # Olink assay has extra_metadata files
    assert "extra_metadata" in res.json
    extra_metadata = res.json["extra_metadata"]
    assert type(extra_metadata) == dict

    # We expect local_path to map to a gcs object name with gcs_prefix.
    for local_path, gcs_prefix in OLINK_TESTDATA:
        gcs_object_name = url_mapping[local_path]
        assert local_path in url_mapping
        assert gcs_object_name.startswith(gcs_prefix)
        assert (
            local_path not in gcs_object_name
        ), "PHI from local_path shouldn't end up in gcs urls"

    # Check that we tried to grant IAM upload access to gcs_object_name
    mocks.grant_write.assert_called_with(user.email)

    # Check that we tried to upload the assay metadata excel file
    mocks.upload_xlsx.assert_called_once()

    job_id = res.json["job_id"]
    update_url = f"/upload_jobs/{job_id}"

    # Report an upload failure
    res = client.patch(
        f"{update_url}?token={res.json['token']}",
        json={"status": UploadJobStatus.UPLOAD_FAILED.value},
        headers={"If-Match": res.json["job_etag"]},
    )
    assert res.status_code == 200
    mocks.revoke_write.assert_called_with(user.email)
    # This was an upload failure, so success shouldn't have been published
    mocks.publish_success.assert_not_called()

    # Test upload status validation - since the upload job's current status
    # is UPLOAD_FAILED, the API shouldn't permit this status to be updated to
    # UPLOAD_COMPLETED.
    bad_res = client.patch(
        f"{update_url}?token={res.json['token']}",
        json={"status": UploadJobStatus.UPLOAD_COMPLETED.value},
        headers={"If-Match": res.json["_etag"]},
    )
    assert bad_res.status_code == 400
    assert (
        "status upload-failed can't transition to status upload-completed"
        in bad_res.json["_error"]["message"]
    )

    # Reset the upload status and try the request again
    with cidc_api.app_context():
        job = UploadJobs.find_by_id_and_email(job_id, user.email)
        job._set_status_no_validation(UploadJobStatus.STARTED.value)
        job.update()
        _etag = job._etag

    res = client.patch(
        f"{update_url}?token={res.json['token']}",
        json={"status": UploadJobStatus.UPLOAD_COMPLETED.value},
        headers={"If-Match": _etag},
    )
    assert res.status_code == 200
    mocks.publish_success.assert_called_with(job_id)
def test_upload_wes(cidc_api, clean_db, monkeypatch):
    """Ensure the upload endpoint follows the expected execution flow"""
    user_id = setup_trial_and_user(cidc_api, monkeypatch)
    make_cimac_biofx_user(user_id, cidc_api)
    with cidc_api.app_context():
        user = Users.find_by_id(user_id)

    client = cidc_api.test_client()

    mocks = UploadMocks(
        monkeypatch,
        prismify_file_entries=[
            finfo("localfile.ext", "test_trial/url/file.ext", "uuid-1", None, False)
        ],
    )

    # No permission to upload yet
    res = client.post(ASSAY_UPLOAD, data=form_data("wes.xlsx", io.BytesIO(b"1234"), "wes_fastq"))
    assert res.status_code == 401
    assert "not authorized to upload wes_fastq data" in str(res.json["_error"]["message"])

    mocks.clear_all()

    # Give permission and retry
    grant_upload_permission(user_id, "wes_fastq", cidc_api)

    res = client.post(ASSAY_UPLOAD, data=form_data("wes.xlsx", io.BytesIO(b"1234"), "wes_fastq"))
    assert res.status_code == 200

    assert "url_mapping" in res.json
    url_mapping = res.json["url_mapping"]

    # WES assay does not have any extra_metadata files, but its (and every assay's) response
    # should have an extra_metadata field.
    assert "extra_metadata" in res.json
    extra_metadata = res.json["extra_metadata"]
    assert extra_metadata is None

    # We expect local_path to map to a gcs object name with gcs_prefix
    local_path = "localfile.ext"
    gcs_prefix = "test_trial/url/file.ext"
    gcs_object_name = url_mapping[local_path]
    assert local_path in url_mapping
    assert gcs_object_name.startswith(gcs_prefix)
    assert not gcs_object_name.endswith(
        local_path
    ), "PHI from local_path shouldn't end up in gcs urls"

    # Check that we tried to grant IAM upload access to gcs_object_name
    mocks.grant_write.assert_called_with(user.email)

    # Check that we tried to upload the assay metadata excel file
    mocks.upload_xlsx.assert_called_once()

    job_id = res.json["job_id"]
    update_url = f"/upload_jobs/{job_id}"

    # Report an upload failure
    res = client.patch(
        f"{update_url}?token={res.json['token']}",
        json={"status": UploadJobStatus.UPLOAD_FAILED.value},
        headers={"If-Match": res.json["job_etag"]},
    )
    assert res.status_code == 200
    mocks.revoke_write.assert_called_with(user.email)
    # This was an upload failure, so success shouldn't have been published
    mocks.publish_success.assert_not_called()

    # Reset the upload status and try the request again
    with cidc_api.app_context():
        job = UploadJobs.find_by_id_and_email(job_id, user.email)
        job._set_status_no_validation(UploadJobStatus.STARTED.value)
        job.update()
        _etag = job._etag

    # Report an upload success
    res = client.patch(
        f"{update_url}?token={res.json['token']}",
        json={"status": UploadJobStatus.UPLOAD_COMPLETED.value},
        headers={"If-Match": _etag},
    )
    assert res.status_code == 200
    mocks.publish_success.assert_called_with(job_id)
def test_update_upload_job(cidc_api, clean_db, monkeypatch):
    """Check that updating an upload job by ID works as expected."""
    user_id = setup_trial_and_user(cidc_api, monkeypatch)
    user_job, other_job = setup_upload_jobs(cidc_api)
    with cidc_api.app_context():
        user_job_record = UploadJobs.find_by_id(user_job)
        other_job_record = UploadJobs.find_by_id(other_job)

    publish_success = MagicMock()
    monkeypatch.setattr("cidc_api.shared.gcloud_client.publish_upload_success", publish_success)
    revoke_upload_access = MagicMock()
    monkeypatch.setattr("cidc_api.shared.gcloud_client.revoke_upload_access", revoke_upload_access)

    client = cidc_api.test_client()

    # Possible patches
    upload_success = {"status": UploadJobStatus.UPLOAD_COMPLETED.value}
    upload_failure = {"status": UploadJobStatus.UPLOAD_FAILED.value}
    invalid_update = {"status": UploadJobStatus.MERGE_COMPLETED.value}

    # A user gets an error if they fail to provide an upload token
    res = client.patch(f"/upload_jobs/{other_job}", json=upload_success)
    assert res.status_code == 422
    publish_success.assert_not_called()
    revoke_upload_access.assert_not_called()

    # A user gets an authentication error if they provide an incorrect upload token
    res = client.patch(
        f"/upload_jobs/{other_job}?token=nope",
        headers={"if-match": other_job_record._etag},
        json=upload_success,
    )
    assert res.status_code == 401
    assert res.json["_error"]["message"] == "upload_job token authentication failed"
    publish_success.assert_not_called()
    revoke_upload_access.assert_not_called()

    # A user gets an error if they try to update something besides the job's status
    res = client.patch(
        f"/upload_jobs/{other_job}?token={other_job_record.token}",
        headers={"if-match": other_job_record._etag},
        json={"uploader_email": "*****@*****.**", "status": ""},
    )
    assert res.status_code == 422
    assert res.json["_error"]["message"]["uploader_email"][0] == "Unknown field."

    # A user providing a correct token can update their job's status to be a failure
    res = client.patch(
        f"/upload_jobs/{other_job}?token={other_job_record.token}",
        headers={"if-match": other_job_record._etag},
        json={"gcs_file_map": {"foo": "bar"}, **upload_failure},
    )
    assert res.status_code == 200
    publish_success.assert_not_called()
    revoke_upload_access.assert_called_once()
    revoke_upload_access.reset_mock()

    with cidc_api.app_context():
        modified_job = UploadJobs.find_by_id(other_job)
        assert modified_job.metadata_patch == {"array": [{"test2": "foo"}]}

        user_job_record._set_status_no_validation(UploadJobStatus.STARTED.value)
        user_job_record.update()

    # A user can update a job to be a success
    # Also allows for updating the gcs_file_map and thereby the metadata_patch
    res = client.patch(
        f"/upload_jobs/{user_job}?token={user_job_record.token}",
        headers={"if-match": user_job_record._etag},
        json={"gcs_file_map": {"foo": "bar"}, **upload_success},
    )
    assert res.status_code == 200
    publish_success.assert_called_once_with(user_job)
    revoke_upload_access.assert_called_once()

    with cidc_api.app_context():
        modified_job = UploadJobs.find_by_id(user_job)
        assert modified_job.gcs_file_map == {"foo": "bar"}
        assert modified_job.metadata_patch == {"test2": "foo"}

    publish_success.reset_mock()
    revoke_upload_access.reset_mock()

    with cidc_api.app_context():
        user_job_record._set_status_no_validation(UploadJobStatus.STARTED.value)
        user_job_record.update()

    # Users can't make an illegal state transition
    res = client.patch(
        f"/upload_jobs/{user_job}?token={user_job_record.token}",
        headers={"if-match": user_job_record._etag},
        json=invalid_update,
    )
    assert res.status_code == 400
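# The two successful PATCH requests above suggest that replacing gcs_file_map also
# prunes metadata_patch entries whose "upload_placeholder" uuid is no longer
# referenced by the new file map, e.g. {"test": {"upload_placeholder": "baz"}, "test2": "foo"}
# collapses to {"test2": "foo"}. A rough illustration of that pruning rule for a flat
# list of records (an assumption, not the production logic):
#
#     def _prune_placeholders(records: list, keep_uuids: set) -> list:
#         return [
#             r for r in records
#             if "upload_placeholder" not in r or r["upload_placeholder"] in keep_uuids
#         ]
#
#     # _prune_placeholders([{"upload_placeholder": "baz"}, {"test2": "foo"}], set())
#     # == [{"test2": "foo"}]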
def ingest_upload(event: dict, context: BackgroundContext):
    """
    When a successful upload event is published, move the data associated
    with the upload job into the download bucket and merge the upload metadata
    into the appropriate clinical trial JSON.
    """
    storage_client = storage.Client()

    job_id = int(extract_pubsub_data(event))

    logger.info(f"ingest_upload execution started on upload job id {job_id}")

    with sqlalchemy_session() as session:
        job: UploadJobs = UploadJobs.find_by_id(job_id, session=session)

        # Check ingestion pre-conditions
        if not job:
            raise Exception(f"No assay upload job with id {job_id} found.")
        if UploadJobStatus(job.status) != UploadJobStatus.UPLOAD_COMPLETED:
            raise Exception(
                f"Received ID for job with status {job.status}. Aborting ingestion."
            )
        trial_id = job.metadata_patch.get(prism.PROTOCOL_ID_FIELD_NAME)
        if not trial_id:
            # We should never hit this, since metadata should be pre-validated.
            with saved_failure_status(job, session):
                raise Exception(
                    f"Invalid assay metadata: missing protocol identifier ({prism.PROTOCOL_ID_FIELD_NAME})."
                )

        logger.info(
            f"Found completed upload job (job_id={job_id}) with uploader {job.uploader_email}"
        )

        url_bundles = [
            URLBundle(*bundle) for bundle in job.upload_uris_with_data_uris_with_uuids()
        ]

        # Copy GCS blobs in parallel
        logger.info("Copying artifacts from upload bucket to data bucket.")
        with ThreadPoolExecutor(THREADPOOL_THREADS) as executor, saved_failure_status(
            job, session
        ):
            destination_objects = executor.map(
                lambda url_bundle: _gcs_copy(
                    storage_client,
                    GOOGLE_UPLOAD_BUCKET,
                    url_bundle.upload_url,
                    GOOGLE_DATA_BUCKET,
                    url_bundle.target_url,
                ),
                url_bundles,
            )

        metadata_patch = job.metadata_patch
        logger.info("Adding artifact metadata to metadata patch.")
        metadata_patch, downloadable_files = TrialMetadata.merge_gcs_artifacts(
            metadata_patch,
            job.upload_type,
            zip([ub.artifact_uuid for ub in url_bundles], destination_objects),
        )

        # Add metadata for this upload to the database
        logger.info(
            "Merging metadata from upload %d into trial %s: " % (job.id, trial_id),
            metadata_patch,
        )
        with saved_failure_status(job, session):
            trial = TrialMetadata.patch_assays(trial_id, metadata_patch, session=session)

        # Save downloadable files to the database
        # NOTE: this needs to happen after TrialMetadata.patch_assays
        # in order to avoid violating a foreign-key constraint on the trial_id
        # in the event that this is the first upload for a trial.
        logger.info("Saving artifact records to the downloadable_files table.")
        for artifact_metadata, additional_metadata in downloadable_files:
            logger.debug(
                f"Saving metadata to downloadable_files table: {artifact_metadata}"
            )
            DownloadableFiles.create_from_metadata(
                trial_id,
                job.upload_type,
                artifact_metadata,
                additional_metadata=additional_metadata,
                session=session,
                commit=False,
            )

        # Additionally, make the metadata xlsx a downloadable file
        with saved_failure_status(job, session):
            _, xlsx_blob = _get_bucket_and_blob(
                storage_client, GOOGLE_DATA_BUCKET, job.gcs_xlsx_uri
            )
            full_uri = f"gs://{GOOGLE_DATA_BUCKET}/{xlsx_blob.name}"
            data_format = "Assay Metadata"
            facet_group = f"{job.upload_type}|{data_format}"
            logger.info(f"Saving {full_uri} as a downloadable_file.")
            DownloadableFiles.create_from_blob(
                trial_id,
                job.upload_type,
                data_format,
                facet_group,
                xlsx_blob,
                session=session,
            )

        # Update the job metadata to include artifacts
        job.metadata_patch = metadata_patch

        # Make files downloadable by a specified biofx analysis team group
        assay_prefix = job.upload_type.split("_")[0]  # 'wes_bam' -> 'wes'
        if assay_prefix in GOOGLE_ANALYSIS_PERMISSIONS_GROUPS_DICT:
            analysis_group_email = GOOGLE_ANALYSIS_PERMISSIONS_GROUPS_DICT[assay_prefix]
            _gcs_add_prefix_reader_permission(
                storage_client,
                analysis_group_email,  # to whom to grant access
                f"{trial_id}/{assay_prefix}",  # to which sub-folder
            )

        # Save the upload success and trigger email alert if transaction succeeds
        job.ingestion_success(trial, session=session, send_email=True, commit=True)

        # Trigger post-processing on uploaded data files
        logger.info("Publishing object URLs to 'artifact_upload' topic")
        with ThreadPoolExecutor(THREADPOOL_THREADS) as executor:
            executor.map(
                lambda url_bundle: publish_artifact_upload(url_bundle.target_url),
                url_bundles,
            )

        # Trigger post-processing on entire upload
        report = _encode_and_publish(str(job.id), GOOGLE_ASSAY_OR_ANALYSIS_UPLOAD_TOPIC)
        if report:
            report.result()

    # Google won't actually do anything with this response; it's
    # provided for testing purposes only.
    return jsonify(
        dict((bundle.upload_url, bundle.target_url) for bundle in url_bundles)
    )
def test_migrations_failures(use_upload_jobs_table, monkeypatch):
    """Test that changes get rolled back in potential failure scenarios."""
    # Mock alembic
    monkeypatch.setattr(migrations, "op", MagicMock())

    # Mock sqlalchemy
    mock_session_builder = MagicMock()
    mock_session = MagicMock()
    mock_session_builder.return_value = mock_session
    monkeypatch.setattr(migrations, "Session", mock_session_builder)

    # Mock cidc_api and prism functions
    trial_record = MagicMock()
    select_trials = MagicMock()
    select_trials.return_value = [trial_record]
    monkeypatch.setattr(migrations, "_select_trials", select_trials)

    df_record = MagicMock()
    select_df = MagicMock()
    select_df.return_value = df_record
    monkeypatch.setattr(DownloadableFiles, "get_by_object_url", select_df)

    select_assay_uploads = MagicMock()
    select_assay_uploads.return_value = [
        UploadJobs(gcs_file_map={"a_old_url/ts": "foo", "b_old_url/ts": "bar"})
    ]
    monkeypatch.setattr(migrations, "_select_successful_assay_uploads", select_assay_uploads)

    if not use_upload_jobs_table:
        select_manifest_uploads = MagicMock()
        select_manifest_uploads.return_value = [MagicMock()]
        monkeypatch.setattr(migrations, "_select_manifest_uploads", select_manifest_uploads)

    new_metadata = {
        "some_assay": {
            "extra": "metadata",
            "files": [{"upload_placeholder": "foo"}, {"upload_placeholder": "bar"}],
        }
    }

    mock_migration = MagicMock()
    mock_migration.return_value = MigrationResult(
        new_metadata,
        {
            "a_old_url": {"object_url": "a_new_url", "upload_placeholder": "foo"},
            "b_old_url": {"object_url": "b_new_url", "upload_placeholder": "bar"},
        },
    )

    rename_gcs_obj = MagicMock()
    monkeypatch.setattr(migrations, "rename_gcs_blob", rename_gcs_obj)

    def reset_mocks():
        rename_gcs_obj.reset_mock()
        mock_session.commit.reset_mock()
        mock_session.rollback.reset_mock()
        mock_session.close.reset_mock()

    # GCS failure config
    rename_gcs_obj.side_effect = [None, Exception("gcs failure"), None]
    with pytest.raises(Exception, match="gcs failure"):
        run_metadata_migration(mock_migration, use_upload_jobs_table)
    # Called 3 times - task 1 succeeds, task 2 fails, task 1 rolls back
    assert len(rename_gcs_obj.call_args_list) == 3
    mock_session.commit.assert_not_called()
    mock_session.rollback.assert_called_once()
    mock_session.close.assert_called_once()
    rename_gcs_obj.side_effect = None
    reset_mocks()

    # SQL failure
    select_assay_uploads.side_effect = Exception("sql failure")
    with pytest.raises(Exception, match="sql failure"):
        run_metadata_migration(mock_migration, use_upload_jobs_table)
    mock_session.commit.assert_not_called()
    mock_session.rollback.assert_called_once()
    mock_session.close.assert_called_once()
    # Ensure no GCS operations were carried out
    rename_gcs_obj.assert_not_called()
    reset_mocks()

    # No failures
    select_assay_uploads.side_effect = None
    run_metadata_migration(mock_migration, use_upload_jobs_table)
    # Ensure we updated trials as expected
    trial_record.safely_set_metadata_json.assert_called_with(new_metadata)
    # Ensure we updated files as expected
    assert df_record.object_url == "b_new_url"
    assert df_record.additional_metadata == {"some_assay.extra": "metadata"}
    # Ensure we renamed the right objects
    assert rename_gcs_obj.call_args_list == [
        call(GOOGLE_DATA_BUCKET, "a_old_url", "a_new_url"),
        call(GOOGLE_DATA_BUCKET, "b_old_url", "b_new_url"),
    ]
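# MigrationResult is constructed above with (new metadata JSON, a mapping from old
# object URL to {"object_url": <new url>, "upload_placeholder": <uuid>}). A stand-in
# consistent with that call signature, for illustration only; the real class and its
# field names live in the migrations module:
#
#     from typing import NamedTuple
#
#     class MigrationResult(NamedTuple):
#         metadata: dict      # migrated trial metadata JSON
#         file_updates: dict  # old object_url -> {"object_url": ..., "upload_placeholder": ...}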
def setup_data(cidc_api, clean_db):
    user = Users(email="*****@*****.**", approval_date=datetime.now())
    shipment = {
        "courier": "FEDEX",
        "ship_to": "",
        "ship_from": "",
        "assay_type": assay_type,
        "manifest_id": manifest_id,
        "date_shipped": "2020-06-10 00:00:00",
        "date_received": "2020-06-11 00:00:00",
        "account_number": "",
        "assay_priority": "1",
        "receiving_party": "MSSM_Rahman",
        "tracking_number": "",
        "shipping_condition": "Frozen_Dry_Ice",
        "quality_of_shipment": "Specimen shipment received in good condition",
    }
    metadata = {
        "protocol_identifier": trial_id,
        "shipments": [
            # we get duplicate shipment uploads sometimes
            shipment,
            shipment,
        ],
        "participants": [
            {
                "cimac_participant_id": f"CTTTPP{p}",
                "participant_id": "x",
                "cohort_name": "",
                "samples": [
                    {
                        "cimac_id": f"CTTTPP{p}SS.0{s}",
                        "sample_location": "",
                        "type_of_primary_container": "Other",
                        "type_of_sample": "Other",
                        "collection_event_name": "",
                        "parent_sample_id": "",
                    }
                    for s in range(num_samples[p])
                ],
            }
            for p in range(num_participants)
        ],
        "allowed_cohort_names": [""],
        "allowed_collection_event_names": [""],
    }
    trial = TrialMetadata(trial_id=trial_id, metadata_json=metadata)
    upload_job = UploadJobs(
        uploader_email=user.email,
        trial_id=trial.trial_id,
        upload_type="pbmc",
        gcs_xlsx_uri="",
        metadata_patch=metadata,
        multifile=False,
    )
    upload_job._set_status_no_validation(UploadJobStatus.MERGE_COMPLETED.value)

    with cidc_api.app_context():
        user.insert()
        trial.insert()
        upload_job.insert()

        clean_db.refresh(user)
        clean_db.refresh(upload_job)
        clean_db.refresh(trial)

    return user, upload_job, trial
def test_ingest_upload(caplog, monkeypatch):
    """Test upload data transfer functionality"""

    TS_AND_PATH = "/1234/local_path1.txt"
    ARTIFACT = {"test-prop": "test-val"}
    TRIAL_ID = "CIMAC-12345"

    job = UploadJobs(
        id=JOB_ID,
        uploader_email="*****@*****.**",
        trial_id=TRIAL_ID,
        gcs_xlsx_uri="test.xlsx",
        gcs_file_map=FILE_MAP,
        metadata_patch={
            prism.PROTOCOL_ID_FIELD_NAME: TRIAL_ID,
            "assays": {
                "wes": [
                    {
                        "records": [
                            {
                                "cimac_id": "CIMAC-mock-sa-id",
                                "files": {
                                    "r1": {"upload_placeholder": "uuid1"},
                                    "r2": {"upload_placeholder": "uuid2"},
                                },
                            }
                        ]
                    }
                ]
            },
        },
        status=UploadJobStatus.UPLOAD_COMPLETED.value,
        upload_type="wes_bam",
    )

    # Since the test database isn't yet set up with migrations,
    # it won't have the correct relations in it, so we can't actually
    # store or retrieve data
    find_by_id = MagicMock()
    find_by_id.return_value = job
    monkeypatch.setattr(UploadJobs, "find_by_id", find_by_id)

    # Mock data transfer functionality
    _gcs_copy = MagicMock()
    _gcs_copy.side_effect = lambda storage_client, source_bucket, source_object, target_bucket, target_object: _gcs_obj_mock(
        target_object,
        100,
        datetime.datetime.now(),
        "gsc_url_mock_md5",
        "gsc_url_mock_crc32c",
    )
    monkeypatch.setattr("functions.uploads._gcs_copy", _gcs_copy)

    _get_bucket_and_blob = MagicMock()
    xlsx_blob = MagicMock()
    _get_bucket_and_blob.return_value = None, xlsx_blob
    monkeypatch.setattr("functions.uploads._get_bucket_and_blob", _get_bucket_and_blob)

    monkeypatch.setattr(
        "functions.uploads.GOOGLE_ANALYSIS_PERMISSIONS_GROUPS_DICT",
        {"wes": "analysis-group@email"},
    )

    # mocking `google.cloud.storage.Client()` to not actually create a client
    _storage_client = MagicMock("_storage_client")
    monkeypatch.setattr("functions.uploads.storage.Client", lambda *a, **kw: _storage_client)

    _bucket = MagicMock("_bucket")
    _storage_client.get_bucket = lambda *a, **kw: _bucket
    _storage_client._connection = _connection = MagicMock("_connection")
    _api_request = _connection.api_request = MagicMock("_connection.api_request")
    _api_request.return_value = {"bindings": []}

    _bucket.set_iam_policy = _set_iam_policy = MagicMock("_bucket.set_iam_policy")
    _bucket.get_iam_policy = _get_iam_policy = MagicMock("_bucket.get_iam_policy")

    _policy = _get_iam_policy.return_value = MagicMock("_policy")

    iam_prefix = f'resource.name.startsWith("projects/_/buckets/cidc-data-staging/objects/{TRIAL_ID}/wes/")'

    # This set up checks handling duplicate bindings
    _policy.bindings = [
        {
            "role": GOOGLE_ANALYSIS_GROUP_ROLE,
            "members": {"group:analysis-group@email"},
            "condition": {"expression": iam_prefix},
        }
    ]

    # Mock metadata merging functionality
    _save_file = MagicMock("_save_file")
    monkeypatch.setattr(DownloadableFiles, "create_from_metadata", _save_file)

    _save_blob_file = MagicMock("_save_blob_file")
    monkeypatch.setattr(DownloadableFiles, "create_from_blob", _save_blob_file)

    _merge_metadata = MagicMock("_merge_metadata")
    monkeypatch.setattr(TrialMetadata, "patch_assays", _merge_metadata)

    publish_artifact_upload = MagicMock("publish_artifact_upload")
    monkeypatch.setattr(uploads, "publish_artifact_upload", publish_artifact_upload)

    _encode_and_publish = MagicMock("_encode_and_publish")
    monkeypatch.setattr(uploads, "_encode_and_publish", _encode_and_publish)

    successful_upload_event = make_pubsub_event(str(job.id))
    response = ingest_upload(successful_upload_event, None).json

    assert response[URI1 + UPLOAD_DATE_PATH] == URI1
    assert response[URI2 + UPLOAD_DATE_PATH] == URI2

    find_by_id.assert_called_once()
    # Check that we copied multiple objects
    _gcs_copy.assert_called()
    assert _gcs_copy.call_count > 1
    # Check that we tried to save multiple files
    _save_file.assert_called()
    assert _save_file.call_count > 1
    # Check that we tried to merge metadata once
    _merge_metadata.assert_called_once()
    # Check that we got the xlsx blob metadata from GCS
    _get_bucket_and_blob.assert_called_with(_storage_client, GOOGLE_DATA_BUCKET, job.gcs_xlsx_uri)
    # Check that we created a downloadable file for the xlsx file blob
    assert _save_blob_file.call_args[:-1][0] == (
        "CIMAC-12345",
        "wes_bam",
        "Assay Metadata",
        "wes_bam|Assay Metadata",
        xlsx_blob,
    )
    # Check that we tried to update the GCS access policy
    _set_iam_policy.assert_called_once()
    # Check that we added GCS access for the biofx team
    assert _policy == _set_iam_policy.call_args[0][0]
    assert len(_policy.bindings) == 1
    assert _policy.bindings[0]["members"] == {"group:analysis-group@email"}
    assert _policy.bindings[0]["role"] == "projects/cidc-dfci-staging/roles/CIDC_biofx"
    assert iam_prefix in _policy.bindings[0]["condition"]["expression"]
    _until = datetime.datetime.today() + datetime.timedelta(
        GOOGLE_ANALYSIS_PERMISSIONS_GRANT_FOR_DAYS
    )
    assert (
        f'request.time < timestamp("{_until.date().isoformat()}T00:00:00Z")'
        in _policy.bindings[0]["condition"]["expression"]
    )

    # Check that the job status was updated to reflect a successful upload
    assert job.status == UploadJobStatus.MERGE_COMPLETED.value
    assert email_was_sent(caplog.text)
    publish_artifact_upload.assert_called()
    _encode_and_publish.assert_called()