def derive_files_from_assay_or_analysis_upload(event: dict, context: BackgroundContext):
    """
    Generate derivative files from an assay or analysis upload.
    """
    upload_id = extract_pubsub_data(event)

    with sqlalchemy_session() as session:
        upload_record: UploadJobs = UploadJobs.find_by_id(upload_id, session=session)

        if not upload_record:
            raise Exception(f"No upload record with id {upload_id} found.")
        if UploadJobStatus(upload_record.status) != UploadJobStatus.MERGE_COMPLETED:
            raise Exception(
                f"Cannot perform postprocessing on upload {upload_id}: status is {upload_record.status}"
            )

        print(
            f"Received completed assay/analysis upload {upload_id} for postprocessing."
        )

        # Run the file derivation
        _derive_files_from_upload(
            upload_record.trial_id, upload_record.upload_type, session
        )
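

# The handler above relies on extract_pubsub_data, which is not shown here. Below is a
# minimal sketch of what such a helper could look like, assuming the standard
# base64-encoded "data" field of a Google Cloud background Pub/Sub event; it is an
# illustration, not the project's actual implementation.
import base64


def extract_pubsub_data_sketch(event: dict) -> str:
    """Decode the base64-encoded payload of a background Pub/Sub event."""
    return base64.b64decode(event["data"]).decode("utf-8")


# Illustrative event shape: {"data": base64.b64encode(b"42").decode()} -> "42"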


def test_assay_upload_ingestion_success(clean_db, monkeypatch, caplog):
    """Check that the ingestion success method works as expected"""
    caplog.set_level(logging.DEBUG)

    new_user = Users.create(PROFILE)
    trial = TrialMetadata.create(TRIAL_ID, METADATA)
    assay_upload = UploadJobs.create(
        upload_type="ihc",
        uploader_email=EMAIL,
        gcs_file_map={},
        metadata={PROTOCOL_ID_FIELD_NAME: TRIAL_ID},
        gcs_xlsx_uri="",
        commit=False,
    )
    clean_db.commit()

    # Ensure that success can't be declared from a starting state
    with pytest.raises(Exception, match="current status"):
        assay_upload.ingestion_success(trial)

    # Update assay_upload status to simulate a completed but not ingested upload
    assay_upload.status = UploadJobStatus.UPLOAD_COMPLETED.value
    assay_upload.ingestion_success(trial)

    # Check that status was updated and email wasn't sent by default
    db_record = UploadJobs.find_by_id(assay_upload.id)
    assert db_record.status == UploadJobStatus.MERGE_COMPLETED.value
    assert "Would send email with subject '[UPLOAD SUCCESS]" not in caplog.text

    # Check that email gets sent when specified
    assay_upload.ingestion_success(trial, send_email=True)
    assert "Would send email with subject '[UPLOAD SUCCESS]" in caplog.text
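

# The test above implies a status guard inside UploadJobs.ingestion_success: success
# can only be declared from UPLOAD_COMPLETED, after which the job moves to
# MERGE_COMPLETED. The standalone sketch below illustrates that inferred transition
# rule; it is not the real model method.
def _ingestion_success_guard_sketch(current_status: str) -> str:
    """Return the next status if ingestion success may be declared, else raise."""
    if current_status != UploadJobStatus.UPLOAD_COMPLETED.value:
        raise Exception(
            f"Cannot declare ingestion success from current status {current_status!r}"
        )
    return UploadJobStatus.MERGE_COMPLETED.value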


def test_requires_upload_token_auth(cidc_api, clean_db, monkeypatch):
    """Check that the requires_upload_token_auth decorator works as expected"""
    user_id = setup_trial_and_user(cidc_api, monkeypatch)
    job_id = setup_upload_jobs(cidc_api)[0]
    with cidc_api.app_context():
        job = UploadJobs.find_by_id(job_id)

    test_route = "/foobarfoo"

    @requires_upload_token_auth
    def endpoint(*args, **kwargs):
        assert "upload_job" in kwargs
        return "ok", 200

    query_route = f"{test_route}/{job_id}"
    nonexistent_job_id = "9999999"

    # User must provide `token` query param
    with cidc_api.test_request_context(query_route):
        with pytest.raises(UnprocessableEntity) as e:
            endpoint(upload_job=job_id)
        assert e._excinfo[1].data["messages"]["query"]["token"] == [
            "Missing data for required field."
        ]

    # User must provide correct `token` query param
    with cidc_api.test_request_context(f"{query_route}?token={'bad token'}"):
        with pytest.raises(
            Unauthorized, match="upload_job token authentication failed"
        ):
            endpoint(upload_job=job_id)

    with cidc_api.test_request_context(f"{query_route}?token={job.token}"):
        assert endpoint(upload_job=job_id) == ("ok", 200)

    # User whose id token authentication succeeds gets a 404 if the relevant job doesn't exist
    with cidc_api.test_request_context(
        f"{test_route}/{nonexistent_job_id}?token={job.token}"
    ):
        with pytest.raises(NotFound):
            endpoint(upload_job=nonexistent_job_id)

    monkeypatch.setattr(
        "cidc_api.resources.upload_jobs.authenticate_and_get_user",
        lambda *args, **kwargs: None,
    )

    # User whose id token authentication fails can still successfully authenticate
    # using an upload token.
    with cidc_api.test_request_context(f"{query_route}?token={job.token}"):
        assert endpoint(upload_job=job_id) == ("ok", 200)

    # User whose id token authentication fails gets a 401 if the relevant job doesn't exist
    with cidc_api.test_request_context(
        f"{test_route}/{nonexistent_job_id}?token={job.token}"
    ):
        with pytest.raises(
            Unauthorized, match="upload_job token authentication failed"
        ):
            endpoint(upload_job=nonexistent_job_id)
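

# Sketch of the decorator behavior the test above exercises, inferred from its
# assertions; the real requires_upload_token_auth lives in
# cidc_api.resources.upload_jobs and is not reproduced here. Error payload details
# (e.g. the marshmallow-style "messages" dict asserted above) are omitted for brevity.
from functools import wraps

from flask import request
from werkzeug.exceptions import NotFound, Unauthorized, UnprocessableEntity

from cidc_api.resources.upload_jobs import authenticate_and_get_user


def requires_upload_token_auth_sketch(endpoint):
    @wraps(endpoint)
    def wrapped(*args, **kwargs):
        token = request.args.get("token")
        if token is None:
            # Missing `token` query param -> 422
            raise UnprocessableEntity("token: Missing data for required field.")
        user = authenticate_and_get_user()  # assumed to return None when id token auth fails
        job = UploadJobs.find_by_id(kwargs["upload_job"])
        if job is None:
            # 404 for id-token-authenticated users, 401 otherwise
            if user:
                raise NotFound()
            raise Unauthorized("upload_job token authentication failed")
        if job.token != token:
            raise Unauthorized("upload_job token authentication failed")
        # Assumed convention: replace the raw id with the loaded record
        kwargs["upload_job"] = job
        return endpoint(*args, **kwargs)

    return wrapped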


def test_create_upload_job(db):
    """Try to create an upload job"""
    new_user = Users.create(PROFILE)

    gcs_file_uris = ["my/first/wes/blob1", "my/first/wes/blob2"]
    metadata_json_patch = {"foo": "bar"}

    # Create a fresh upload job
    new_job = UploadJobs.create(
        "dummy_assay", EMAIL, gcs_file_uris, metadata_json_patch
    )
    job = UploadJobs.find_by_id(new_job.id)
    assert_same_elements(new_job.gcs_file_uris, job.gcs_file_uris)
    assert job.status == "started"


def derive_files_from_manifest_upload(event: dict, context: BackgroundContext):
    """
    Generate derivative files from a manifest upload.
    """
    upload_id = extract_pubsub_data(event)

    with sqlalchemy_session() as session:
        upload_record: UploadJobs = UploadJobs.find_by_id(upload_id, session=session)
        if not upload_record:
            raise Exception(f"No manifest upload record found with id {upload_id}.")

        print(f"Received completed manifest upload {upload_id} for postprocessing.")

        # Run the file derivation
        _derive_files_from_upload(
            upload_record.trial_id, upload_record.upload_type, session
        )
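

# For local testing, an upstream publisher might trigger the derivation handlers above
# roughly as sketched below. The project and topic names are placeholders; only the
# string-encoded upload id in the message body is implied by the handlers' use of
# extract_pubsub_data.
from google.cloud import pubsub_v1


def publish_upload_id_sketch(project: str, topic: str, upload_id: int) -> str:
    """Publish an upload id to a Pub/Sub topic that triggers a derivation handler."""
    publisher = pubsub_v1.PublisherClient()
    topic_path = publisher.topic_path(project, topic)
    # Pub/Sub message bodies must be bytes; the handlers decode them back to an id.
    future = publisher.publish(topic_path, data=str(upload_id).encode("utf-8"))
    return future.result()  # blocks until the service acknowledges the message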


def test_update_upload_job(cidc_api, clean_db, monkeypatch):
    """Check that updating an upload job by ID works as expected."""
    user_id = setup_trial_and_user(cidc_api, monkeypatch)
    user_job, other_job = setup_upload_jobs(cidc_api)
    with cidc_api.app_context():
        user_job_record = UploadJobs.find_by_id(user_job)
        other_job_record = UploadJobs.find_by_id(other_job)

    publish_success = MagicMock()
    monkeypatch.setattr(
        "cidc_api.shared.gcloud_client.publish_upload_success", publish_success
    )
    revoke_upload_access = MagicMock()
    monkeypatch.setattr(
        "cidc_api.shared.gcloud_client.revoke_upload_access", revoke_upload_access
    )

    client = cidc_api.test_client()

    # Possible patches
    upload_success = {"status": UploadJobStatus.UPLOAD_COMPLETED.value}
    upload_failure = {"status": UploadJobStatus.UPLOAD_FAILED.value}
    invalid_update = {"status": UploadJobStatus.MERGE_COMPLETED.value}

    # A user gets an error if they fail to provide an upload token
    res = client.patch(f"/upload_jobs/{other_job}", json=upload_success)
    assert res.status_code == 422
    publish_success.assert_not_called()
    revoke_upload_access.assert_not_called()

    # A user gets an authentication error if they provide an incorrect upload token
    res = client.patch(
        f"/upload_jobs/{other_job}?token=nope",
        headers={"if-match": other_job_record._etag},
        json=upload_success,
    )
    assert res.status_code == 401
    assert res.json["_error"]["message"] == "upload_job token authentication failed"
    publish_success.assert_not_called()
    revoke_upload_access.assert_not_called()

    # A user gets an error if they try to update something besides the job's status
    res = client.patch(
        f"/upload_jobs/{other_job}?token={other_job_record.token}",
        headers={"if-match": other_job_record._etag},
        json={"uploader_email": "*****@*****.**", "status": ""},
    )
    assert res.status_code == 422
    assert res.json["_error"]["message"]["uploader_email"][0] == "Unknown field."

    # A user providing a correct token can update their job's status to be a failure
    res = client.patch(
        f"/upload_jobs/{other_job}?token={other_job_record.token}",
        headers={"if-match": other_job_record._etag},
        json={"gcs_file_map": {"foo": "bar"}, **upload_failure},
    )
    assert res.status_code == 200
    publish_success.assert_not_called()
    revoke_upload_access.assert_called_once()
    revoke_upload_access.reset_mock()

    with cidc_api.app_context():
        modified_job = UploadJobs.find_by_id(other_job)
        assert modified_job.metadata_patch == {"array": [{"test2": "foo"}]}

        user_job_record._set_status_no_validation(UploadJobStatus.STARTED.value)
        user_job_record.update()

    # A user can update a job to be a success.
    # This also allows for updating the gcs_file_map and thereby the metadata_patch.
    res = client.patch(
        f"/upload_jobs/{user_job}?token={user_job_record.token}",
        headers={"if-match": user_job_record._etag},
        json={"gcs_file_map": {"foo": "bar"}, **upload_success},
    )
    assert res.status_code == 200
    publish_success.assert_called_once_with(user_job)
    revoke_upload_access.assert_called_once()

    with cidc_api.app_context():
        modified_job = UploadJobs.find_by_id(user_job)
        assert modified_job.gcs_file_map == {"foo": "bar"}
        assert modified_job.metadata_patch == {"test2": "foo"}

    publish_success.reset_mock()
    revoke_upload_access.reset_mock()

    with cidc_api.app_context():
        user_job_record._set_status_no_validation(UploadJobStatus.STARTED.value)
        user_job_record.update()

    # Users can't make an illegal state transition
    res = client.patch(
        f"/upload_jobs/{user_job}?token={user_job_record.token}",
        headers={"if-match": user_job_record._etag},
        json=invalid_update,
    )
    assert res.status_code == 400
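

# Hedged usage sketch of the PATCH endpoint exercised above, as an external upload
# client might call it. The request shape (token query param, If-Match etag header,
# JSON status body) mirrors the test; api_url and all argument values are placeholders
# supplied by the caller, and this is not part of the project's client library.
import requests


def patch_upload_job_status_sketch(
    api_url: str, job_id: int, token: str, etag: str, status: str
) -> dict:
    """Report an upload job's new status using its upload token and current etag."""
    res = requests.patch(
        f"{api_url}/upload_jobs/{job_id}",
        params={"token": token},
        headers={"If-Match": etag},
        json={"status": status},
    )
    res.raise_for_status()
    return res.json()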


def ingest_upload(event: dict, context: BackgroundContext):
    """
    When a successful upload event is published, move the data associated
    with the upload job into the download bucket and merge the upload metadata
    into the appropriate clinical trial JSON.
    """
    storage_client = storage.Client()

    job_id = int(extract_pubsub_data(event))

    logger.info(f"ingest_upload execution started on upload job id {job_id}")

    with sqlalchemy_session() as session:
        job: UploadJobs = UploadJobs.find_by_id(job_id, session=session)

        # Check ingestion pre-conditions
        if not job:
            raise Exception(f"No assay upload job with id {job_id} found.")
        if UploadJobStatus(job.status) != UploadJobStatus.UPLOAD_COMPLETED:
            raise Exception(
                f"Received ID for job with status {job.status}. Aborting ingestion."
            )
        trial_id = job.metadata_patch.get(prism.PROTOCOL_ID_FIELD_NAME)
        if not trial_id:
            # We should never hit this, since metadata should be pre-validated.
            with saved_failure_status(job, session):
                raise Exception(
                    f"Invalid assay metadata: missing protocol identifier ({prism.PROTOCOL_ID_FIELD_NAME})."
                )

        logger.info(
            f"Found completed upload job (job_id={job_id}) with uploader {job.uploader_email}"
        )

        url_bundles = [
            URLBundle(*bundle)
            for bundle in job.upload_uris_with_data_uris_with_uuids()
        ]

        # Copy GCS blobs in parallel
        logger.info("Copying artifacts from upload bucket to data bucket.")
        with ThreadPoolExecutor(THREADPOOL_THREADS) as executor, saved_failure_status(
            job, session
        ):
            destination_objects = executor.map(
                lambda url_bundle: _gcs_copy(
                    storage_client,
                    GOOGLE_UPLOAD_BUCKET,
                    url_bundle.upload_url,
                    GOOGLE_DATA_BUCKET,
                    url_bundle.target_url,
                ),
                url_bundles,
            )

        metadata_patch = job.metadata_patch
        logger.info("Adding artifact metadata to metadata patch.")
        metadata_patch, downloadable_files = TrialMetadata.merge_gcs_artifacts(
            metadata_patch,
            job.upload_type,
            zip([ub.artifact_uuid for ub in url_bundles], destination_objects),
        )

        # Add metadata for this upload to the database
        logger.info(
            "Merging metadata from upload %d into trial %s: " % (job.id, trial_id),
            metadata_patch,
        )
        with saved_failure_status(job, session):
            trial = TrialMetadata.patch_assays(
                trial_id, metadata_patch, session=session
            )

        # Save downloadable files to the database
        # NOTE: this needs to happen after TrialMetadata.patch_assays
        # in order to avoid violating a foreign-key constraint on the trial_id
        # in the event that this is the first upload for a trial.
logger.info("Saving artifact records to the downloadable_files table.") for artifact_metadata, additional_metadata in downloadable_files: logger.debug( f"Saving metadata to downloadable_files table: {artifact_metadata}" ) DownloadableFiles.create_from_metadata( trial_id, job.upload_type, artifact_metadata, additional_metadata=additional_metadata, session=session, commit=False, ) # Additionally, make the metadata xlsx a downloadable file with saved_failure_status(job, session): _, xlsx_blob = _get_bucket_and_blob( storage_client, GOOGLE_DATA_BUCKET, job.gcs_xlsx_uri ) full_uri = f"gs://{GOOGLE_DATA_BUCKET}/{xlsx_blob.name}" data_format = "Assay Metadata" facet_group = f"{job.upload_type}|{data_format}" logger.info(f"Saving {full_uri} as a downloadable_file.") DownloadableFiles.create_from_blob( trial_id, job.upload_type, data_format, facet_group, xlsx_blob, session=session, ) # Update the job metadata to include artifacts job.metadata_patch = metadata_patch # Making files downloadable by a specified biofx analysis team group assay_prefix = job.upload_type.split("_")[0] # 'wes_bam' -> 'wes' if assay_prefix in GOOGLE_ANALYSIS_PERMISSIONS_GROUPS_DICT: analysis_group_email = GOOGLE_ANALYSIS_PERMISSIONS_GROUPS_DICT[assay_prefix] _gcs_add_prefix_reader_permission( storage_client, analysis_group_email, # to whom give access to f"{trial_id}/{assay_prefix}", # to what sub-folder ) # Save the upload success and trigger email alert if transaction succeeds job.ingestion_success(trial, session=session, send_email=True, commit=True) # Trigger post-processing on uploaded data files logger.info(f"Publishing object URLs to 'artifact_upload' topic") with ThreadPoolExecutor(THREADPOOL_THREADS) as executor: executor.map( lambda url_bundle: publish_artifact_upload(url_bundle.target_url), url_bundles, ) # Trigger post-processing on entire upload report = _encode_and_publish(str(job.id), GOOGLE_ASSAY_OR_ANALYSIS_UPLOAD_TOPIC) if report: report.result() # Google won't actually do anything with this response; it's # provided for testing purposes only. return jsonify( dict((bundle.upload_url, bundle.target_url) for bundle in url_bundles) )