def _derive_files_from_upload(trial_id: str, upload_type: str, session):
    # Get trial metadata JSON for the associated trial
    trial_record: TrialMetadata = TrialMetadata.find_by_trial_id(
        trial_id, session=session
    )

    # Run the file derivation
    derivation_context = unprism.DeriveFilesContext(
        trial_record.metadata_json, upload_type, fetch_artifact
    )
    derivation_result = unprism.derive_files(derivation_context)

    # TODO: consider parallelizing this step if necessary
    for artifact in derivation_result.artifacts:
        # Save to GCS
        blob = upload_to_data_bucket(artifact.object_url, artifact.data)

        # Build basic facet group
        facet_group = f"{artifact.data_format}|{artifact.file_type}"

        # Save to database
        df_record = DownloadableFiles.create_from_blob(
            trial_record.trial_id,
            artifact.file_type,
            artifact.data_format,
            facet_group,
            blob,
            session=session,
            alert_artifact_upload=True,
        )
        df_record.additional_metadata = artifact.metadata
        # Assume that a derived file will be directly useful for data analysis
        df_record.analysis_friendly = True

    # Update the trial metadata blob (in case the file derivation modified it)
    trial_record.metadata_json = derivation_result.trial_metadata

    session.commit()
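
# The derivation context above is handed a `fetch_artifact` callback that isn't
# defined in this snippet. The sketch below is an assumption inferred from how
# it's used (unprism needs to read existing artifacts during derivation), not
# the project's actual implementation: it guesses that the callback takes a
# bucket-relative object URL and returns the blob's raw bytes from the data
# bucket, or None if the blob doesn't exist.
from typing import Optional

from google.cloud import storage


def fetch_artifact(object_url: str) -> Optional[bytes]:
    """Hypothetical sketch of the artifact-fetching callback passed to unprism."""
    client = storage.Client()
    # GOOGLE_DATA_BUCKET is assumed to be the same data-bucket constant used by
    # the ingestion code elsewhere in this repo.
    blob = client.bucket(GOOGLE_DATA_BUCKET).get_blob(object_url)
    return blob.download_as_bytes() if blob else None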
def test_create_downloadable_file_from_blob(clean_db, monkeypatch):
    """Try to create a downloadable file from a GCS blob"""
    fake_blob = MagicMock()
    fake_blob.name = "name"
    fake_blob.md5_hash = "12345"
    fake_blob.crc32c = "54321"
    fake_blob.size = 5
    fake_blob.time_created = datetime.now()

    clean_db.add(
        TrialMetadata(
            trial_id="id",
            metadata_json={
                "protocol_identifier": "id",
                "allowed_collection_event_names": [],
                "allowed_cohort_names": [],
                "participants": [],
            },
        )
    )
    df = DownloadableFiles.create_from_blob(
        "id", "pbmc", "Shipping Manifest", "pbmc/shipping", fake_blob
    )

    # Mock artifact upload publishing
    publisher = MagicMock()
    monkeypatch.setattr("cidc_api.models.models.publish_artifact_upload", publisher)

    # Check that the file was created
    assert 1 == clean_db.query(DownloadableFiles).count()
    df_lookup = DownloadableFiles.find_by_id(df.id)
    assert df_lookup.object_url == fake_blob.name
    assert df_lookup.data_format == "Shipping Manifest"
    assert df_lookup.file_size_bytes == fake_blob.size
    assert df_lookup.md5_hash == fake_blob.md5_hash
    assert df_lookup.crc32c_hash == fake_blob.crc32c

    # Upload a second time to check that the entry is updated, not duplicated
    fake_blob.size = 6
    fake_blob.md5_hash = "6"
    df = DownloadableFiles.create_from_blob(
        "id", "pbmc", "Shipping Manifest", "pbmc/shipping", fake_blob
    )

    # Check that the existing record was updated rather than duplicated
    assert 1 == clean_db.query(DownloadableFiles).count()
    df_lookup = DownloadableFiles.find_by_id(df.id)
    assert df_lookup.file_size_bytes == 6
    assert df_lookup.md5_hash == "6"

    # Check that no artifact upload event was published
    publisher.assert_not_called()

    # Check that artifact upload publishes
    DownloadableFiles.create_from_blob(
        "id",
        "pbmc",
        "Shipping Manifest",
        "pbmc/shipping",
        fake_blob,
        alert_artifact_upload=True,
    )
    publisher.assert_called_once_with(fake_blob.name)
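
# The test above depends on a `clean_db` pytest fixture that isn't shown here.
# A minimal sketch of one way such a fixture could be provided, assuming a
# SQLAlchemy session fixture (`db_session`) and the models' declarative `Base`
# are importable -- the project's real fixture may differ:
import pytest


@pytest.fixture
def clean_db(db_session):
    # Hypothetical: empty every table so each test starts from a clean slate.
    for table in reversed(Base.metadata.sorted_tables):
        db_session.execute(table.delete())
    db_session.commit()
    return db_session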
def ingest_upload(event: dict, context: BackgroundContext):
    """
    When a successful upload event is published, move the data associated with
    the upload job into the download bucket and merge the upload metadata into
    the appropriate clinical trial JSON.
    """
    storage_client = storage.Client()

    job_id = int(extract_pubsub_data(event))
    logger.info(f"ingest_upload execution started on upload job id {job_id}")

    with sqlalchemy_session() as session:
        job: UploadJobs = UploadJobs.find_by_id(job_id, session=session)

        # Check ingestion pre-conditions
        if not job:
            raise Exception(f"No assay upload job with id {job_id} found.")
        if UploadJobStatus(job.status) != UploadJobStatus.UPLOAD_COMPLETED:
            raise Exception(
                f"Received ID for job with status {job.status}. Aborting ingestion."
            )
        trial_id = job.metadata_patch.get(prism.PROTOCOL_ID_FIELD_NAME)
        if not trial_id:
            # We should never hit this, since metadata should be pre-validated.
            with saved_failure_status(job, session):
                raise Exception(
                    f"Invalid assay metadata: missing protocol identifier ({prism.PROTOCOL_ID_FIELD_NAME})."
                )

        logger.info(
            f"Found completed upload job (job_id={job_id}) with uploader {job.uploader_email}"
        )

        url_bundles = [
            URLBundle(*bundle) for bundle in job.upload_uris_with_data_uris_with_uuids()
        ]

        # Copy GCS blobs in parallel
        logger.info("Copying artifacts from upload bucket to data bucket.")
        with ThreadPoolExecutor(THREADPOOL_THREADS) as executor, saved_failure_status(
            job, session
        ):
            destination_objects = executor.map(
                lambda url_bundle: _gcs_copy(
                    storage_client,
                    GOOGLE_UPLOAD_BUCKET,
                    url_bundle.upload_url,
                    GOOGLE_DATA_BUCKET,
                    url_bundle.target_url,
                ),
                url_bundles,
            )

        metadata_patch = job.metadata_patch
        logger.info("Adding artifact metadata to metadata patch.")
        metadata_patch, downloadable_files = TrialMetadata.merge_gcs_artifacts(
            metadata_patch,
            job.upload_type,
            zip([ub.artifact_uuid for ub in url_bundles], destination_objects),
        )

        # Add metadata for this upload to the database
        logger.info(
            "Merging metadata from upload %d into trial %s: " % (job.id, trial_id),
            metadata_patch,
        )
        with saved_failure_status(job, session):
            trial = TrialMetadata.patch_assays(
                trial_id, metadata_patch, session=session
            )

        # Save downloadable files to the database
        # NOTE: this needs to happen after TrialMetadata.patch_assays
        # in order to avoid violating a foreign-key constraint on the trial_id
        # in the event that this is the first upload for a trial.
        logger.info("Saving artifact records to the downloadable_files table.")
        for artifact_metadata, additional_metadata in downloadable_files:
            logger.debug(
                f"Saving metadata to downloadable_files table: {artifact_metadata}"
            )
            DownloadableFiles.create_from_metadata(
                trial_id,
                job.upload_type,
                artifact_metadata,
                additional_metadata=additional_metadata,
                session=session,
                commit=False,
            )

        # Additionally, make the metadata xlsx a downloadable file
        with saved_failure_status(job, session):
            _, xlsx_blob = _get_bucket_and_blob(
                storage_client, GOOGLE_DATA_BUCKET, job.gcs_xlsx_uri
            )
            full_uri = f"gs://{GOOGLE_DATA_BUCKET}/{xlsx_blob.name}"
            data_format = "Assay Metadata"
            facet_group = f"{job.upload_type}|{data_format}"
            logger.info(f"Saving {full_uri} as a downloadable_file.")
            DownloadableFiles.create_from_blob(
                trial_id,
                job.upload_type,
                data_format,
                facet_group,
                xlsx_blob,
                session=session,
            )

        # Update the job metadata to include artifacts
        job.metadata_patch = metadata_patch

        # Make files downloadable by the specified biofx analysis team group
        assay_prefix = job.upload_type.split("_")[0]  # 'wes_bam' -> 'wes'
        if assay_prefix in GOOGLE_ANALYSIS_PERMISSIONS_GROUPS_DICT:
            analysis_group_email = GOOGLE_ANALYSIS_PERMISSIONS_GROUPS_DICT[assay_prefix]
            _gcs_add_prefix_reader_permission(
                storage_client,
                analysis_group_email,  # the group to grant access to
                f"{trial_id}/{assay_prefix}",  # the sub-folder to grant access to
            )

        # Save the upload success and trigger email alert if transaction succeeds
        job.ingestion_success(trial, session=session, send_email=True, commit=True)

        # Trigger post-processing on uploaded data files
        logger.info("Publishing object URLs to 'artifact_upload' topic")
        with ThreadPoolExecutor(THREADPOOL_THREADS) as executor:
            executor.map(
                lambda url_bundle: publish_artifact_upload(url_bundle.target_url),
                url_bundles,
            )

        # Trigger post-processing on entire upload
        report = _encode_and_publish(str(job.id), GOOGLE_ASSAY_OR_ANALYSIS_UPLOAD_TOPIC)
        if report:
            report.result()

    # Google won't actually do anything with this response; it's
    # provided for testing purposes only.
    return jsonify(
        dict((bundle.upload_url, bundle.target_url) for bundle in url_bundles)
    )