Example #1
def derive_files_from_assay_or_analysis_upload(event: dict,
                                               context: BackgroundContext):
    """
    Generate derivative files from an assay or analysis upload.
    """
    upload_id = extract_pubsub_data(event)

    with sqlalchemy_session() as session:
        upload_record: UploadJobs = UploadJobs.find_by_id(upload_id,
                                                          session=session)

        if not upload_record:
            raise Exception(f"No upload record with id {upload_id} found.")

        if UploadJobStatus(
                upload_record.status) != UploadJobStatus.MERGE_COMPLETED:
            raise Exception(
                f"Cannot perform postprocessing on upload {upload_id}: status is {upload_record.status}"
            )

        print(
            f"Received completed assay/analysis upload {upload_id} for postprocessing."
        )

        # Run the file derivation
        _derive_files_from_upload(upload_record.trial_id,
                                  upload_record.upload_type, session)
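
# `extract_pubsub_data` is used throughout these examples but not shown. A minimal
# sketch of what it likely does, assuming the standard Pub/Sub background-function
# event shape ({"data": <base64-encoded message body>}); the name below is
# hypothetical and the decoding details are an assumption:
import base64


def extract_pubsub_data_sketch(event: dict) -> str:
    """Hypothetical helper: decode the base64-encoded `data` field of a Pub/Sub event."""
    # Pub/Sub delivers the published message body base64-encoded under "data".
    return base64.b64decode(event["data"]).decode("utf-8")
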
Example #2
def test_assay_upload_ingestion_success(clean_db, monkeypatch, caplog):
    """Check that the ingestion success method works as expected"""
    caplog.set_level(logging.DEBUG)

    new_user = Users.create(PROFILE)
    trial = TrialMetadata.create(TRIAL_ID, METADATA)
    assay_upload = UploadJobs.create(
        upload_type="ihc",
        uploader_email=EMAIL,
        gcs_file_map={},
        metadata={PROTOCOL_ID_FIELD_NAME: TRIAL_ID},
        gcs_xlsx_uri="",
        commit=False,
    )

    clean_db.commit()

    # Ensure that success can't be declared from a starting state
    with pytest.raises(Exception, match="current status"):
        assay_upload.ingestion_success(trial)

    # Update assay_upload status to simulate a completed but not ingested upload
    assay_upload.status = UploadJobStatus.UPLOAD_COMPLETED.value
    assay_upload.ingestion_success(trial)

    # Check that status was updated and email wasn't sent by default
    db_record = UploadJobs.find_by_id(assay_upload.id)
    assert db_record.status == UploadJobStatus.MERGE_COMPLETED.value
    assert "Would send email with subject '[UPLOAD SUCCESS]" not in caplog.text

    # Check that email gets sent when specified
    assay_upload.ingestion_success(trial, send_email=True)
    assert "Would send email with subject '[UPLOAD SUCCESS]" in caplog.text
def test_requires_upload_token_auth(cidc_api, clean_db, monkeypatch):
    """Check that the requires_upload_token_auth decorator works as expected"""
    user_id = setup_trial_and_user(cidc_api, monkeypatch)
    job_id = setup_upload_jobs(cidc_api)[0]
    with cidc_api.app_context():
        job = UploadJobs.find_by_id(job_id)

    test_route = "/foobarfoo"

    @requires_upload_token_auth
    def endpoint(*args, **kwargs):
        assert "upload_job" in kwargs
        return "ok", 200

    query_route = f"{test_route}/{job_id}"
    nonexistent_job_id = "9999999"

    # User must provide `token` query param
    with cidc_api.test_request_context(query_route):
        with pytest.raises(UnprocessableEntity) as e:
            endpoint(upload_job=job_id)
        assert e._excinfo[1].data["messages"]["query"]["token"] == [
            "Missing data for required field."
        ]

    # User must provide correct `token` query param
    with cidc_api.test_request_context(f"{query_route}?token={'bad token'}"):
        with pytest.raises(Unauthorized,
                           match="upload_job token authentication failed"):
            endpoint(upload_job=job_id)

    with cidc_api.test_request_context(f"{query_route}?token={job.token}"):
        assert endpoint(upload_job=job_id) == ("ok", 200)

    # User whose id token authentication succeeds gets a 404 if the relevant job doesn't exist
    with cidc_api.test_request_context(
            f"{test_route}/{nonexistent_job_id}?token={job.token}"):
        with pytest.raises(NotFound):
            endpoint(upload_job=nonexistent_job_id)

    monkeypatch.setattr(
        "cidc_api.resources.upload_jobs.authenticate_and_get_user",
        lambda *args, **kwargs: None,
    )

    # User whose id token authentication fails can still successfully authenticate
    # using an upload token.
    with cidc_api.test_request_context(f"{query_route}?token={job.token}"):
        assert endpoint(upload_job=job_id) == ("ok", 200)

    # User whose id token authentication fails gets a 401 if the relevant job doesn't exist
    with cidc_api.test_request_context(
            f"{test_route}/{nonexistent_job_id}?token={job.token}"):
        with pytest.raises(Unauthorized,
                           match="upload_job token authentication failed"):
            endpoint(upload_job=nonexistent_job_id)
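
# The `requires_upload_token_auth` decorator under test is not shown. A condensed
# sketch of the behavior the assertions above imply; the lookup and authentication
# helpers are the ones referenced in these tests, and the exact error payloads are
# assumptions:
from functools import wraps

from flask import request
from werkzeug.exceptions import NotFound, Unauthorized, UnprocessableEntity


def requires_upload_token_auth_sketch(endpoint):
    """Hypothetical decorator: replace the upload_job id kwarg with an authenticated record."""

    @wraps(endpoint)
    def wrapped(*args, **kwargs):
        token = request.args.get("token")
        if token is None:
            # The test expects a 422 when the `token` query param is missing.
            raise UnprocessableEntity("Missing data for required field.")

        job = UploadJobs.find_by_id(kwargs["upload_job"])
        user = authenticate_and_get_user()  # id-token auth, monkeypatched in the test

        if job is None:
            # 404 for users whose id token checks out, 401 otherwise.
            if user is not None:
                raise NotFound()
            raise Unauthorized("upload_job token authentication failed")
        if token != job.token:
            raise Unauthorized("upload_job token authentication failed")

        kwargs["upload_job"] = job
        return endpoint(*args, **kwargs)

    return wrapped
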
Example #4
def test_create_upload_job(db):
    """Try to create an upload job"""
    new_user = Users.create(PROFILE)

    gcs_file_uris = ["my/first/wes/blob1", "my/first/wes/blob2"]
    metadata_json_patch = {"foo": "bar"}

    # Create a fresh upload job
    new_job = UploadJobs.create("dummy_assay", EMAIL, gcs_file_uris,
                                metadata_json_patch)
    job = UploadJobs.find_by_id(new_job.id)
    assert_same_elements(new_job.gcs_file_uris, job.gcs_file_uris)
    assert job.status == "started"
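
# `assert_same_elements` is a small test helper not shown here; it evidently compares
# two collections while ignoring order. A minimal sketch of that behavior (the body
# below is an assumption):
from collections import Counter


def assert_same_elements_sketch(actual, expected):
    """Hypothetical helper: assert two iterables hold the same elements, order-insensitive."""
    assert Counter(actual) == Counter(expected), f"{actual!r} != {expected!r} (ignoring order)"
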
Example #5
def derive_files_from_manifest_upload(event: dict, context: BackgroundContext):
    """
    Generate derivative files from a manifest upload.
    """
    upload_id = extract_pubsub_data(event)

    with sqlalchemy_session() as session:
        upload_record: UploadJobs = UploadJobs.find_by_id(upload_id,
                                                          session=session)
        if not upload_record:
            raise Exception(
                f"No manifest upload record found with id {upload_id}.")

        print(
            f"Received completed manifest upload {upload_id} for postprocessing."
        )

        # Run the file derivation
        _derive_files_from_upload(upload_record.trial_id,
                                  upload_record.upload_type, session)
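
# Both derivation functions (and `ingest_upload` further below) do their database work
# inside `with sqlalchemy_session() as session`. The helper itself is not shown; a
# typical shape for it, assuming a module-level SQLAlchemy sessionmaker, is sketched
# below. The commit/rollback policy is an assumption:
from contextlib import contextmanager

from sqlalchemy.orm import sessionmaker

session_factory = sessionmaker()  # the real code would bind this to an engine


@contextmanager
def sqlalchemy_session_sketch():
    """Hypothetical context manager: yield a session, commit on success, roll back on error."""
    session = session_factory()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
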
def test_update_upload_job(cidc_api, clean_db, monkeypatch):
    """Check that getting a updating an upload job by ID works as expected."""
    user_id = setup_trial_and_user(cidc_api, monkeypatch)
    user_job, other_job = setup_upload_jobs(cidc_api)
    with cidc_api.app_context():
        user_job_record = UploadJobs.find_by_id(user_job)
        other_job_record = UploadJobs.find_by_id(other_job)

    publish_success = MagicMock()
    monkeypatch.setattr("cidc_api.shared.gcloud_client.publish_upload_success",
                        publish_success)
    revoke_upload_access = MagicMock()
    monkeypatch.setattr("cidc_api.shared.gcloud_client.revoke_upload_access",
                        revoke_upload_access)

    client = cidc_api.test_client()

    # Possible patches
    upload_success = {"status": UploadJobStatus.UPLOAD_COMPLETED.value}
    upload_failure = {"status": UploadJobStatus.UPLOAD_FAILED.value}
    invalid_update = {"status": UploadJobStatus.MERGE_COMPLETED.value}

    # A user gets an error if they fail to provide an upload token
    res = client.patch(f"/upload_jobs/{other_job}", json=upload_success)
    assert res.status_code == 422
    publish_success.assert_not_called()
    revoke_upload_access.assert_not_called()

    # A user gets an authentication error if they provide an incorrect upload token
    res = client.patch(
        f"/upload_jobs/{other_job}?token=nope",
        headers={"if-match": other_job_record._etag},
        json=upload_success,
    )
    assert res.status_code == 401
    assert res.json["_error"][
        "message"] == "upload_job token authentication failed"
    publish_success.assert_not_called()
    revoke_upload_access.assert_not_called()

    # A user gets an error if they try to update something besides the job's status
    res = client.patch(
        f"/upload_jobs/{other_job}?token={other_job_record.token}",
        headers={"if-match": other_job_record._etag},
        json={
            "uploader_email": "*****@*****.**",
            "status": ""
        },
    )
    assert res.status_code == 422
    assert res.json["_error"]["message"]["uploader_email"][
        0] == "Unknown field."

    # A user providing a correct token can update their job's status to be a failure
    res = client.patch(
        f"/upload_jobs/{other_job}?token={other_job_record.token}",
        headers={"if-match": other_job_record._etag},
        json={
            "gcs_file_map": {
                "foo": "bar"
            },
            **upload_failure
        },
    )
    assert res.status_code == 200
    publish_success.assert_not_called()
    revoke_upload_access.assert_called_once()
    revoke_upload_access.reset_mock()

    with cidc_api.app_context():
        modified_job = UploadJobs.find_by_id(other_job)
        assert modified_job.metadata_patch == {"array": [{"test2": "foo"}]}
        user_job_record._set_status_no_validation(
            UploadJobStatus.STARTED.value)
        user_job_record.update()

    # A user can update a job to be a success
    # Also allows for updating the gcs_file_map and thereby the metadata_patch
    res = client.patch(
        f"/upload_jobs/{user_job}?token={user_job_record.token}",
        headers={"if-match": user_job_record._etag},
        json={
            "gcs_file_map": {
                "foo": "bar"
            },
            **upload_success
        },
    )
    assert res.status_code == 200
    publish_success.assert_called_once_with(user_job)
    revoke_upload_access.assert_called_once()
    with cidc_api.app_context():
        modified_job = UploadJobs.find_by_id(user_job)
        assert modified_job.gcs_file_map == {"foo": "bar"}
        assert modified_job.metadata_patch == {"test2": "foo"}

    publish_success.reset_mock()
    revoke_upload_access.reset_mock()

    with cidc_api.app_context():
        user_job_record._set_status_no_validation(
            UploadJobStatus.STARTED.value)
        user_job_record.update()

    # Users can't make an illegal state transition
    res = client.patch(
        f"/upload_jobs/{user_job}?token={user_job_record.token}",
        headers={"if-match": user_job_record._etag},
        json=invalid_update,
    )
    assert res.status_code == 400
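
# The final 400 above comes from the API refusing an illegal status transition
# (STARTED -> MERGE_COMPLETED). A small sketch of the kind of transition table such a
# check implies; the string values and the exact allowed set are assumptions based only
# on the statuses exercised in these tests:
_ALLOWED_TRANSITIONS_SKETCH = {
    "started": {"upload-completed", "upload-failed"},
    "upload-completed": {"merge-completed", "merge-failed"},
}


def is_valid_transition_sketch(current: str, target: str) -> bool:
    """Hypothetical check: may an upload job move from `current` to `target`?"""
    return target in _ALLOWED_TRANSITIONS_SKETCH.get(current, set())
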
def ingest_upload(event: dict, context: BackgroundContext):
    """
    When a successful upload event is published, move the data associated
    with the upload job into the download bucket and merge the upload metadata
    into the appropriate clinical trial JSON.
    """
    storage_client = storage.Client()

    job_id = int(extract_pubsub_data(event))

    logger.info(f"ingest_upload execution started on upload job id {job_id}")

    with sqlalchemy_session() as session:
        job: UploadJobs = UploadJobs.find_by_id(job_id, session=session)

        # Check ingestion pre-conditions
        if not job:
            raise Exception(f"No assay upload job with id {job_id} found.")
        if UploadJobStatus(job.status) != UploadJobStatus.UPLOAD_COMPLETED:
            raise Exception(
                f"Received ID for job with status {job.status}. Aborting ingestion."
            )
        trial_id = job.metadata_patch.get(prism.PROTOCOL_ID_FIELD_NAME)
        if not trial_id:
            # We should never hit this, since metadata should be pre-validated.
            with saved_failure_status(job, session):
                raise Exception(
                    f"Invalid assay metadata: missing protocol identifier ({prism.PROTOCOL_ID_FIELD_NAME})."
                )

        logger.info(
            f"Found completed upload job (job_id={job_id}) with uploader {job.uploader_email}"
        )

        url_bundles = [
            URLBundle(*bundle) for bundle in job.upload_uris_with_data_uris_with_uuids()
        ]

        # Copy GCS blobs in parallel
        logger.info("Copying artifacts from upload bucket to data bucket.")
        with ThreadPoolExecutor(THREADPOOL_THREADS) as executor, saved_failure_status(
            job, session
        ):
            destination_objects = executor.map(
                lambda url_bundle: _gcs_copy(
                    storage_client,
                    GOOGLE_UPLOAD_BUCKET,
                    url_bundle.upload_url,
                    GOOGLE_DATA_BUCKET,
                    url_bundle.target_url,
                ),
                url_bundles,
            )

        metadata_patch = job.metadata_patch
        logger.info("Adding artifact metadata to metadata patch.")
        metadata_patch, downloadable_files = TrialMetadata.merge_gcs_artifacts(
            metadata_patch,
            job.upload_type,
            zip([ub.artifact_uuid for ub in url_bundles], destination_objects),
        )

        # Add metadata for this upload to the database
        logger.info(
            "Merging metadata from upload %d into trial %s: " % (job.id, trial_id),
            metadata_patch,
        )
        with saved_failure_status(job, session):
            trial = TrialMetadata.patch_assays(
                trial_id, metadata_patch, session=session
            )

        # Save downloadable files to the database
        # NOTE: this needs to happen after TrialMetadata.patch_assays
        # in order to avoid violating a foreign-key constraint on the trial_id
        # in the event that this is the first upload for a trial.
        logger.info("Saving artifact records to the downloadable_files table.")
        for artifact_metadata, additional_metadata in downloadable_files:
            logger.debug(
                f"Saving metadata to downloadable_files table: {artifact_metadata}"
            )
            DownloadableFiles.create_from_metadata(
                trial_id,
                job.upload_type,
                artifact_metadata,
                additional_metadata=additional_metadata,
                session=session,
                commit=False,
            )

        # Additionally, make the metadata xlsx a downloadable file
        with saved_failure_status(job, session):
            _, xlsx_blob = _get_bucket_and_blob(
                storage_client, GOOGLE_DATA_BUCKET, job.gcs_xlsx_uri
            )
            full_uri = f"gs://{GOOGLE_DATA_BUCKET}/{xlsx_blob.name}"
            data_format = "Assay Metadata"
            facet_group = f"{job.upload_type}|{data_format}"
            logger.info(f"Saving {full_uri} as a downloadable_file.")
            DownloadableFiles.create_from_blob(
                trial_id,
                job.upload_type,
                data_format,
                facet_group,
                xlsx_blob,
                session=session,
            )

        # Update the job metadata to include artifacts
        job.metadata_patch = metadata_patch

        # Make files downloadable by the specified biofx analysis team group
        assay_prefix = job.upload_type.split("_")[0]  # 'wes_bam' -> 'wes'
        if assay_prefix in GOOGLE_ANALYSIS_PERMISSIONS_GROUPS_DICT:
            analysis_group_email = GOOGLE_ANALYSIS_PERMISSIONS_GROUPS_DICT[assay_prefix]
            _gcs_add_prefix_reader_permission(
                storage_client,
                analysis_group_email,  # to whom give access to
                f"{trial_id}/{assay_prefix}",  # to what sub-folder
            )

        # Save the upload success and trigger email alert if transaction succeeds
        job.ingestion_success(trial, session=session, send_email=True, commit=True)

        # Trigger post-processing on uploaded data files
        logger.info(f"Publishing object URLs to 'artifact_upload' topic")
        with ThreadPoolExecutor(THREADPOOL_THREADS) as executor:
            executor.map(
                lambda url_bundle: publish_artifact_upload(url_bundle.target_url),
                url_bundles,
            )

        # Trigger post-processing on entire upload
        report = _encode_and_publish(str(job.id), GOOGLE_ASSAY_OR_ANALYSIS_UPLOAD_TOPIC)
        if report:
            report.result()

    # Google won't actually do anything with this response; it's
    # provided for testing purposes only.
    return jsonify(
        dict((bundle.upload_url, bundle.target_url) for bundle in url_bundles)
    )
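
# `ingest_upload` wraps several failure-prone steps in `saved_failure_status(job, session)`,
# which is not shown. It presumably persists a failed status on the job when the wrapped
# block raises, then re-raises so the error still surfaces. A minimal sketch under that
# assumption (the failure status value and the commit call are guesses):
from contextlib import contextmanager


@contextmanager
def saved_failure_status_sketch(job, session):
    """Hypothetical context manager: record a failure status if the body raises."""
    try:
        yield
    except Exception:
        job.status = "merge-failed"  # placeholder failure status value
        session.commit()
        raise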