Example #1
def test_assay_upload_ingestion_success(clean_db, monkeypatch, caplog):
    """Check that the ingestion success method works as expected"""
    caplog.set_level(logging.DEBUG)

    new_user = Users.create(PROFILE)
    trial = TrialMetadata.create(TRIAL_ID, METADATA)
    assay_upload = UploadJobs.create(
        upload_type="ihc",
        uploader_email=EMAIL,
        gcs_file_map={},
        metadata={PROTOCOL_ID_FIELD_NAME: TRIAL_ID},
        gcs_xlsx_uri="",
        commit=False,
    )

    clean_db.commit()

    # Ensure that success can't be declared from a starting state
    with pytest.raises(Exception, match="current status"):
        assay_upload.ingestion_success(trial)

    # Update assay_upload status to simulate a completed but not ingested upload
    assay_upload.status = UploadJobStatus.UPLOAD_COMPLETED.value
    assay_upload.ingestion_success(trial)

    # Check that status was updated and email wasn't sent by default
    db_record = UploadJobs.find_by_id(assay_upload.id)
    assert db_record.status == UploadJobStatus.MERGE_COMPLETED.value
    assert "Would send email with subject '[UPLOAD SUCCESS]" not in caplog.text

    # Check that email gets sent when specified
    assay_upload.ingestion_success(trial, send_email=True)
    assert "Would send email with subject '[UPLOAD SUCCESS]" in caplog.text
def test_merge_extra_metadata(cidc_api, clean_db, monkeypatch):
    """Ensure merging of extra metadata follows the expected execution flow"""
    user_id = setup_trial_and_user(cidc_api, monkeypatch)
    with cidc_api.app_context():
        user = Users.find_by_id(user_id)
    make_cimac_biofx_user(user_id, cidc_api)

    with cidc_api.app_context():
        assay_upload = UploadJobs.create(
            upload_type="assay_with_extra_md",
            uploader_email=user.email,
            gcs_file_map={},
            metadata={
                PROTOCOL_ID_FIELD_NAME: trial_id,
                "whatever": {
                    "hierarchy": [
                        {
                            "we just need a": "uuid-1",
                            "to be able": "to merge"
                        },
                        {
                            "and": "uuid-2"
                        },
                    ]
                },
            },
            gcs_xlsx_uri="",
            commit=False,
        )
        assay_upload.id = 137
        assay_upload.insert()

        custom_extra_md_parse = MagicMock()
        custom_extra_md_parse.side_effect = lambda f: {
            "extra_md": f.read().decode()
        }
        monkeypatch.setattr(
            "cidc_schemas.prism.merger.EXTRA_METADATA_PARSERS",
            {"assay_with_extra_md": custom_extra_md_parse},
        )

        form_data = {
            "job_id": 137,
            "uuid-1": (io.BytesIO(b"fake file 1"), "fname1"),
            "uuid-2": (io.BytesIO(b"fake file 2"), "fname2"),
        }

        client = cidc_api.test_client()
        res = client.post("/ingestion/extra-assay-metadata", data=form_data)
        assert res.status_code == 200
        assert custom_extra_md_parse.call_count == 2

        fetched_jobs = UploadJobs.list()
        assert 1 == len(fetched_jobs)
        au = fetched_jobs[0]
        assert "extra_md" in au.metadata_patch["whatever"]["hierarchy"][0]
        assert "extra_md" in au.metadata_patch["whatever"]["hierarchy"][1]
@contextmanager  # needs: from contextlib import contextmanager
def saved_failure_status(job: UploadJobs, session):
    """Save an upload failure to the database before raising an exception."""
    try:
        yield
    except Exception as e:
        job.status = UploadJobStatus.MERGE_FAILED.value
        job.status_details = str(e)
        session.commit()
        raise e
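
Elsewhere in these examples the helper above wraps each risky merge step as a context manager (see ingest_upload in Example #19). A minimal usage sketch, assuming a job row and SQLAlchemy session as above; do_merge is a hypothetical placeholder for the wrapped work:

# Hypothetical usage: if do_merge() raises, the job is marked merge-failed
# (with the error text) before the exception propagates to the caller.
with saved_failure_status(job, session):
    do_merge()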
Example #4
def test_create_upload_job(db):
    """Try to create an upload job"""
    new_user = Users.create(PROFILE)

    gcs_file_uris = ["my/first/wes/blob1", "my/first/wes/blob2"]
    metadata_json_patch = {"foo": "bar"}

    # Create a fresh upload job
    new_job = UploadJobs.create("dummy_assay", EMAIL, gcs_file_uris,
                                metadata_json_patch)
    job = UploadJobs.find_by_id(new_job.id)
    assert_same_elements(new_job.gcs_file_uris, job.gcs_file_uris)
    assert job.status == "started"
Example #5
def test_upload_job_no_file_map(clean_db):
    """Try to create an assay upload with no GCS file map"""
    new_user = Users.create(PROFILE)

    metadata_patch = {PROTOCOL_ID_FIELD_NAME: TRIAL_ID}
    gcs_xlsx_uri = "xlsx/assays/wes/12:0:1.5123095"

    TrialMetadata.create(TRIAL_ID, METADATA)

    new_job = UploadJobs.create(
        prism.SUPPORTED_MANIFESTS[0], EMAIL, None, metadata_patch, gcs_xlsx_uri
    )
    assert list(new_job.upload_uris_with_data_uris_with_uuids()) == []

    job = UploadJobs.find_by_id_and_email(new_job.id, PROFILE["email"])
    assert list(job.upload_uris_with_data_uris_with_uuids()) == []
def test_assay_or_analysis_preconditions(monkeypatch):
    """Ensure derive_files_from_assay_or_analysis_upload blocks derivation under the expected conditions."""
    find_by_id = MagicMock()
    find_by_id.return_value = None
    monkeypatch.setattr(upload_postprocessing.UploadJobs, "find_by_id",
                        find_by_id)

    with pytest.raises(Exception, match="No upload record with id"):
        upload_postprocessing.derive_files_from_assay_or_analysis_upload(
            event, None)

    find_by_id.return_value = upload_job = UploadJobs(trial_id="foo")
    upload_job._set_status_no_validation(UploadJobStatus.MERGE_FAILED.value)

    with pytest.raises(Exception, match="status is merge-failed"):
        upload_postprocessing.derive_files_from_assay_or_analysis_upload(
            event, None)

    upload_job._set_status_no_validation(UploadJobStatus.MERGE_COMPLETED.value)

    # Ensure that file derivation happens once the upload record exists and is merge-completed
    _derive_files = MagicMock()
    monkeypatch.setattr(upload_postprocessing, "_derive_files_from_upload",
                        _derive_files)

    upload_postprocessing.derive_files_from_assay_or_analysis_upload(
        event, None)
    _derive_files.assert_called()
Example #7
def derive_files_from_assay_or_analysis_upload(event: dict,
                                               context: BackgroundContext):
    """
    Generate derivative files from an assay or analysis upload.
    """
    upload_id = extract_pubsub_data(event)

    with sqlalchemy_session() as session:
        upload_record: UploadJobs = UploadJobs.find_by_id(upload_id,
                                                          session=session)

        if not upload_record:
            raise Exception(f"No upload record with id {upload_id} found.")

        if UploadJobStatus(
                upload_record.status) != UploadJobStatus.MERGE_COMPLETED:
            raise Exception(
                f"Cannot perform postprocessing on upload {upload_id}: status is {upload_record.status}"
            )

        print(
            f"Received completed assay/analysis upload {upload_id} for postprocessing."
        )

        # Run the file derivation
        _derive_files_from_upload(upload_record.trial_id,
                                  upload_record.upload_type, session)
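
Example #22 below drives functions like this one with make_pubsub_event(str(job.id)). A minimal sketch of such a helper, assuming extract_pubsub_data reads the base64-encoded "data" field that Pub/Sub delivers (the encoding detail is an assumption, not shown in these excerpts):

import base64


def make_pubsub_event(payload: str) -> dict:
    # Hypothetical test helper: wrap a payload the way a Pub/Sub push message
    # carries it, base64-encoded under the "data" key, so that
    # extract_pubsub_data (assumed to reverse this) recovers the upload id.
    return {"data": base64.b64encode(payload.encode("utf-8")).decode("utf-8")}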
def test_poll_upload_merge_status(cidc_api, clean_db, monkeypatch):
    """
    Check poll_upload_merge_status endpoint behavior
    """
    user_id = setup_trial_and_user(cidc_api, monkeypatch)
    with cidc_api.app_context():
        user = Users.find_by_id(user_id)
    make_cimac_biofx_user(user_id, cidc_api)

    metadata = {PROTOCOL_ID_FIELD_NAME: trial_id}

    with cidc_api.app_context():
        other_user = Users(email="*****@*****.**")
        other_user.insert()
        upload_job = UploadJobs.create(
            upload_type="wes",
            uploader_email=user.email,
            gcs_file_map={},
            metadata=metadata,
            gcs_xlsx_uri="",
        )
        upload_job.insert()
        upload_job_id = upload_job.id

    client = cidc_api.test_client()

    # Upload not found
    res = client.get(
        f"/ingestion/poll_upload_merge_status/12345?token={upload_job.token}")
    assert res.status_code == 404

    upload_job_url = (
        f"/ingestion/poll_upload_merge_status/{upload_job_id}?token={upload_job.token}"
    )

    # Upload not-yet-ready
    res = client.get(upload_job_url)
    assert res.status_code == 200
    assert "retry_in" in res.json and res.json["retry_in"] == 5
    assert "status" not in res.json

    test_details = "A human-friendly reason for this "
    for status in [
            UploadJobStatus.MERGE_COMPLETED.value,
            UploadJobStatus.MERGE_FAILED.value,
    ]:
        # Simulate cloud function merge status update
        with cidc_api.app_context():
            upload_job._set_status_no_validation(status)
            upload_job.status_details = test_details
            upload_job.update()

        # Upload ready
        res = client.get(upload_job_url)
        assert res.status_code == 200
        assert "retry_in" not in res.json
        assert "status" in res.json and res.json["status"] == status
        assert ("status_details" in res.json
                and res.json["status_details"] == test_details)
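
The assertions above suggest the endpoint's contract: before the merge has finished it returns only retry_in (5 seconds here), and once the job reaches merge-completed or merge-failed it returns status and status_details instead. A minimal sketch of that branching with an illustrative helper name (not the API's actual implementation):

def poll_response(upload_job) -> dict:
    # Illustrative only: terminal merge states report status/status_details,
    # anything else tells the client to poll again in 5 seconds.
    terminal = {
        UploadJobStatus.MERGE_COMPLETED.value,
        UploadJobStatus.MERGE_FAILED.value,
    }
    if upload_job.status in terminal:
        return {
            "status": upload_job.status,
            "status_details": upload_job.status_details,
        }
    return {"retry_in": 5}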
Example #9
def test_assay_upload_merge_extra_metadata(clean_db, monkeypatch):
    """Try to merge extra metadata into an existing assay upload"""
    new_user = Users.create(PROFILE)

    TrialMetadata.create(TRIAL_ID, METADATA)

    assay_upload = UploadJobs.create(
        upload_type="assay_with_extra_md",
        uploader_email=EMAIL,
        gcs_file_map={},
        metadata={
            PROTOCOL_ID_FIELD_NAME: TRIAL_ID,
            "whatever": {
                "hierarchy": [
                    {"we just need a": "uuid-1", "to be able": "to merge"},
                    {"and": "uuid-2"},
                ]
            },
        },
        gcs_xlsx_uri="",
        commit=False,
    )
    assay_upload.id = 111
    clean_db.commit()

    custom_extra_md_parse = MagicMock()
    custom_extra_md_parse.side_effect = lambda f: {"extra": f.read().decode()}
    monkeypatch.setattr(
        "cidc_schemas.prism.merger.EXTRA_METADATA_PARSERS",
        {"assay_with_extra_md": custom_extra_md_parse},
    )

    UploadJobs.merge_extra_metadata(
        111,
        {
            "uuid-1": io.BytesIO(b"within extra md file 1"),
            "uuid-2": io.BytesIO(b"within extra md file 2"),
        },
        session=clean_db,
    )

    assert 1 == clean_db.query(UploadJobs).count()
    au = clean_db.query(UploadJobs).first()
    assert "extra" in au.metadata_patch["whatever"]["hierarchy"][0]
    assert "extra" in au.metadata_patch["whatever"]["hierarchy"][1]
Example #10
def test_requires_upload_token_auth(cidc_api, clean_db, monkeypatch):
    """Check that the requires_upload_token_auth decorator works as expected"""
    user_id = setup_trial_and_user(cidc_api, monkeypatch)
    job_id = setup_upload_jobs(cidc_api)[0]
    with cidc_api.app_context():
        job = UploadJobs.find_by_id(job_id)

    test_route = "/foobarfoo"

    @requires_upload_token_auth
    def endpoint(*args, **kwargs):
        assert "upload_job" in kwargs
        return "ok", 200

    query_route = f"{test_route}/{job_id}"
    nonexistent_job_id = "9999999"

    # User must provide `token` query param
    with cidc_api.test_request_context(query_route):
        with pytest.raises(UnprocessableEntity) as e:
            endpoint(upload_job=job_id)
        assert e._excinfo[1].data["messages"]["query"]["token"] == [
            "Missing data for required field."
        ]

    # User must provide correct `token` query param
    with cidc_api.test_request_context(f"{query_route}?token={'bad token'}"):
        with pytest.raises(Unauthorized,
                           match="upload_job token authentication failed"):
            endpoint(upload_job=job_id)

    with cidc_api.test_request_context(f"{query_route}?token={job.token}"):
        assert endpoint(upload_job=job_id) == ("ok", 200)

    # User whose id token authentication succeeds gets a 404 if the relevant job doesn't exist
    with cidc_api.test_request_context(
            f"{test_route}/{nonexistent_job_id}?token={job.token}"):
        with pytest.raises(NotFound):
            endpoint(upload_job=nonexistent_job_id)

    monkeypatch.setattr(
        "cidc_api.resources.upload_jobs.authenticate_and_get_user",
        lambda *args, **kwargs: None,
    )

    # User whose id token authentication fails can still successfully authenticate
    # using an upload token.
    with cidc_api.test_request_context(f"{query_route}?token={job.token}"):
        assert endpoint(upload_job=job_id) == ("ok", 200)

    # User whose id token authentication fails gets a 401 if the relevant job doesn't exist
    with cidc_api.test_request_context(
            f"{test_route}/{nonexistent_job_id}?token={job.token}"):
        with pytest.raises(Unauthorized,
                           match="upload_job token authentication failed"):
            endpoint(upload_job=nonexistent_job_id)
Example #11
def setup_db_records(cidc_api):
    extra = {"_etag": ETAG}
    with cidc_api.app_context():
        Users(**users["json"], **extra).insert(compute_etag=False)
        TrialMetadata(**trial_metadata["json"], **extra).insert(compute_etag=False)
        DownloadableFiles(**downloadable_files["json"], **extra).insert(
            compute_etag=False
        )
        Permissions(**permissions["json"], **extra).insert(compute_etag=False)
        UploadJobs(**upload_jobs["json"], **extra).insert(compute_etag=False)
Example #12
def test_new_upload_alert(monkeypatch):
    vals = {"id": 1, "trial_id": "foo", "uploader_email": "*****@*****.**"}

    gen_confs = MagicMock()
    gen_confs.side_effect = (
        lambda ct, patch, template_type, bucket: {"attach.file": "content"}
        if "wes" in template_type
        else {}
    )
    monkeypatch.setattr(
        "cidc_api.shared.emails.generate_analysis_configs_from_upload_patch", gen_confs
    )

    for upload, full_ct, expected_att in [
        (
            UploadJobs(
                **vals, upload_type="wes_bam", metadata_patch={"assays": {"wes": []}}
            ),
            {"assays": {"wes": []}},
            [
                {
                    "content": "Y29udGVudA==",  # "content" base64 encoded
                    "filename": "attach.file",
                    "type": "application/yaml",
                }
            ],
        ),
        (UploadJobs(**vals, upload_type="pbmc", metadata_patch={}), {}, None),
    ]:
        email = new_upload_alert(upload, full_ct)
        assert "UPLOAD SUCCESS" in email["subject"]
        assert email["to_emails"] == [CIDC_MAILING_LIST]
        for val in vals.values():
            assert str(val) in email["html_content"]

        gen_confs.assert_called()

        assert email.get("attachments") == expected_att
Example #13
def test_create_assay_upload(clean_db):
    """Try to create an assay upload"""
    new_user = Users.create(PROFILE)

    gcs_file_map = {
        "my/first/wes/blob1/2019-08-30T15:51:38.450978": "test-uuid-1",
        "my/first/wes/blob2/2019-08-30T15:51:38.450978": "test-uuid-2",
    }
    metadata_patch = {PROTOCOL_ID_FIELD_NAME: TRIAL_ID}
    gcs_xlsx_uri = "xlsx/assays/wes/12:0:1.5123095"

    # Should fail, since trial doesn't exist yet
    with pytest.raises(IntegrityError):
        UploadJobs.create("wes_bam", EMAIL, gcs_file_map, metadata_patch, gcs_xlsx_uri)
    clean_db.rollback()

    TrialMetadata.create(TRIAL_ID, METADATA)

    new_job = UploadJobs.create(
        "wes_bam", EMAIL, gcs_file_map, metadata_patch, gcs_xlsx_uri
    )
    job = UploadJobs.find_by_id_and_email(new_job.id, PROFILE["email"])
    assert len(new_job.gcs_file_map) == len(job.gcs_file_map)
    assert set(new_job.gcs_file_map) == set(job.gcs_file_map)
    assert job.status == "started"

    assert list(job.upload_uris_with_data_uris_with_uuids()) == [
        (
            "my/first/wes/blob1/2019-08-30T15:51:38.450978",
            "my/first/wes/blob1",
            "test-uuid-1",
        ),
        (
            "my/first/wes/blob2/2019-08-30T15:51:38.450978",
            "my/first/wes/blob2",
            "test-uuid-2",
        ),
    ]
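
The expected tuples above imply the method's contract: each gcs_file_map key is an upload URI of the form <object name>/<timestamp>, and upload_uris_with_data_uris_with_uuids yields (upload URI, object name with the trailing timestamp stripped, artifact UUID), or nothing when there is no file map (Example #5). A standalone sketch of that behavior under those assumptions, not the model's actual method:

def upload_uris_with_data_uris_with_uuids(gcs_file_map):
    # Illustrative sketch: strip the trailing "/<timestamp>" segment from each
    # upload URI to recover the data URI, and pair both with the artifact UUID.
    for upload_uri, artifact_uuid in (gcs_file_map or {}).items():
        data_uri = upload_uri.rsplit("/", 1)[0]
        yield upload_uri, data_uri, artifact_uuid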
Example #14
def setup_upload_jobs(cidc_api) -> Tuple[int, int]:
    """
    Insert two uploads into the database created by different users
    and return their IDs.
    """
    with cidc_api.app_context():
        other_user = Users(email="*****@*****.**")
        other_user.insert()

        job1 = UploadJobs(
            uploader_email=user_email,
            trial_id=trial_id,
            status=UploadJobStatus.STARTED.value,
            metadata_patch={
                "test": {
                    "upload_placeholder": "baz"
                },
                "test2": "foo"
            },
            upload_type="",
            gcs_xlsx_uri="",
            gcs_file_map={"bip": "baz"},
            multifile=False,
        )
        job2 = UploadJobs(
            uploader_email=other_user.email,
            trial_id=trial_id,
            status=UploadJobStatus.STARTED.value,
            metadata_patch={
                "array": [{
                    "upload_placeholder": "baz"
                }, {
                    "test2": "foo"
                }]
            },
            upload_type="",
            gcs_xlsx_uri="",
            gcs_file_map={"bip": "baz"},
            multifile=False,
        )

        job1.insert()
        job2.insert()

        return job1.id, job2.id
Example #15
def derive_files_from_manifest_upload(event: dict, context: BackgroundContext):
    """
    Generate derivative files from a manifest upload.
    """
    upload_id = extract_pubsub_data(event)

    with sqlalchemy_session() as session:
        upload_record: UploadJobs = UploadJobs.find_by_id(upload_id,
                                                          session=session)
        if not upload_record:
            raise Exception(
                f"No manifest upload record found with id {upload_id}.")

        print(
            f"Received completed manifest upload {upload_id} for postprocessing."
        )

        # Run the file derivation
        _derive_files_from_upload(upload_record.trial_id,
                                  upload_record.upload_type, session)
Example #16
def test_upload_olink(cidc_api, clean_db, monkeypatch):
    """Ensure the upload endpoint follows the expected execution flow"""
    user_id = setup_trial_and_user(cidc_api, monkeypatch)
    with cidc_api.app_context():
        user = Users.find_by_id(user_id)

    make_cimac_biofx_user(user_id, cidc_api)

    client = cidc_api.test_client()

    mocks = UploadMocks(
        monkeypatch,
        prismify_file_entries=[
            finfo(lp, url, "uuid" + str(i), "npx" in url, False)
            for i, (lp, url) in enumerate(OLINK_TESTDATA)
        ],
    )

    # No permission to upload yet
    res = client.post(ASSAY_UPLOAD,
                      data=form_data("olink.xlsx", io.BytesIO(b"1234"),
                                     "olink"))
    assert res.status_code == 401
    assert "not authorized to upload olink data" in str(
        res.json["_error"]["message"])

    mocks.clear_all()

    # Give permission and retry
    grant_upload_permission(user_id, "olink", cidc_api)

    res = client.post(ASSAY_UPLOAD,
                      data=form_data("olink.xlsx", io.BytesIO(b"1234"),
                                     "olink"))
    assert res.status_code == 200

    assert "url_mapping" in res.json
    url_mapping = res.json["url_mapping"]

    # Olink assay has extra_metadata files
    assert "extra_metadata" in res.json
    extra_metadata = res.json["extra_metadata"]
    assert type(extra_metadata) == dict

    # We expect local_path to map to a gcs object name with gcs_prefix.
    for local_path, gcs_prefix in OLINK_TESTDATA:
        gcs_object_name = url_mapping[local_path]
        assert local_path in url_mapping
        assert gcs_object_name.startswith(gcs_prefix)
        assert (local_path not in gcs_object_name
                ), "PHI from local_path shouldn't end up in gcs urls"

    # Check that we tried to grant IAM upload access to gcs_object_name
    mocks.grant_write.assert_called_with(user.email)

    # Check that we tried to upload the assay metadata excel file
    mocks.upload_xlsx.assert_called_once()

    job_id = res.json["job_id"]
    update_url = f"/upload_jobs/{job_id}"

    # Report an upload failure
    res = client.patch(
        f"{update_url}?token={res.json['token']}",
        json={"status": UploadJobStatus.UPLOAD_FAILED.value},
        headers={"If-Match": res.json["job_etag"]},
    )
    assert res.status_code == 200
    mocks.revoke_write.assert_called_with(user.email)
    # This was an upload failure, so success shouldn't have been published
    mocks.publish_success.assert_not_called()

    # Test upload status validation - since the upload job's current status
    # is UPLOAD_FAILED, the API shouldn't permit this status to be updated to
    # UPLOAD_COMPLETED.
    bad_res = client.patch(
        f"{update_url}?token={res.json['token']}",
        json={"status": UploadJobStatus.UPLOAD_COMPLETED.value},
        headers={"If-Match": res.json["_etag"]},
    )
    assert bad_res.status_code == 400
    assert ("status upload-failed can't transition to status upload-completed"
            in bad_res.json["_error"]["message"])

    # Reset the upload status and try the request again
    with cidc_api.app_context():
        job = UploadJobs.find_by_id_and_email(job_id, user.email)
        job._set_status_no_validation(UploadJobStatus.STARTED.value)
        job.update()
        _etag = job._etag

    res = client.patch(
        f"{update_url}?token={res.json['token']}",
        json={"status": UploadJobStatus.UPLOAD_COMPLETED.value},
        headers={"If-Match": _etag},
    )
    assert res.status_code == 200
    mocks.publish_success.assert_called_with(job_id)
Example #17
def test_upload_wes(cidc_api, clean_db, monkeypatch):
    """Ensure the upload endpoint follows the expected execution flow"""
    user_id = setup_trial_and_user(cidc_api, monkeypatch)
    make_cimac_biofx_user(user_id, cidc_api)
    with cidc_api.app_context():
        user = Users.find_by_id(user_id)

    client = cidc_api.test_client()

    mocks = UploadMocks(
        monkeypatch,
        prismify_file_entries=[
            finfo("localfile.ext", "test_trial/url/file.ext", "uuid-1", None,
                  False)
        ],
    )

    # No permission to upload yet
    res = client.post(ASSAY_UPLOAD,
                      data=form_data("wes.xlsx", io.BytesIO(b"1234"),
                                     "wes_fastq"))
    assert res.status_code == 401
    assert "not authorized to upload wes_fastq data" in str(
        res.json["_error"]["message"])

    mocks.clear_all()

    # Give permission and retry
    grant_upload_permission(user_id, "wes_fastq", cidc_api)

    res = client.post(ASSAY_UPLOAD,
                      data=form_data("wes.xlsx", io.BytesIO(b"1234"),
                                     "wes_fastq"))
    assert res.status_code == 200
    assert "url_mapping" in res.json
    url_mapping = res.json["url_mapping"]

    # WES assay does not have any extra_metadata files, but its (and every assay's) response
    # should have an extra_metadata field.
    assert "extra_metadata" in res.json
    extra_metadata = res.json["extra_metadata"]
    assert extra_metadata is None

    # We expect local_path to map to a gcs object name with gcs_prefix
    local_path = "localfile.ext"
    gcs_prefix = "test_trial/url/file.ext"
    gcs_object_name = url_mapping[local_path]
    assert local_path in url_mapping
    assert gcs_object_name.startswith(gcs_prefix)
    assert not gcs_object_name.endswith(
        local_path), "PHI from local_path shouldn't end up in gcs urls"

    # Check that we tried to grant IAM upload access to gcs_object_name
    mocks.grant_write.assert_called_with(user.email)

    # Check that we tried to upload the assay metadata excel file
    mocks.upload_xlsx.assert_called_once()

    job_id = res.json["job_id"]
    update_url = f"/upload_jobs/{job_id}"

    # Report an upload failure
    res = client.patch(
        f"{update_url}?token={res.json['token']}",
        json={"status": UploadJobStatus.UPLOAD_FAILED.value},
        headers={"If-Match": res.json["job_etag"]},
    )
    assert res.status_code == 200
    mocks.revoke_write.assert_called_with(user.email)
    # This was an upload failure, so success shouldn't have been published
    mocks.publish_success.assert_not_called()

    # Reset the upload status and try the request again
    with cidc_api.app_context():
        job = UploadJobs.find_by_id_and_email(job_id, user.email)
        job._set_status_no_validation(UploadJobStatus.STARTED.value)
        job.update()
        _etag = job._etag

    # Report an upload success
    res = client.patch(
        f"{update_url}?token={res.json['token']}",
        json={"status": UploadJobStatus.UPLOAD_COMPLETED.value},
        headers={"If-Match": _etag},
    )
    assert res.status_code == 200
    mocks.publish_success.assert_called_with(job_id)
Example #18
def test_update_upload_job(cidc_api, clean_db, monkeypatch):
    """Check that updating an upload job by ID works as expected."""
    user_id = setup_trial_and_user(cidc_api, monkeypatch)
    user_job, other_job = setup_upload_jobs(cidc_api)
    with cidc_api.app_context():
        user_job_record = UploadJobs.find_by_id(user_job)
        other_job_record = UploadJobs.find_by_id(other_job)

    publish_success = MagicMock()
    monkeypatch.setattr("cidc_api.shared.gcloud_client.publish_upload_success",
                        publish_success)
    revoke_upload_access = MagicMock()
    monkeypatch.setattr("cidc_api.shared.gcloud_client.revoke_upload_access",
                        revoke_upload_access)

    client = cidc_api.test_client()

    # Possible patches
    upload_success = {"status": UploadJobStatus.UPLOAD_COMPLETED.value}
    upload_failure = {"status": UploadJobStatus.UPLOAD_FAILED.value}
    invalid_update = {"status": UploadJobStatus.MERGE_COMPLETED.value}

    # A user gets an error if they fail to provide an upload token
    res = client.patch(f"/upload_jobs/{other_job}", json=upload_success)
    assert res.status_code == 422
    publish_success.assert_not_called()
    revoke_upload_access.assert_not_called()

    # A user gets an authentication error if they provide an incorrect upload token
    res = client.patch(
        f"/upload_jobs/{other_job}?token=nope",
        headers={"if-match": other_job_record._etag},
        json=upload_success,
    )
    assert res.status_code == 401
    assert res.json["_error"][
        "message"] == "upload_job token authentication failed"
    publish_success.assert_not_called()
    revoke_upload_access.assert_not_called()

    # A user gets an error if they try to update something besides the job's status
    res = client.patch(
        f"/upload_jobs/{other_job}?token={other_job_record.token}",
        headers={"if-match": other_job_record._etag},
        json={
            "uploader_email": "*****@*****.**",
            "status": ""
        },
    )
    assert res.status_code == 422
    assert res.json["_error"]["message"]["uploader_email"][
        0] == "Unknown field."

    # A user providing a correct token can update their job's status to be a failure
    res = client.patch(
        f"/upload_jobs/{other_job}?token={other_job_record.token}",
        headers={"if-match": other_job_record._etag},
        json={
            "gcs_file_map": {
                "foo": "bar"
            },
            **upload_failure
        },
    )
    assert res.status_code == 200
    publish_success.assert_not_called()
    revoke_upload_access.assert_called_once()
    revoke_upload_access.reset_mock()

    with cidc_api.app_context():
        modified_job = UploadJobs.find_by_id(other_job)
        assert modified_job.metadata_patch == {"array": [{"test2": "foo"}]}
        user_job_record._set_status_no_validation(
            UploadJobStatus.STARTED.value)
        user_job_record.update()

    # A user can update a job to be a success
    # Also allows for updating the gcs_file_map and thereby the metadata_patch
    res = client.patch(
        f"/upload_jobs/{user_job}?token={user_job_record.token}",
        headers={"if-match": user_job_record._etag},
        json={
            "gcs_file_map": {
                "foo": "bar"
            },
            **upload_success
        },
    )
    assert res.status_code == 200
    publish_success.assert_called_once_with(user_job)
    revoke_upload_access.assert_called_once()
    with cidc_api.app_context():
        modified_job = UploadJobs.find_by_id(user_job)
        assert modified_job.gcs_file_map == {"foo": "bar"}
        assert modified_job.metadata_patch == {"test2": "foo"}

    publish_success.reset_mock()
    revoke_upload_access.reset_mock()

    with cidc_api.app_context():
        user_job_record._set_status_no_validation(
            UploadJobStatus.STARTED.value)
        user_job_record.update()

    # Users can't make an illegal state transition
    res = client.patch(
        f"/upload_jobs/{user_job}?token={user_job_record.token}",
        headers={"if-match": user_job_record._etag},
        json=invalid_update,
    )
    assert res.status_code == 400
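
The two metadata_patch assertions above show what updating gcs_file_map does to the stored patch: any dict whose upload_placeholder no longer appears among the new map's values is dropped, whether it sits under a key or inside a list. A recursive sketch of that pruning rule, written only to illustrate the observed behavior (not the API's actual code):

def prune_stale_placeholders(node, keep_placeholders):
    # Illustrative: remove any dict carrying an upload_placeholder that is not
    # referenced by the new gcs_file_map values; recurse through dicts/lists.
    if isinstance(node, dict):
        if "upload_placeholder" in node and node["upload_placeholder"] not in keep_placeholders:
            return None
        pruned = {k: prune_stale_placeholders(v, keep_placeholders) for k, v in node.items()}
        return {k: v for k, v in pruned.items() if v is not None}
    if isinstance(node, list):
        return [
            v
            for v in (prune_stale_placeholders(i, keep_placeholders) for i in node)
            if v is not None
        ]
    return node

# e.g. prune_stale_placeholders(
#     {"array": [{"upload_placeholder": "baz"}, {"test2": "foo"}]}, {"bar"}
# ) == {"array": [{"test2": "foo"}]}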
Example #19
def ingest_upload(event: dict, context: BackgroundContext):
    """
    When a successful upload event is published, move the data associated
    with the upload job into the download bucket and merge the upload metadata
    into the appropriate clinical trial JSON.
    """
    storage_client = storage.Client()

    job_id = int(extract_pubsub_data(event))

    logger.info(f"ingest_upload execution started on upload job id {job_id}")

    with sqlalchemy_session() as session:
        job: UploadJobs = UploadJobs.find_by_id(job_id, session=session)

        # Check ingestion pre-conditions
        if not job:
            raise Exception(f"No assay upload job with id {job_id} found.")
        if UploadJobStatus(job.status) != UploadJobStatus.UPLOAD_COMPLETED:
            raise Exception(
                f"Received ID for job with status {job.status}. Aborting ingestion."
            )
        trial_id = job.metadata_patch.get(prism.PROTOCOL_ID_FIELD_NAME)
        if not trial_id:
            # We should never hit this, since metadata should be pre-validated.
            with saved_failure_status(job, session):
                raise Exception(
                    f"Invalid assay metadata: missing protocol identifier ({prism.PROTOCOL_ID_FIELD_NAME})."
                )

        logger.info(
            f"Found completed upload job (job_id={job_id}) with uploader {job.uploader_email}"
        )

        url_bundles = [
            URLBundle(*bundle) for bundle in job.upload_uris_with_data_uris_with_uuids()
        ]

        # Copy GCS blobs in parallel
        logger.info("Copying artifacts from upload bucket to data bucket.")
        with ThreadPoolExecutor(THREADPOOL_THREADS) as executor, saved_failure_status(
            job, session
        ):
            destination_objects = executor.map(
                lambda url_bundle: _gcs_copy(
                    storage_client,
                    GOOGLE_UPLOAD_BUCKET,
                    url_bundle.upload_url,
                    GOOGLE_DATA_BUCKET,
                    url_bundle.target_url,
                ),
                url_bundles,
            )

        metadata_patch = job.metadata_patch
        logger.info("Adding artifact metadata to metadata patch.")
        metadata_patch, downloadable_files = TrialMetadata.merge_gcs_artifacts(
            metadata_patch,
            job.upload_type,
            zip([ub.artifact_uuid for ub in url_bundles], destination_objects),
        )

        # Add metadata for this upload to the database
        logger.info(
            "Merging metadata from upload %d into trial %s: %s",
            job.id,
            trial_id,
            metadata_patch,
        )
        with saved_failure_status(job, session):
            trial = TrialMetadata.patch_assays(
                trial_id, metadata_patch, session=session
            )

        # Save downloadable files to the database
        # NOTE: this needs to happen after TrialMetadata.patch_assays
        # in order to avoid violating a foreign-key constraint on the trial_id
        # in the event that this is the first upload for a trial.
        logger.info("Saving artifact records to the downloadable_files table.")
        for artifact_metadata, additional_metadata in downloadable_files:
            logger.debug(
                f"Saving metadata to downloadable_files table: {artifact_metadata}"
            )
            DownloadableFiles.create_from_metadata(
                trial_id,
                job.upload_type,
                artifact_metadata,
                additional_metadata=additional_metadata,
                session=session,
                commit=False,
            )

        # Additionally, make the metadata xlsx a downloadable file
        with saved_failure_status(job, session):
            _, xlsx_blob = _get_bucket_and_blob(
                storage_client, GOOGLE_DATA_BUCKET, job.gcs_xlsx_uri
            )
            full_uri = f"gs://{GOOGLE_DATA_BUCKET}/{xlsx_blob.name}"
            data_format = "Assay Metadata"
            facet_group = f"{job.upload_type}|{data_format}"
            logger.info(f"Saving {full_uri} as a downloadable_file.")
            DownloadableFiles.create_from_blob(
                trial_id,
                job.upload_type,
                data_format,
                facet_group,
                xlsx_blob,
                session=session,
            )

        # Update the job metadata to include artifacts
        job.metadata_patch = metadata_patch

        # Making files downloadable by a specified biofx analysis team group
        assay_prefix = job.upload_type.split("_")[0]  # 'wes_bam' -> 'wes'
        if assay_prefix in GOOGLE_ANALYSIS_PERMISSIONS_GROUPS_DICT:
            analysis_group_email = GOOGLE_ANALYSIS_PERMISSIONS_GROUPS_DICT[assay_prefix]
            _gcs_add_prefix_reader_permission(
                storage_client,
                analysis_group_email,  # to whom give access to
                f"{trial_id}/{assay_prefix}",  # to what sub-folder
            )

        # Save the upload success and trigger email alert if transaction succeeds
        job.ingestion_success(trial, session=session, send_email=True, commit=True)

        # Trigger post-processing on uploaded data files
        logger.info(f"Publishing object URLs to 'artifact_upload' topic")
        with ThreadPoolExecutor(THREADPOOL_THREADS) as executor:
            executor.map(
                lambda url_bundle: publish_artifact_upload(url_bundle.target_url),
                url_bundles,
            )

        # Trigger post-processing on entire upload
        report = _encode_and_publish(str(job.id), GOOGLE_ASSAY_OR_ANALYSIS_UPLOAD_TOPIC)
        if report:
            report.result()

    # Google won't actually do anything with this response; it's
    # provided for testing purposes only.
    return jsonify(
        dict((bundle.upload_url, bundle.target_url) for bundle in url_bundles)
    )
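
URLBundle(*bundle) above is built from the tuples that upload_uris_with_data_uris_with_uuids yields (see Example #13), and the function reads .upload_url, .target_url, and .artifact_uuid from it. A plausible definition under that assumption; the field names come from the attribute accesses above, and the actual declaration is not shown in these excerpts:

from typing import NamedTuple


class URLBundle(NamedTuple):
    # Field order mirrors the (upload URI, data URI, artifact UUID) tuples
    # yielded by UploadJobs.upload_uris_with_data_uris_with_uuids().
    upload_url: str  # object name in the upload (staging) bucket
    target_url: str  # object name in the data (download) bucket
    artifact_uuid: str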
Example #20
def test_migrations_failures(use_upload_jobs_table, monkeypatch):
    """Test that changes get rolled back in potential failure scenarios."""
    # Mock alembic
    monkeypatch.setattr(migrations, "op", MagicMock())

    # Mock sqlalchemy
    mock_session_builder = MagicMock()
    mock_session = MagicMock()
    mock_session_builder.return_value = mock_session
    monkeypatch.setattr(migrations, "Session", mock_session_builder)

    # Mock cidc_api and prism functions
    trial_record = MagicMock()
    select_trials = MagicMock()
    select_trials.return_value = [trial_record]
    monkeypatch.setattr(migrations, "_select_trials", select_trials)

    df_record = MagicMock()
    select_df = MagicMock()
    select_df.return_value = df_record
    monkeypatch.setattr(DownloadableFiles, "get_by_object_url", select_df)

    select_assay_uploads = MagicMock()
    select_assay_uploads.return_value = [
        UploadJobs(gcs_file_map={
            "a_old_url/ts": "foo",
            "b_old_url/ts": "bar"
        })
    ]
    monkeypatch.setattr(migrations, "_select_successful_assay_uploads",
                        select_assay_uploads)

    if not use_upload_jobs_table:
        select_manifest_uploads = MagicMock()
        select_manifest_uploads.return_value = [MagicMock()]
        monkeypatch.setattr(migrations, "_select_manifest_uploads",
                            select_manifest_uploads)

    new_metadata = {
        "some_assay": {
            "extra": "metadata",
            "files": [
                {"upload_placeholder": "foo"},
                {"upload_placeholder": "bar"},
            ],
        }
    }
    mock_migration = MagicMock()
    mock_migration.return_value = MigrationResult(
        new_metadata,
        {
            "a_old_url": {
                "object_url": "a_new_url",
                "upload_placeholder": "foo"
            },
            "b_old_url": {
                "object_url": "b_new_url",
                "upload_placeholder": "bar"
            },
        },
    )

    rename_gcs_obj = MagicMock()
    monkeypatch.setattr(migrations, "rename_gcs_blob", rename_gcs_obj)

    def reset_mocks():
        rename_gcs_obj.reset_mock()
        mock_session.commit.reset_mock()
        mock_session.rollback.reset_mock()
        mock_session.close.reset_mock()

    # GCS failure config
    rename_gcs_obj.side_effect = [None, Exception("gcs failure"), None]

    with pytest.raises(Exception, match="gcs failure"):
        run_metadata_migration(mock_migration, use_upload_jobs_table)
    # Called 3 times - task 1 succeeds, task 2 fails, task 1 rolls back
    assert len(rename_gcs_obj.call_args_list) == 3
    mock_session.commit.assert_not_called()
    mock_session.rollback.assert_called_once()
    mock_session.close.assert_called_once()

    rename_gcs_obj.side_effect = None
    reset_mocks()

    # SQL failure
    select_assay_uploads.side_effect = Exception("sql failure")

    with pytest.raises(Exception, match="sql failure"):
        run_metadata_migration(mock_migration, use_upload_jobs_table)
    mock_session.commit.assert_not_called()
    mock_session.rollback.assert_called_once()
    mock_session.close.assert_called_once()
    # Ensure no GCS operations were carried out
    rename_gcs_obj.assert_not_called()

    reset_mocks()

    # No failures
    select_assay_uploads.side_effect = None
    run_metadata_migration(mock_migration, use_upload_jobs_table)
    # Ensure we updated trials as expected
    trial_record.safely_set_metadata_json.assert_called_with(new_metadata)
    # Ensure we updated files as expected
    assert df_record.object_url == "b_new_url"
    assert df_record.additional_metadata == {"some_assay.extra": "metadata"}
    # Ensure we renamed the right objects
    assert rename_gcs_obj.call_args_list == [
        call(GOOGLE_DATA_BUCKET, "a_old_url", "a_new_url"),
        call(GOOGLE_DATA_BUCKET, "b_old_url", "b_new_url"),
    ]
Example #21
def setup_data(cidc_api, clean_db):
    user = Users(email="*****@*****.**", approval_date=datetime.now())
    shipment = {
        "courier": "FEDEX",
        "ship_to": "",
        "ship_from": "",
        "assay_type": assay_type,
        "manifest_id": manifest_id,
        "date_shipped": "2020-06-10 00:00:00",
        "date_received": "2020-06-11 00:00:00",
        "account_number": "",
        "assay_priority": "1",
        "receiving_party": "MSSM_Rahman",
        "tracking_number": "",
        "shipping_condition": "Frozen_Dry_Ice",
        "quality_of_shipment": "Specimen shipment received in good condition",
    }
    metadata = {
        "protocol_identifier": trial_id,
        "shipments": [
            # we get duplicate shipment uploads sometimes
            shipment,
            shipment,
        ],
        "participants": [
            {
                "cimac_participant_id": f"CTTTPP{p}",
                "participant_id": "x",
                "cohort_name": "",
                "samples": [
                    {
                        "cimac_id": f"CTTTPP{p}SS.0{s}",
                        "sample_location": "",
                        "type_of_primary_container": "Other",
                        "type_of_sample": "Other",
                        "collection_event_name": "",
                        "parent_sample_id": "",
                    }
                    for s in range(num_samples[p])
                ],
            }
            for p in range(num_participants)
        ],
        "allowed_cohort_names": [""],
        "allowed_collection_event_names": [""],
    }
    trial = TrialMetadata(trial_id=trial_id, metadata_json=metadata)
    upload_job = UploadJobs(
        uploader_email=user.email,
        trial_id=trial.trial_id,
        upload_type="pbmc",
        gcs_xlsx_uri="",
        metadata_patch=metadata,
        multifile=False,
    )
    upload_job._set_status_no_validation(UploadJobStatus.MERGE_COMPLETED.value)
    with cidc_api.app_context():
        user.insert()
        trial.insert()
        upload_job.insert()

        clean_db.refresh(user)
        clean_db.refresh(upload_job)
        clean_db.refresh(trial)

    return user, upload_job, trial
Example #22
def test_ingest_upload(caplog, monkeypatch):
    """Test upload data transfer functionality"""

    TS_AND_PATH = "/1234/local_path1.txt"
    ARTIFACT = {"test-prop": "test-val"}
    TRIAL_ID = "CIMAC-12345"

    job = UploadJobs(
        id=JOB_ID,
        uploader_email="*****@*****.**",
        trial_id=TRIAL_ID,
        gcs_xlsx_uri="test.xlsx",
        gcs_file_map=FILE_MAP,
        metadata_patch={
            prism.PROTOCOL_ID_FIELD_NAME: TRIAL_ID,
            "assays": {
                "wes": [{
                    "records": [{
                        "cimac_id": "CIMAC-mock-sa-id",
                        "files": {
                            "r1": {
                                "upload_placeholder": "uuid1"
                            },
                            "r2": {
                                "upload_placeholder": "uuid2"
                            },
                        },
                    }]
                }]
            },
        },
        status=UploadJobStatus.UPLOAD_COMPLETED.value,
        upload_type="wes_bam",
    )

    # Since the test database isn't yet set up with migrations,
    # it won't have the correct relations in it, so we can't actually
    # store or retrieve data
    find_by_id = MagicMock()
    find_by_id.return_value = job
    monkeypatch.setattr(UploadJobs, "find_by_id", find_by_id)

    # Mock data transfer functionality
    _gcs_copy = MagicMock()
    _gcs_copy.side_effect = lambda storage_client, source_bucket, source_object, target_bucket, target_object: _gcs_obj_mock(
        target_object,
        100,
        datetime.datetime.now(),
        "gsc_url_mock_md5",
        "gsc_url_mock_crc32c",
    )
    monkeypatch.setattr("functions.uploads._gcs_copy", _gcs_copy)

    _get_bucket_and_blob = MagicMock()
    xlsx_blob = MagicMock()
    _get_bucket_and_blob.return_value = None, xlsx_blob
    monkeypatch.setattr("functions.uploads._get_bucket_and_blob",
                        _get_bucket_and_blob)

    monkeypatch.setattr(
        "functions.uploads.GOOGLE_ANALYSIS_PERMISSIONS_GROUPS_DICT",
        {"wes": "analysis-group@email"},
    )

    # mocking `google.cloud.storage.Client()` to not actually create a client
    _storage_client = MagicMock("_storage_client")
    monkeypatch.setattr("functions.uploads.storage.Client",
                        lambda *a, **kw: _storage_client)

    _bucket = MagicMock("_bucket")
    _storage_client.get_bucket = lambda *a, **kw: _bucket

    _storage_client._connection = _connection = MagicMock("_connection")

    _api_request = _connection.api_request = MagicMock(
        "_connection.api_request")
    _api_request.return_value = {"bindings": []}

    _bucket.set_iam_policy = _set_iam_policy = MagicMock(
        "_bucket.set_iam_policy")
    _bucket.get_iam_policy = _get_iam_policy = MagicMock(
        "_bucket.get_iam_policy")
    _policy = _get_iam_policy.return_value = MagicMock("_policy")
    iam_prefix = f'resource.name.startsWith("projects/_/buckets/cidc-data-staging/objects/{TRIAL_ID}/wes/")'
    # This setup checks handling of duplicate bindings
    _policy.bindings = [{
        "role": GOOGLE_ANALYSIS_GROUP_ROLE,
        "members": {f"group:analysis-group@email"},
        "condition": {
            "expression": iam_prefix
        },
    }]

    # Mock metadata merging functionality
    _save_file = MagicMock("_save_file")
    monkeypatch.setattr(DownloadableFiles, "create_from_metadata", _save_file)

    _save_blob_file = MagicMock("_save_blob_file")
    monkeypatch.setattr(DownloadableFiles, "create_from_blob", _save_blob_file)

    _merge_metadata = MagicMock("_merge_metadata")
    monkeypatch.setattr(TrialMetadata, "patch_assays", _merge_metadata)

    publish_artifact_upload = MagicMock("publish_artifact_upload")
    monkeypatch.setattr(uploads, "publish_artifact_upload",
                        publish_artifact_upload)

    _encode_and_publish = MagicMock("_encode_and_publish")
    monkeypatch.setattr(uploads, "_encode_and_publish", _encode_and_publish)

    successful_upload_event = make_pubsub_event(str(job.id))
    response = ingest_upload(successful_upload_event, None).json

    assert response[URI1 + UPLOAD_DATE_PATH] == URI1
    assert response[URI2 + UPLOAD_DATE_PATH] == URI2
    find_by_id.assert_called_once()
    # Check that we copied multiple objects
    assert _gcs_copy.call_count > 1
    # Check that we tried to save multiple files
    assert _save_file.call_count > 1
    # Check that we tried to merge metadata once
    _merge_metadata.assert_called_once()
    # Check that we got the xlsx blob metadata from GCS
    _get_bucket_and_blob.assert_called_with(_storage_client,
                                            GOOGLE_DATA_BUCKET,
                                            job.gcs_xlsx_uri)
    # Check that we created a downloadable file for the xlsx file blob
    assert _save_blob_file.call_args[:-1][0] == (
        "CIMAC-12345",
        "wes_bam",
        "Assay Metadata",
        "wes_bam|Assay Metadata",
        xlsx_blob,
    )
    # Check that we tried to update GCS access policy
    _set_iam_policy.assert_called_once()
    # Check that we added GCS access for biofx team
    assert _policy == _set_iam_policy.call_args[0][0]
    assert len(_policy.bindings) == 1
    assert _policy.bindings[0]["members"] == {"group:analysis-group@email"}
    assert _policy.bindings[0][
        "role"] == "projects/cidc-dfci-staging/roles/CIDC_biofx"
    assert iam_prefix in _policy.bindings[0]["condition"]["expression"]
    _until = datetime.datetime.today() + datetime.timedelta(
        GOOGLE_ANALYSIS_PERMISSIONS_GRANT_FOR_DAYS)
    assert (
        f'request.time < timestamp("{_until.date().isoformat()}T00:00:00Z")'
        in _policy.bindings[0]["condition"]["expression"])

    # Check that the job status was updated to reflect a successful upload
    assert job.status == UploadJobStatus.MERGE_COMPLETED.value
    assert email_was_sent(caplog.text)
    publish_artifact_upload.assert_called()
    _encode_and_publish.assert_called()
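
email_was_sent is not defined in these excerpts; judging from the caplog assertions in Example #1, test environments only log outgoing mail rather than sending it, so a minimal stand-in could simply look for that log line (an assumption about the helper, not its actual definition):

def email_was_sent(log_text: str) -> bool:
    # Assumed behavior: match the "Would send email with subject ..." line
    # that the email-sending code logs in tests (see Example #1).
    return "Would send email with subject" in log_text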