Exemplo n.º 1
0
    def __init__(
        self,
        hostname: str,
        auth: Gen3Auth,
        download_list: List[Downloadable],
    ):
        """
        Prepare the DownloadManager so that it is ready to start downloading.

        The downloadable objects are needed up front so that every token
        required for the download can be made available.

        Args:
            hostname (str): hostname of the Gen3 commons
            auth (Gen3Auth): Gen3 authentication; supplies the access token
            download_list (List[Downloadable]): objects to download
        """
        self.hostname = hostname
        self.access_token = auth.get_access_token()
        self.metadata = Gen3Metadata(auth)
        self.wts_endpoints = wts_external_oidc(hostname)
        self.resolved_compact_drs = {}

        # The commons host itself is registered as a known DRS endpoint with
        # use_wts=False; its expiry is taken from the token's "exp" claim.
        token_claims = decode_token(self.access_token)
        commons_endpoint = KnownDRSEndpoint(
            hostname=self.hostname,
            access_token=self.access_token,
            use_wts=False,
            expire=datetime.fromtimestamp(token_claims["exp"]),
        )
        self.known_hosts = {self.hostname: commons_endpoint}

        self.download_list = download_list
        self.resolve_objects(self.download_list)
Exemplo n.º 2
0
def test_batch_create(requests_mock):
    """
    Check that batch_create hands back the payload of the mocked response.
    """
    client = Gen3Metadata("https://example.com")
    first_guid = "3c42c819-1dfe-4c3e-8d46-c3ec7eb99bf4"
    second_guid = "dfa1a1dc-98f4-46be-ba8f-ae9b42b0ee50"
    records = [
        {"guid": first_guid, "data": {"foo": "bar"}},
        {"guid": second_guid, "data": {"foo": "bar"}},
    ]
    service_reply = {
        "created": [first_guid, second_guid],
        "updated": [],
        "conflict": [],
    }

    def _fake_request(url, **kwargs):
        # the call must target the metadata route
        assert "/metadata" in url

        fake_response = MagicMock(requests.Response)
        fake_response.status_code = 200
        fake_response.json.return_value = service_reply
        fake_response.raise_for_status.side_effect = lambda *args: None
        return fake_response

    requests_mock.side_effect = _fake_request

    assert client.batch_create(records) == service_reply
Exemplo n.º 3
0
def test_query_full_metadata(requests_mock):
    """
    Check that a query with return_full_metadata=True asks for data=True
    and returns the mocked guid -> metadata mapping.
    """
    client = Gen3Metadata("https://example.com")

    sra_data_details = {
        "runs": "1",
        "bases": "145891962638",
        "center": "UM-TOPMed",
        "status": "public",
        "size_Gb": "24",
        "platform": "ILLUMINA",
        "experiments": "1",
        "experiment_type": "WGS",
    }
    dbgap_section = {
        "sex": "male",
        "body_site": "Blood",
        "repository": "TOPMed_WGS_Amish",
        "sample_use": [],
        "analyte_type": "DNA",
        "biosample_id": "SAMN04109653",
        "consent_code": 2,
        "dbgap_status": "Loaded",
        "sra_sample_id": "SRS1305029",
        "dbgap_sample_id": 1784123,
        "study_accession": "phs000956.v3.p1",
        "dbgap_subject_id": 1360617,
        "sra_data_details": sra_data_details,
        "study_subject_id": "phs000956.v3_DBG00256",
        "consent_short_name": "HMB-IRB-MDS",
        "study_with_consent": "phs000956.c2",
        "submitted_sample_id": "NWD299344",
        "submitted_subject_id": "DBG00256",
        "study_accession_with_consent": "phs000956.v3.p1.c2",
    }
    service_reply = {
        "1cfd6767-7775-4e0d-a4a7-d0fc9db02e1d": {
            "dbgap": dbgap_section,
            "_guid_type": "indexed_file_object",
        }
    }

    def _fake_request(url, **kwargs):
        # route, filter and the full-data flag must all appear in the URL
        for fragment in ("/metadata", "foo.bar=fizzbuzz", "data=True"):
            assert fragment in url

        fake_response = MagicMock(requests.Response)
        fake_response.status_code = 200
        fake_response.json.return_value = service_reply
        fake_response.raise_for_status.side_effect = lambda *args: None
        return fake_response

    requests_mock.side_effect = _fake_request

    result = client.query("foo.bar=fizzbuzz", return_full_metadata=True)

    assert result == service_reply
Exemplo n.º 4
0
def try_delete_discovery_guid(auth, guid):
    """
    Delete the metadata record stored under ``guid`` if it is discovery metadata.

    The record is fetched first; it is deleted only when its ``_guid_type``
    equals ``"discovery_metadata"``. Any other record (including one that has
    no ``_guid_type`` key at all) is left alone and a warning is logged.
    HTTP errors raised while fetching or deleting are logged as warnings
    instead of being propagated.

    Args:
        auth: passed to ``Gen3Metadata`` as ``auth_provider``
        guid (str): identifier of the metadata record to delete
    """
    mds = Gen3Metadata(auth_provider=auth)
    try:
        metadata = mds.get(guid)
        # .get() so that a record without "_guid_type" is skipped with a
        # warning rather than aborting this best-effort helper with KeyError
        if metadata.get("_guid_type") == "discovery_metadata":
            mds.delete(guid)
        else:
            logging.warning(f"{guid} is not discovery metadata. Skipping.")
    except requests.exceptions.HTTPError as e:
        logging.warning(e)
Exemplo n.º 5
0
def test_get_index_key_paths_error(requests_mock):
    """
    Check that get_index_key_paths raises when the mocked response's
    raise_for_status raises an HTTPError.
    """
    client = Gen3Metadata("https://example.com")

    def _fake_request(url, **kwargs):
        assert url.endswith("/metadata_index")

        failing_response = MagicMock(requests.Response)
        failing_response.status_code = 500
        failing_response.json.return_value = {"error": "blah"}
        failing_response.raise_for_status.side_effect = HTTPError("uh oh")
        return failing_response

    requests_mock.side_effect = _fake_request

    with pytest.raises(Exception):
        client.get_index_key_paths()
Exemplo n.º 6
0
async def _update_metadata(guid, metadata, auth, commons_url, lock):
    """
    Acquire the semaphore, then update the metadata stored for ``guid``.

    Args:
        guid (str): indexd record globally unique id
        metadata (str): the metadata to add
        auth (Gen3Auth): Gen3 auth or tuple with basic auth name and password
        commons_url (str): root domain for commons where metadata service lives
        lock (asyncio.Semaphore): semaphore used to limit the amount of
            concurrent http connections

    Returns:
        whatever ``Gen3Metadata.async_update`` resolves to
    """
    mds = Gen3Metadata(commons_url, auth_provider=auth)
    async with lock:
        # None keeps the default ssl handling; it is switched off only when
        # the url does not contain "https"
        ssl = None if "https" in commons_url else False
        return await mds.async_update(guid, metadata, _ssl=ssl)
Exemplo n.º 7
0
async def _get_record_from_mds(guid, commons_url, lock):
    """
    Acquire the semaphore, then request the record for the given guid.

    Args:
        guid (str): mds record globally unique id
        commons_url (str): root domain for commons where mds lives
        lock (asyncio.Semaphore): semaphore used to limit the amount of
            concurrent http connections

    Returns:
        the result of ``Gen3Metadata.async_get``, or None when the request
        fails with an aiohttp ``ClientResponseError``
    """
    mds = Gen3Metadata(commons_url)
    async with lock:
        # None keeps the default ssl handling; it is switched off only when
        # the url does not contain "https"
        ssl = None if "https" in commons_url else False

        try:
            record = await mds.async_get(guid, _ssl=ssl)
        except aiohttp.client_exceptions.ClientResponseError:
            record = None
        return record
Exemplo n.º 8
0
def test_create_index_key_paths_error(requests_mock):
    """
    Check that create_index_key_path raises when the mocked response's
    raise_for_status raises an HTTPError.
    """
    client = Gen3Metadata("https://example.com")
    path = "/blah"

    def _fake_request(url, **kwargs):
        assert f"/metadata_index/{path}" in url

        failing_response = MagicMock(requests.Response)
        failing_response.status_code = 500
        failing_response.json.return_value = {"error": "blah"}
        failing_response.raise_for_status.side_effect = HTTPError("uh oh")
        return failing_response

    requests_mock.side_effect = _fake_request

    with pytest.raises(Exception):
        client.create_index_key_path(path)
Exemplo n.º 9
0
def test_get_index_key_paths(requests_mock):
    """
    Check that get_index_key_paths returns the mocked list of key paths.
    """
    client = Gen3Metadata("https://example.com")
    service_reply = ["abc"]

    def _fake_request(url, **kwargs):
        assert url.endswith("/metadata_index")

        fake_response = MagicMock(requests.Response)
        fake_response.status_code = 200
        fake_response.json.return_value = service_reply
        fake_response.raise_for_status.side_effect = lambda *args: None
        return fake_response

    requests_mock.side_effect = _fake_request

    assert client.get_index_key_paths() == service_reply
Exemplo n.º 10
0
def test_get_version(requests_mock):
    """
    Check that get_version yields a truthy value for a successful response.
    """
    client = Gen3Metadata("https://example.com")

    def _fake_request(url, **kwargs):
        assert url.endswith("/version")

        fake_response = MagicMock(requests.Response)
        fake_response.status_code = 200
        fake_response.text = "1.2.0"
        fake_response.json.return_value = {}
        fake_response.raise_for_status.side_effect = lambda *args: None
        return fake_response

    requests_mock.side_effect = _fake_request

    version = client.get_version()

    assert version
Exemplo n.º 11
0
def test_is_not_healthy(requests_mock):
    """
    Check that is_healthy is falsy when the status request fails.
    """
    client = Gen3Metadata("https://example.com")

    def _fake_request(url, **kwargs):
        assert url.endswith("/_status")

        failing_response = MagicMock(requests.Response)
        failing_response.status_code = 500
        failing_response.text = "Not Healthy"
        failing_response.json.return_value = {}
        failing_response.raise_for_status.side_effect = HTTPError("uh oh")
        return failing_response

    requests_mock.side_effect = _fake_request

    healthy = client.is_healthy()

    assert not healthy
Exemplo n.º 12
0
def test_delete_index_key_path(requests_mock):
    """
    Check that delete_index_key_path returns the response object, whose
    status code is the mocked 204.
    """
    client = Gen3Metadata("https://example.com")
    path = "/blah"
    empty_body = {}

    def _fake_request(url, **kwargs):
        assert f"/metadata_index/{path}" in url

        fake_response = MagicMock(requests.Response)
        fake_response.status_code = 204
        fake_response.json.return_value = empty_body
        fake_response.raise_for_status.side_effect = lambda *args: None
        return fake_response

    requests_mock.side_effect = _fake_request

    result = client.delete_index_key_path(path)

    assert result.status_code == 204
Exemplo n.º 13
0
def test_delete(requests_mock):
    """
    Check that delete targets the guid's metadata route and returns the
    mocked response payload.
    """
    client = Gen3Metadata("https://example.com")
    guid = "95a41871-244c-48ae-8004-63f4ed1f0291"
    service_reply = {}

    def _fake_request(url, **kwargs):
        assert f"/metadata/{guid}" in url

        fake_response = MagicMock(requests.Response)
        fake_response.status_code = 200
        fake_response.json.return_value = service_reply
        fake_response.raise_for_status.side_effect = lambda *args: None
        return fake_response

    requests_mock.side_effect = _fake_request

    assert client.delete(guid=guid) == service_reply
Exemplo n.º 14
0
def test_query(requests_mock):
    """
    Check that a plain query asks for data=False and returns the mocked
    list of guids.
    """
    client = Gen3Metadata("https://example.com")
    service_reply = ["1cfd6767-7775-4e0d-a4a7-d0fc9db02e1d"]

    def _fake_request(url, **kwargs):
        # route, filter and the guid-only flag must all appear in the URL
        for fragment in ("/metadata", "foo.bar=fizzbuzz", "data=False"):
            assert fragment in url

        fake_response = MagicMock(requests.Response)
        fake_response.status_code = 200
        fake_response.json.return_value = service_reply
        fake_response.raise_for_status.side_effect = lambda *args: None
        return fake_response

    requests_mock.side_effect = _fake_request

    assert client.query("foo.bar=fizzbuzz") == service_reply
Exemplo n.º 15
0
def test_update(requests_mock):
    """
    Check that update targets the guid's metadata route and returns the
    mocked response payload.
    """
    client = Gen3Metadata("https://example.com")
    guid = "95a41871-244c-48ae-8004-63f4ed1f0291"
    payload = {"foo": "bar", "fizz": "buzz", "nested_details": {"key1": "value1"}}
    # the mocked service echoes back exactly what was sent
    service_reply = payload

    def _fake_request(url, **kwargs):
        assert f"/metadata/{guid}" in url

        fake_response = MagicMock(requests.Response)
        fake_response.status_code = 200
        fake_response.json.return_value = service_reply
        fake_response.raise_for_status.side_effect = lambda *args: None
        return fake_response

    requests_mock.side_effect = _fake_request

    assert client.update(guid=guid, metadata=payload) == service_reply
Exemplo n.º 16
0
def test_is_healthy(requests_mock):
    """
    Check that is_healthy is truthy when the status request succeeds.
    """
    client = Gen3Metadata("https://example.com")
    status_body = {
        "status": "OK",
        "timestamp": "2020-03-13T15:23:53.765568+00:00",
    }

    def _fake_request(url, **kwargs):
        assert url.endswith("/_status")

        fake_response = MagicMock(requests.Response)
        fake_response.status_code = 200
        fake_response.json.return_value = status_body
        fake_response.raise_for_status.side_effect = lambda *args: None
        return fake_response

    requests_mock.side_effect = _fake_request

    healthy = client.is_healthy()

    assert healthy
Exemplo n.º 17
0
async def output_expanded_discovery_metadata(auth,
                                             endpoint=None,
                                             limit=500,
                                             use_agg_mds=False):
    """
    Fetch discovery metadata from a commons and write it to a delimited file.

    Records are queried page by page with ``_guid_type=discovery_metadata``.
    Each record's ``gen3_discovery`` section is cached as JSON in a temporary
    directory, then the cache is flattened into one row per record. The
    ``tags`` list is spread over the columns ``_tag_0`` .. ``_tag_n``, each
    formatted as ``"<category>: <name>"``.

    Args:
        auth: passed to ``Gen3Metadata`` as ``auth_provider`` and to
            ``_metadata_file_from_auth`` to choose the output file name
        endpoint (str, optional): passed to ``Gen3Metadata`` when truthy
        limit (int): maximum number of records requested in total
        use_agg_mds (bool): use the ``mds/aggregate`` service location and
            flatten its per-commons results

    Returns:
        the output file name produced by ``_metadata_file_from_auth(auth)``
    """
    mds_kwargs = {
        "auth_provider": auth,
        "service_location": "mds/aggregate" if use_agg_mds else "mds",
    }
    if endpoint:
        mds_kwargs["endpoint"] = endpoint
    mds = Gen3Metadata(**mds_kwargs)

    with tempfile.TemporaryDirectory() as metadata_cache_dir:
        all_fields = set()
        num_tags = 0

        for offset in range(0, limit, MAX_GUIDS_PER_REQUEST):
            partial_metadata = mds.query(
                "_guid_type=discovery_metadata",
                return_full_metadata=True,
                # request only what is still missing so that the final page
                # cannot push the total past `limit`
                limit=min(limit - offset, MAX_GUIDS_PER_REQUEST),
                offset=offset,
                use_agg_mds=use_agg_mds,
            )

            # if agg MDS we will flatten the results as they are in "common" : dict format
            # However this can result in duplicates as the aggregate mds is namespaced to
            # handle this, therefore prefix the commons in front of the guid
            if use_agg_mds:
                partial_metadata = {
                    f"{commons}__{record_guid}": record
                    for commons, batches in partial_metadata.items()
                    for batch in batches
                    for record_guid, record in batch.items()
                }

            if not partial_metadata:
                # an empty page means there is nothing left to fetch
                break

            for guid, guid_metadata in partial_metadata.items():
                # NOTE(review): "/" in a guid is replaced by "_" for the cache
                # file name, and the output "guid" column is later derived from
                # that file name, so such guids are not written back verbatim.
                with open(f"{metadata_cache_dir}/{guid.replace('/', '_')}",
                          "w+") as cached_guid_file:
                    guid_discovery_metadata = guid_metadata["gen3_discovery"]
                    json.dump(guid_discovery_metadata, cached_guid_file)
                all_fields |= set(guid_discovery_metadata.keys())
                num_tags = max(
                    num_tags, len(guid_discovery_metadata.get("tags", [])))

        # "tags" is flattened to _tag_0 through _tag_n
        output_columns = (["guid"] + sorted(all_fields - {"tags"}) +
                          [f"_tag_{n}" for n in range(num_tags)])
        # every row starts from an all-empty schema so that missing fields
        # are still written as empty cells
        base_schema = {column: "" for column in output_columns}

        output_filename = _metadata_file_from_auth(auth)
        with open(output_filename, "w+") as output_file:
            writer = csv.DictWriter(
                output_file,
                **{
                    **BASE_CSV_PARSER_SETTINGS, "fieldnames": output_columns
                },
            )
            writer.writeheader()

            for guid in sorted(os.listdir(metadata_cache_dir)):
                with open(f"{metadata_cache_dir}/{guid}") as f:
                    fetched_metadata = json.load(f)

                flattened_tags = {
                    f"_tag_{tag_num}": f"{tag['category']}: {tag['name']}"
                    for tag_num, tag in enumerate(
                        fetched_metadata.pop("tags", []))
                }

                true_guid = guid
                if use_agg_mds:
                    # strip only the "<commons>__" prefix; maxsplit=1 keeps a
                    # guid that itself contains "__" intact
                    true_guid = guid.split("__", 1)[1]

                output_metadata = _sanitize_tsv_row({
                    **base_schema,
                    **fetched_metadata,
                    **flattened_tags,
                    "guid": true_guid,
                })
                writer.writerow(output_metadata)

        return output_filename
Exemplo n.º 18
0
async def publish_discovery_metadata(auth,
                                     metadata_filename,
                                     endpoint=None,
                                     omit_empty_values=False):
    """
    Publish discovery metadata read from a tsv (or csv) file.

    Each row becomes one record created with ``overwrite=True`` under the
    row's ``guid``; the remaining columns are stored under ``gen3_discovery``
    with ``_guid_type`` set to ``discovery_metadata``. Columns whose name
    contains ``_tag_`` are merged into a single ``tags`` list of
    ``{"name", "category"}`` dicts, parsed from ``"<category>: <name>"``.

    Args:
        auth: passed to ``Gen3Metadata`` as ``auth_provider``; also used to
            derive the file name when ``metadata_filename`` is falsy
        metadata_filename (str): file to read; a name ending in ``.csv`` is
            read comma-delimited, anything else tab-delimited
        endpoint (str, optional): passed to ``Gen3Metadata`` when truthy
        omit_empty_values (bool): drop fields whose value is ``""``, ``[]``
            or ``{}`` before publishing
    """
    if endpoint:
        mds = Gen3Metadata(auth_provider=auth, endpoint=endpoint)
    else:
        mds = Gen3Metadata(auth_provider=auth)

    if not metadata_filename:
        metadata_filename = _metadata_file_from_auth(auth)

    delimiter = "," if metadata_filename.endswith(".csv") else "\t"

    with open(metadata_filename) as metadata_file:
        metadata_reader = csv.DictReader(
            metadata_file, **{
                **BASE_CSV_PARSER_SETTINGS, "delimiter": delimiter
            })
        # fieldnames is None when the file has no header row (empty file)
        tag_columns = [
            column for column in (metadata_reader.fieldnames or [])
            if "_tag_" in column
        ]
        pending_requests = []

        for metadata_line in metadata_reader:
            discovery_metadata = {
                key: _try_parse(value)
                for key, value in metadata_line.items()
            }

            if tag_columns:
                # all columns _tag_0 -> _tag_n are pushed to a "tags" column
                tag_values = [
                    discovery_metadata.pop(column) for column in tag_columns
                ]
                coalesced_tags = []
                for tag in tag_values:
                    if tag == "":
                        continue
                    # split on the first ":" only, so a tag name that itself
                    # contains a colon is kept whole
                    tag_category, tag_name = tag.split(":", 1)
                    coalesced_tags.append({
                        "name": tag_name.strip(),
                        "category": tag_category.strip(),
                    })
                discovery_metadata["tags"] = coalesced_tags

            guid = discovery_metadata.pop("guid")

            if omit_empty_values:
                discovery_metadata = {
                    key: value
                    for key, value in discovery_metadata.items()
                    if value not in ["", [], {}]
                }

            metadata = {
                "_guid_type": "discovery_metadata",
                "gen3_discovery": discovery_metadata,
            }

            pending_requests.append(
                mds.async_create(guid, metadata, overwrite=True))
            # flush in batches so that at most MAX_CONCURRENT_REQUESTS
            # requests are awaited together
            if len(pending_requests) == MAX_CONCURRENT_REQUESTS:
                await asyncio.gather(*pending_requests)
                pending_requests = []

        # send whatever is left over from the last partial batch
        await asyncio.gather(*pending_requests)