def __init__(
    self,
    hostname: str,
    auth: Gen3Auth,
    download_list: List[Downloadable],
):
    """
    Set up the DownloadManager so that downloading can begin.

    The downloadable objects are needed up front so that all tokens
    required to support the download are available.

    Args:
        hostname (str): Gen3 commons home commons
        auth (Gen3Auth): Gen3 authentication
        download_list (List[Downloadable]): list of objects to download
    """
    self.hostname = hostname
    self.access_token = auth.get_access_token()
    self.metadata = Gen3Metadata(auth)
    self.wts_endpoints = wts_external_oidc(hostname)
    self.resolved_compact_drs = {}

    # The COMMONS host does not use the WTS, so it is registered directly
    # as a known DRS endpoint with the access token obtained above.
    token_expiry = datetime.fromtimestamp(decode_token(self.access_token)["exp"])
    commons_endpoint = KnownDRSEndpoint(
        hostname=self.hostname,
        access_token=self.access_token,
        use_wts=False,
        expire=token_expiry,
    )
    self.known_hosts = {self.hostname: commons_endpoint}

    self.download_list = download_list
    self.resolve_objects(self.download_list)
def test_batch_create(requests_mock):
    """
    Batch creation hands back the JSON body of the mocked response
    """
    metadata = Gen3Metadata("https://example.com")

    first_guid = "3c42c819-1dfe-4c3e-8d46-c3ec7eb99bf4"
    second_guid = "dfa1a1dc-98f4-46be-ba8f-ae9b42b0ee50"
    metadata_list = [
        {"guid": first_guid, "data": {"foo": "bar"}},
        {"guid": second_guid, "data": {"foo": "bar"}},
    ]
    expected_response = {
        "created": [first_guid, second_guid],
        "updated": [],
        "conflict": [],
    }

    def _fake_request(url, **kwargs):
        # every request made by the call must target the metadata route
        assert "/metadata" in url

        fake = MagicMock(spec=requests.Response)
        fake.status_code = 200
        fake.json.return_value = expected_response
        fake.raise_for_status.side_effect = lambda *args: None
        return fake

    requests_mock.side_effect = _fake_request

    assert metadata.batch_create(metadata_list) == expected_response
def test_query_full_metadata(requests_mock):
    """
    Querying with return_full_metadata=True yields the full records
    """
    metadata = Gen3Metadata("https://example.com")

    sra_data_details = {
        "runs": "1",
        "bases": "145891962638",
        "center": "UM-TOPMed",
        "status": "public",
        "size_Gb": "24",
        "platform": "ILLUMINA",
        "experiments": "1",
        "experiment_type": "WGS",
    }
    dbgap_record = {
        "sex": "male",
        "body_site": "Blood",
        "repository": "TOPMed_WGS_Amish",
        "sample_use": [],
        "analyte_type": "DNA",
        "biosample_id": "SAMN04109653",
        "consent_code": 2,
        "dbgap_status": "Loaded",
        "sra_sample_id": "SRS1305029",
        "dbgap_sample_id": 1784123,
        "study_accession": "phs000956.v3.p1",
        "dbgap_subject_id": 1360617,
        "sra_data_details": sra_data_details,
        "study_subject_id": "phs000956.v3_DBG00256",
        "consent_short_name": "HMB-IRB-MDS",
        "study_with_consent": "phs000956.c2",
        "submitted_sample_id": "NWD299344",
        "submitted_subject_id": "DBG00256",
        "study_accession_with_consent": "phs000956.v3.p1.c2",
    }
    expected_response = {
        "1cfd6767-7775-4e0d-a4a7-d0fc9db02e1d": {
            "dbgap": dbgap_record,
            "_guid_type": "indexed_file_object",
        }
    }

    def _fake_request(url, **kwargs):
        # route, filter and the full-data flag must all appear in the url
        for fragment in ("/metadata", "foo.bar=fizzbuzz", "data=True"):
            assert fragment in url

        fake = MagicMock(spec=requests.Response)
        fake.status_code = 200
        fake.json.return_value = expected_response
        fake.raise_for_status.side_effect = lambda *args: None
        return fake

    requests_mock.side_effect = _fake_request

    result = metadata.query("foo.bar=fizzbuzz", return_full_metadata=True)
    assert result == expected_response
def try_delete_discovery_guid(auth, guid):
    """
    Deletes all discovery metadata under [guid] if it exists

    The record is fetched first and only deleted when its "_guid_type" is
    "discovery_metadata"; any other record is left alone and a warning is
    logged. HTTP errors from the metadata service are logged as warnings
    rather than raised.

    Args:
        auth: auth provider handed to Gen3Metadata
        guid: identifier of the metadata record to delete
    """
    mds = Gen3Metadata(auth_provider=auth)
    try:
        metadata = mds.get(guid)
        # A record lacking "_guid_type" is not discovery metadata; .get()
        # lets it fall through to the warning instead of raising KeyError.
        if metadata.get("_guid_type") == "discovery_metadata":
            mds.delete(guid)
        else:
            logging.warning(f"{guid} is not discovery metadata. Skipping.")
    except requests.exceptions.HTTPError as e:
        logging.warning(e)
def test_get_index_key_paths_error(requests_mock):
    """
    Listing index key paths raises when the mocked request fails
    """
    metadata = Gen3Metadata("https://example.com")

    def _failing_request(url, **kwargs):
        assert url.endswith("/metadata_index")

        fake = MagicMock(spec=requests.Response)
        fake.status_code = 500
        fake.json.return_value = {"error": "blah"}
        fake.raise_for_status.side_effect = HTTPError("uh oh")
        return fake

    requests_mock.side_effect = _failing_request

    with pytest.raises(Exception):
        metadata.get_index_key_paths()
async def _update_metadata(guid, metadata, auth, commons_url, lock):
    """
    Acquires the semaphore, then updates the metadata stored for guid

    Args:
        guid (str): indexd record globally unique id
        metadata (str): the metadata to add
        auth (Gen3Auth): Gen3 auth or tuple with basic auth name and password
        commons_url (str): root domain for commons where metadata service lives
        lock (asyncio.Semaphore): semaphore used to limit the amount of
            concurrent http connections

    Returns:
        whatever the awaited async_update call returns
    """
    mds = Gen3Metadata(commons_url, auth_provider=auth)

    async with lock:
        # None keeps the default ssl handling; ssl is only switched off
        # when the url does not mention https
        ssl_setting = None if "https" in commons_url else False
        return await mds.async_update(guid, metadata, _ssl=ssl_setting)
async def _get_record_from_mds(guid, commons_url, lock):
    """
    Acquires the semaphore, then requests the record for the given guid

    Args:
        guid (str): mds record globally unique id
        commons_url (str): root domain for commons where mds lives
        lock (asyncio.Semaphore): semaphore used to limit the amount of
            concurrent http connections

    Returns:
        the result of async_get, or None when the request raises a
        ClientResponseError
    """
    mds = Gen3Metadata(commons_url)

    async with lock:
        # None keeps the default ssl handling; ssl is only switched off
        # when the url does not mention https
        ssl_setting = None if "https" in commons_url else False

        try:
            record = await mds.async_get(guid, _ssl=ssl_setting)
        except aiohttp.client_exceptions.ClientResponseError:
            record = None
        return record
def test_create_index_key_paths_error(requests_mock):
    """
    Creating an index key path raises when the mocked request fails
    """
    path = "/blah"
    metadata = Gen3Metadata("https://example.com")

    def _failing_request(url, **kwargs):
        assert f"/metadata_index/{path}" in url

        fake = MagicMock(spec=requests.Response)
        fake.status_code = 500
        fake.json.return_value = {"error": "blah"}
        fake.raise_for_status.side_effect = HTTPError("uh oh")
        return fake

    requests_mock.side_effect = _failing_request

    with pytest.raises(Exception):
        metadata.create_index_key_path(path)
def test_get_index_key_paths(requests_mock):
    """
    Listing index key paths returns the JSON body of the mocked response
    """
    metadata = Gen3Metadata("https://example.com")
    expected_response = ["abc"]

    def _fake_request(url, **kwargs):
        assert url.endswith("/metadata_index")

        fake = MagicMock(spec=requests.Response)
        fake.status_code = 200
        fake.json.return_value = expected_response
        fake.raise_for_status.side_effect = lambda *args: None
        return fake

    requests_mock.side_effect = _fake_request

    assert metadata.get_index_key_paths() == expected_response
def test_get_version(requests_mock):
    """
    Getting the version yields a truthy value
    """
    metadata = Gen3Metadata("https://example.com")

    def _fake_request(url, **kwargs):
        assert url.endswith("/version")

        fake = MagicMock(spec=requests.Response)
        fake.status_code = 200
        fake.text = "1.2.0"
        fake.json.return_value = {}
        fake.raise_for_status.side_effect = lambda *args: None
        return fake

    requests_mock.side_effect = _fake_request

    version = metadata.get_version()
    assert version
def test_is_not_healthy(requests_mock):
    """
    is_healthy is falsy when the status request fails
    """
    metadata = Gen3Metadata("https://example.com")

    def _failing_request(url, **kwargs):
        assert url.endswith("/_status")

        fake = MagicMock(spec=requests.Response)
        fake.status_code = 500
        fake.text = "Not Healthy"
        fake.json.return_value = {}
        fake.raise_for_status.side_effect = HTTPError("uh oh")
        return fake

    requests_mock.side_effect = _failing_request

    healthy = metadata.is_healthy()
    assert not healthy
def test_delete_index_key_path(requests_mock):
    """
    Deleting an index key path returns the 204 response
    """
    path = "/blah"
    expected_response = {}
    metadata = Gen3Metadata("https://example.com")

    def _fake_request(url, **kwargs):
        assert f"/metadata_index/{path}" in url

        fake = MagicMock(spec=requests.Response)
        fake.status_code = 204
        fake.json.return_value = expected_response
        fake.raise_for_status.side_effect = lambda *args: None
        return fake

    requests_mock.side_effect = _fake_request

    result = metadata.delete_index_key_path(path)
    assert result.status_code == 204
def test_delete(requests_mock):
    """
    Deleting a guid returns the JSON body of the mocked response
    """
    guid = "95a41871-244c-48ae-8004-63f4ed1f0291"
    expected_response = {}
    metadata = Gen3Metadata("https://example.com")

    def _fake_request(url, **kwargs):
        assert f"/metadata/{guid}" in url

        fake = MagicMock(spec=requests.Response)
        fake.status_code = 200
        fake.json.return_value = expected_response
        fake.raise_for_status.side_effect = lambda *args: None
        return fake

    requests_mock.side_effect = _fake_request

    assert metadata.delete(guid=guid) == expected_response
def test_query(requests_mock):
    """
    Querying without full metadata returns the list of matching guids
    """
    metadata = Gen3Metadata("https://example.com")
    expected_response = ["1cfd6767-7775-4e0d-a4a7-d0fc9db02e1d"]

    def _fake_request(url, **kwargs):
        # route, filter and the guid-only flag must all appear in the url
        for fragment in ("/metadata", "foo.bar=fizzbuzz", "data=False"):
            assert fragment in url

        fake = MagicMock(spec=requests.Response)
        fake.status_code = 200
        fake.json.return_value = expected_response
        fake.raise_for_status.side_effect = lambda *args: None
        return fake

    requests_mock.side_effect = _fake_request

    assert metadata.query("foo.bar=fizzbuzz") == expected_response
def test_update(requests_mock):
    """
    Updating a guid returns the JSON body of the mocked response
    """
    guid = "95a41871-244c-48ae-8004-63f4ed1f0291"
    data = {
        "foo": "bar",
        "fizz": "buzz",
        "nested_details": {"key1": "value1"},
    }
    # the mocked service echoes the submitted metadata back
    expected_response = data
    metadata = Gen3Metadata("https://example.com")

    def _fake_request(url, **kwargs):
        assert f"/metadata/{guid}" in url

        fake = MagicMock(spec=requests.Response)
        fake.status_code = 200
        fake.json.return_value = expected_response
        fake.raise_for_status.side_effect = lambda *args: None
        return fake

    requests_mock.side_effect = _fake_request

    assert metadata.update(guid=guid, metadata=data) == expected_response
def test_is_healthy(requests_mock):
    """
    is_healthy is truthy when the status request succeeds
    """
    metadata = Gen3Metadata("https://example.com")
    status_body = {
        "status": "OK",
        "timestamp": "2020-03-13T15:23:53.765568+00:00",
    }

    def _fake_request(url, **kwargs):
        assert url.endswith("/_status")

        fake = MagicMock(spec=requests.Response)
        fake.status_code = 200
        fake.json.return_value = status_body
        fake.raise_for_status.side_effect = lambda *args: None
        return fake

    requests_mock.side_effect = _fake_request

    healthy = metadata.is_healthy()
    assert healthy
async def output_expanded_discovery_metadata(
    auth, endpoint=None, limit=500, use_agg_mds=False
):
    """
    fetch discovery metadata from a commons and output to {commons}-discovery-metadata.tsv

    Records are queried page by page, cached as one JSON file per guid in a
    temporary directory, and then written out as a single delimited file
    whose name comes from _metadata_file_from_auth(auth). The "tags" list of
    each record is flattened into "_tag_0" .. "_tag_n" columns.

    Args:
        auth: auth provider handed to Gen3Metadata
        endpoint: optional endpoint handed to Gen3Metadata
        limit (int): maximum number of records to request in total
        use_agg_mds (bool): query the "mds/aggregate" service location and
            flatten its per-commons results

    Returns:
        the name of the file that was written
    """
    service_location = "mds/aggregate" if use_agg_mds else "mds"
    if endpoint:
        mds = Gen3Metadata(
            auth_provider=auth,
            endpoint=endpoint,
            service_location=service_location,
        )
    else:
        mds = Gen3Metadata(
            auth_provider=auth,
            service_location=service_location,
        )

    with tempfile.TemporaryDirectory() as metadata_cache_dir:
        all_fields = set()
        num_tags = 0

        for offset in range(0, limit, MAX_GUIDS_PER_REQUEST):
            partial_metadata = mds.query(
                "_guid_type=discovery_metadata",
                return_full_metadata=True,
                # only ask for what is still missing, so the final page does
                # not push the total past `limit`
                limit=min(limit - offset, MAX_GUIDS_PER_REQUEST),
                offset=offset,
                use_agg_mds=use_agg_mds,
            )

            # if agg MDS we will flatten the results as they are in "common" : dict format
            # However this can result in duplicates as the aggregate mds is namespaced to
            # handle this, therefore prefix the commons in front of the guid
            if use_agg_mds:
                partial_metadata = {
                    f"{commons}__{guid}": record
                    for commons, entries in partial_metadata.items()
                    for entry in entries
                    for guid, record in entry.items()
                }

            if not partial_metadata:
                # an empty page means there is nothing left to fetch
                break

            for guid, guid_metadata in partial_metadata.items():
                # NOTE(review): "/" in a guid is replaced to get a valid file
                # name, and that file name is later used as the output guid,
                # so such guids are written with "_" -- confirm intended.
                with open(
                    f"{metadata_cache_dir}/{guid.replace('/', '_')}", "w+"
                ) as cached_guid_file:
                    guid_discovery_metadata = guid_metadata["gen3_discovery"]
                    json.dump(guid_discovery_metadata, cached_guid_file)
                    all_fields |= set(guid_discovery_metadata.keys())
                    num_tags = max(
                        num_tags, len(guid_discovery_metadata.get("tags", []))
                    )

        output_columns = (
            ["guid"]
            # "tags" is flattened to _tag_0 through _tag_n
            + sorted(all_fields - {"tags"})
            + [f"_tag_{n}" for n in range(num_tags)]
        )
        base_schema = {column: "" for column in output_columns}

        output_filename = _metadata_file_from_auth(auth)
        with open(output_filename, "w+") as output_file:
            writer = csv.DictWriter(
                output_file,
                **{**BASE_CSV_PARSER_SETTINGS, "fieldnames": output_columns},
            )
            writer.writeheader()

            for guid in sorted(os.listdir(metadata_cache_dir)):
                with open(f"{metadata_cache_dir}/{guid}") as f:
                    fetched_metadata = json.load(f)

                flattened_tags = {
                    f"_tag_{tag_num}": f"{tag['category']}: {tag['name']}"
                    for tag_num, tag in enumerate(fetched_metadata.pop("tags", []))
                }

                true_guid = guid
                if use_agg_mds:
                    # strip only the "<commons>__" prefix added above; a guid
                    # that itself contains "__" must stay intact
                    true_guid = guid.split("__", 1)[1]

                output_metadata = _sanitize_tsv_row(
                    {
                        **base_schema,
                        **fetched_metadata,
                        **flattened_tags,
                        "guid": true_guid,
                    }
                )
                writer.writerow(output_metadata)

        return output_filename
async def publish_discovery_metadata(
    auth, metadata_filename, endpoint=None, omit_empty_values=False
):
    """
    Publish discovery metadata from a tsv file

    Each row becomes one record created (with overwrite) under the row's
    "guid", with "_guid_type" set to "discovery_metadata". Columns whose name
    contains "_tag_" are merged into a single "tags" list of
    {"name", "category"} entries. Requests are awaited in batches of
    MAX_CONCURRENT_REQUESTS.

    Args:
        auth: auth provider handed to Gen3Metadata
        metadata_filename (str): file to read; when falsy, the name from
            _metadata_file_from_auth(auth) is used. A ".csv" suffix selects
            "," as delimiter, anything else a tab.
        endpoint: optional endpoint handed to Gen3Metadata
        omit_empty_values (bool): drop fields whose value is "", [] or {}

    Raises:
        ValueError: if a non-empty tag cell contains no ":" separator
        KeyError: if a row has no "guid" column
    """
    if endpoint:
        mds = Gen3Metadata(auth_provider=auth, endpoint=endpoint)
    else:
        mds = Gen3Metadata(auth_provider=auth)

    if not metadata_filename:
        metadata_filename = _metadata_file_from_auth(auth)

    delimiter = "," if metadata_filename.endswith(".csv") else "\t"

    with open(metadata_filename) as metadata_file:
        metadata_reader = csv.DictReader(
            metadata_file,
            **{**BASE_CSV_PARSER_SETTINGS, "delimiter": delimiter},
        )
        # fieldnames is None for an empty file; treat that as "no columns"
        tag_columns = [
            column
            for column in (metadata_reader.fieldnames or [])
            if "_tag_" in column
        ]

        pending_requests = []
        for metadata_line in metadata_reader:
            discovery_metadata = {
                key: _try_parse(value) for key, value in metadata_line.items()
            }

            if tag_columns:
                # all columns _tag_0 -> _tag_n are pushed to a "tags" column
                coalesced_tags = []
                for tag in map(discovery_metadata.pop, tag_columns):
                    if tag == "":
                        continue
                    # split on the first ":" only so that a tag name which
                    # itself contains a colon is kept whole
                    tag_category, tag_name = tag.split(":", 1)
                    coalesced_tags.append(
                        {
                            "name": tag_name.strip(),
                            "category": tag_category.strip(),
                        }
                    )
                discovery_metadata["tags"] = coalesced_tags

            guid = discovery_metadata.pop("guid")

            if omit_empty_values:
                discovery_metadata = {
                    key: value
                    for key, value in discovery_metadata.items()
                    if value not in ["", [], {}]
                }

            metadata = {
                "_guid_type": "discovery_metadata",
                "gen3_discovery": discovery_metadata,
            }

            pending_requests.append(mds.async_create(guid, metadata, overwrite=True))
            if len(pending_requests) == MAX_CONCURRENT_REQUESTS:
                await asyncio.gather(*pending_requests)
                pending_requests = []

        # flush whatever is left over from the last partial batch
        await asyncio.gather(*pending_requests)