def _patch_client_for_direct_access(client: DSSClient):
    """
    Monkey-patch ``client`` so that ``get_file`` and ``get_bundle.paginate``
    first try a ``MiniDSS`` instance and only fall back to the client's
    original methods when that attempt raises.

    The patch is applied in place on the given instance; nothing is returned.

    :param client: The DSS client instance whose ``get_file`` and
                   ``get_bundle`` attributes are replaced.
    """
    # Keep references to the original callables so the replacements can
    # delegate to them when the direct route fails.
    old_get_file = client.get_file
    old_get_bundle = client.get_bundle
    mini_dss = MiniDSS(config.dss_endpoint)

    def new_get_file(self, uuid, replica, version=None):
        # The function is bound to `client` below, so `self` must be that
        # very instance.
        assert client is self
        try:
            blob = mini_dss.get_file(uuid, version, replica)
        except Exception:
            # Record the traceback: without it the reason for the fallback
            # is lost and direct-access problems cannot be diagnosed.
            logger.warning('Failed getting file %s, version %s directly. '
                           'Falling back to official method', uuid, version,
                           exc_info=True)
            return old_get_file(uuid=uuid, version=version, replica=replica)
        else:
            return blob

    class NewGetBundle:
        """Stand-in for ``client.get_bundle`` that only offers ``paginate``."""

        def paginate(self, *args, **kwargs):
            # `uuid`, `version` and `replica` must all be passed by keyword;
            # a missing one raises KeyError before any request is attempted.
            uuid, version, replica = kwargs['uuid'], kwargs['version'], kwargs['replica']
            try:
                bundle = mini_dss.get_bundle(uuid, version, replica)
            except Exception:
                logger.warning('Failed getting bundle file %s, version %s directly. '
                               'Falling back to official method', uuid, version,
                               exc_info=True)
                return old_get_bundle.paginate(*args, **kwargs)
            else:
                # Return a single page so callers can iterate over the result
                # as they would over the original paginator's output.
                page = {'bundle': bundle, 'version': version, 'uuid': uuid}
                return [page]

    new_get_bundle = NewGetBundle()
    client.get_file = types.MethodType(new_get_file, client)
    client.get_bundle = new_get_bundle
class DataStoreAgent:
    """
    Convenience wrapper around a ``DSSClient`` configured for one deployment.

    The Swagger URL is derived from the deployment name; ``"prod"`` uses a
    dedicated URL without a deployment component.
    """

    DSS_SWAGGER_URL_TEMPLATE = "https://dss.{deployment}.data.humancellatlas.org/v1/swagger.json"
    DSS_PROD_SWAGGER_URL = "https://dss.data.humancellatlas.org/v1/swagger.json"

    def __init__(self, deployment):
        """Create a client for ``deployment`` (e.g. ``"prod"`` or any other name)."""
        self.deployment = deployment
        swagger_url = (
            self.DSS_PROD_SWAGGER_URL
            if self.deployment == "prod"
            else self.DSS_SWAGGER_URL_TEMPLATE.format(deployment=deployment)
        )
        self.client = DSSClient(swagger_url=swagger_url)

    def search(self, query, replica='aws'):
        """
        Run ``query`` against the given replica and return the ``results``
        entry of the response, or an empty list if the client raises
        ``SwaggerAPIException``.
        """
        try:
            return self.client.post_search(replica=replica, es_query=query)['results']
        except SwaggerAPIException:
            return []

    def search_iterate(self, query, replica='aws'):
        """Yield every hit produced by the client's ``post_search.iterate``."""
        yield from self.client.post_search.iterate(replica=replica, es_query=query)

    def download_bundle(self, bundle_uuid, target_folder):
        """
        Download every file listed in the bundle's manifest into
        ``<target_folder>/<bundle_uuid>`` and return that folder's path.
        """
        Progress.report(f"Downloading bundle {bundle_uuid}:\n")
        manifest = self.bundle_manifest(bundle_uuid)
        bundle_folder = os.path.join(target_folder, bundle_uuid)
        try:
            os.makedirs(bundle_folder)
        except FileExistsError:
            # An already existing folder is reused as is.
            pass
        for entry in manifest['bundle']['files']:
            file_uuid = entry['uuid']
            destination = os.path.join(bundle_folder, entry['name'])
            self.download_file(file_uuid, save_as=destination)
        return bundle_folder

    def bundle_manifest(self, bundle_uuid, replica='aws'):
        """Return the client's ``get_bundle`` response for ``bundle_uuid``."""
        return self.client.get_bundle(replica=replica, uuid=bundle_uuid)

    def download_file(self, file_uuid, save_as, replica='aws'):
        """Stream the file ``file_uuid`` to the local path ``save_as`` in 1024-byte reads."""
        Progress.report(f"Downloading file {file_uuid} to {save_as}\n")
        with self.client.get_file.stream(replica=replica, uuid=file_uuid) as fh, \
                open(save_as, "wb") as out:
            # Copy until the raw stream returns an empty (falsy) chunk.
            chunk = fh.raw.read(1024)
            while chunk:
                out.write(chunk)
                chunk = fh.raw.read(1024)

    def tombstone_bundle(self, bundle_uuid, replica='aws'):
        """Ask the client to delete ``bundle_uuid``, giving a fixed reason string."""
        self.client.delete_bundle(replica=replica,
                                  uuid=bundle_uuid,
                                  reason="DCP-wide integration test")
from hca import HCAConfig
from hca.dss import DSSClient

# Point the client at the Swagger definition of the `dev` DSS deployment.
hca_config = HCAConfig()
hca_config['DSSClient'].swagger_url = 'https://dss.dev.data.humancellatlas.org/v1/swagger.json'
dss = DSSClient(config=hca_config)

# Walk the search results and print the first bundle that can actually be
# fetched, together with the FQID of each of its files, then stop.
for hit in dss.post_search.iterate(replica='aws', es_query={}):
    uuid, version = hit['bundle_fqid'].split('.', 1)
    try:
        files = dss.get_bundle(replica='aws', uuid=uuid, version=version)['bundle']['files']
        lines = [f'Bundle: {uuid}.{version}']
        lines.extend(f' File: {entry["uuid"]}.{entry["version"]}' for entry in files)
    except Exception:
        # Presumably the bundle no longer exists; silently try the next hit.
        # `Exception` (rather than a bare `except`) keeps Ctrl-C and
        # SystemExit working while the loop is running.
        continue
    else:
        print('\n'.join(lines))
        break
def fetch_bundle():
    """
    Fetch one fixed bundle version from the ``aws`` replica using a freshly
    constructed ``DSSClient`` and return the client's response.
    """
    request = {
        "replica": "aws",
        "uuid": "002aeac5-4d74-462d-baea-88f5c620cb50",
        "version": "2019-08-01T200147.836900Z",
    }
    return DSSClient().get_bundle(**request)
def download_bundle_metadata(client: DSSClient,
                             replica: str,
                             uuid: str,
                             version: Optional[str] = None,
                             directurls: bool = False,
                             presignedurls: bool = False,
                             num_workers: Optional[int] = None) -> Tuple[str, List[JSON], JSON]:
    """
    Download the metadata for a given bundle from the HCA data store (DSS).

    :param client: A DSS API client instance

    :param replica: The name of the DSS replica to use. It is used both for
                    fetching the bundle manifest and for fetching the
                    individual metadata files.

    :param uuid: The UUID of the bundle in DSS

    :param version: The version of the bundle. If None, the most recent
                    version of the bundle will be downloaded.

    :param directurls: Whether to include direct-access URLs in the response.
                       This is mutually exclusive with the presignedurls
                       parameter. Note: including `directurls` and
                       `presignedurls` in the function call will cause the DSS
                       to copy metadata and data files in the bundle to another
                       bucket first. That could be time-consuming and/or
                       inefficient for users who only want to work with the
                       metadata instead of the files. It is very likely
                       `directurls` and `presignedurls` will be removed or
                       changed in the future.

    :param presignedurls: Whether to include presigned URLs in the response.
                          This is mutually exclusive with the directurls
                          parameter. Like `directurls`, this is a temporary
                          parameter and is not guaranteed to remain available
                          in the future.

    :param num_workers: The size of the thread pool to use for downloading
                        metadata files in parallel. If None, the default pool
                        size will be used, typically a small multiple of the
                        number of cores on the system executing this function.
                        If 0, no thread pool will be used and all files will be
                        downloaded sequentially by the current thread.

    :return: A tuple consisting of the version of the downloaded bundle, a
             list of the manifest entries for all files in the bundle (data
             and metadata) and a dictionary mapping the file name of each
             metadata file in the bundle (the manifest entries flagged as
             ``indexed``) to the JSON contents of that file.
    """
    if directurls or presignedurls:
        logger.warning("PendingDeprecationWarning: `directurls` and `presignedurls` are temporary parameters and not"
                       " guaranteed to stay in the code base in the future!")

    logger.debug("Getting bundle %s.%s from DSS.", uuid, version)
    # noinspection PyUnresolvedReferences
    response = client.get_bundle(uuid=uuid,
                                 version=version,
                                 replica=replica,
                                 directurls=directurls,
                                 presignedurls=presignedurls)
    bundle = response['bundle']
    manifest = bundle['files']
    # Only manifest entries flagged as indexed are treated as metadata files.
    metadata_entries = {f["name"]: f for f in manifest if f["indexed"]}

    def download_file(item):
        file_name, manifest_entry = item
        file_uuid = manifest_entry['uuid']
        file_version = manifest_entry['version']
        logger.debug("Getting file '%s' (%s.%s) from DSS.", file_name, file_uuid, file_version)
        # Fetch from the replica the caller asked for, so the files come from
        # the same replica as the bundle manifest.
        # noinspection PyUnresolvedReferences
        return file_name, client.get_file(uuid=file_uuid, version=file_version, replica=replica)

    if num_workers == 0:
        metadata_files = dict(map(download_file, metadata_entries.items()))
    else:
        with ThreadPoolExecutor(num_workers) as tpe:
            metadata_files = dict(tpe.map(download_file, metadata_entries.items()))

    return bundle['version'], manifest, metadata_files