def test_bad_content_type(self):
    """
    A metadata file whose content type is not 'application/json' must
    cause download_bundle_metadata to raise NotImplementedError.
    """
    bundle_uuid = 'bad1bad1-bad1-bad1-bad1-bad1bad1bad1'
    file_uuid = 'b2216048-7eaa-45f4-8077-5a3fb4204953'
    file_version = '2018-09-20T232924.687620Z'
    client = self._mock_get_bundle(file_uuid=file_uuid,
                                   file_version=file_version,
                                   content_type='bad')
    with self.assertRaises(NotImplementedError) as cm:
        # noinspection PyTypeChecker
        download_bundle_metadata(client, 'aws', bundle_uuid)
    expected_message = (f"Expecting file {file_uuid}.{file_version} "
                        "to have content type 'application/json', not 'bad'")
    self.assertEqual(expected_message, cm.exception.args[0])
def test_bad_content(self):
    """
    A metadata file whose content does not parse to a JSON object must
    cause download_bundle_metadata to raise TypeError.
    """
    bundle_uuid = 'bad1bad1-bad1-bad1-bad1-bad1bad1bad1'
    file_uuid = 'b2216048-7eaa-45f4-8077-5a3fb4204953'
    file_version = '2018-09-20T232924.687620Z'
    client = self._mock_get_bundle(file_uuid=file_uuid,
                                   file_version=file_version,
                                   content_type='application/json')
    # Return raw bytes where a parsed JSON object (dict) is expected
    client.get_file.return_value = b'{}'
    with self.assertRaises(TypeError) as cm:
        download_bundle_metadata(client, 'aws', bundle_uuid)
    pattern = ("Expecting file .* to contain a JSON object "
               + re.escape("(<class 'dict'>), not <class 'bytes'>"))
    self.assertRegex(cm.exception.args[0], pattern)
def test_large_bundle(self):
    """
    A bundle with a large number of files should be downloaded completely.
    """
    client = dss_client('prod')
    _, manifest, _ = download_bundle_metadata(
        client=client,
        replica='aws',
        uuid='82164816-64d4-4975-a248-b66c4fdad6f8',
        version='2019-09-26T054644.254919Z')
    self.assertEqual(len(manifest), 755)
def _test_dss_client(self, direct: bool, query: JSON, dss_client: DSSClient, replica: str, fallback: bool):
    """
    Download the metadata for one bundle found via the given search query
    and assert, based on the log calls captured from ``azul.dss``, that the
    client took the expected access path (direct storage access vs. REST
    API, with or without fallback).

    :param direct: whether `dss_client` is expected to attempt direct
                   storage access
    :param query: Elasticsearch query used to find a bundle to download
    :param dss_client: the client under test
    :param replica: the DSS replica ('aws' or 'gcp') to read from
    :param fallback: whether direct access is expected to fail and fall
                     back to the REST API
    """
    with self.subTest(direct=direct, replica=replica, fallback=fallback):
        response = dss_client.post_search(es_query=query, replica=replica, per_page=10)
        # A bundle FQID has the form '<uuid>.<version>'
        bundle_uuid, _, bundle_version = response['results'][0]['bundle_fqid'].partition('.')
        # Capture the calls made to the azul.dss logger so we can verify
        # which access path the client actually took
        with mock.patch('azul.dss.logger') as captured_log:
            _, manifest, metadata = download_bundle_metadata(client=dss_client,
                                                             replica=replica,
                                                             uuid=bundle_uuid,
                                                             version=bundle_version,
                                                             num_workers=config.num_dss_workers)
        log.info('Captured log calls: %r', captured_log.mock_calls)
        self.assertGreater(len(metadata), 0)
        # assertGreater on sets checks for a proper superset: every
        # metadata file appears in the manifest, and the manifest lists at
        # least one additional (non-metadata) file
        self.assertGreater(set(f['name'] for f in manifest), set(metadata.keys()))
        for f in manifest:
            self.assertIn('s3_etag', f)
        # Extract the log method name and the first three words of log
        # message logged. Note that the PyCharm debugger will call
        # certain dunder methods on the variable, leading to failed
        # assertions.
        actual = [(m, ' '.join(re.split(r'[\s,]', a[0])[:3])) for m, a, k in captured_log.mock_calls]
        if direct:
            if replica == 'aws':
                if fallback:
                    # Direct access is attempted but fails, once for the
                    # bundle and once per metadata file
                    expected = [('debug', 'Loading bundle %s'),
                                ('debug', 'Loading object %s'),
                                ('warning', 'Error accessing bundle'),
                                ('warning', 'Failed getting bundle')] + [
                                   ('debug', 'Loading file %s'),
                                   ('debug', 'Loading object %s'),
                                   ('warning', 'Error accessing file'),
                                   ('warning', 'Failed getting file')
                               ] * len(metadata)
                else:
                    # Direct access succeeds: one object load for the
                    # bundle, and two per metadata file
                    expected = [('debug', 'Loading bundle %s'),
                                ('debug', 'Loading object %s')] + [
                                   ('debug', 'Loading file %s'),
                                   ('debug', 'Loading object %s'),  # file
                                   ('debug', 'Loading object %s')  # blob
                               ] * len(metadata)
            else:
                # On `gcp` the precondition check fails right away, preventing any attempts of direct access
                expected = [
                               ('warning', 'Failed getting bundle')
                           ] + [('warning', 'Failed getting file')] * len(metadata)
        else:
            # A non-direct client should not emit any azul.dss log calls
            expected = []
        self.assertSequenceEqual(sorted(expected), sorted(actual))
def _load_bundle(self, uuid, version, replica='aws', deployment='prod'):
    """
    Load the specified canned bundle, downloading it first if not
    previously canned.
    """
    manifest, metadata_files = self._canned_bundle(deployment, uuid, version)
    if manifest is None:  # pragma: no cover
        # Not canned yet: download from the DSS, can it, then reload the can
        client = dss_client(deployment)
        downloaded_version, manifest, metadata_files = download_bundle_metadata(
            client, replica, uuid, version)
        assert downloaded_version == version
        self._can_bundle(os.path.join(deployment), uuid, version,
                         manifest, metadata_files)
        manifest, metadata_files = self._canned_bundle(deployment, uuid, version)
    return manifest, metadata_files
def fetch_bundle(self, bundle_fqid: BundleFQID) -> Bundle:
    """
    Download the manifest and metadata for the given bundle FQID and wrap
    them in a DSSBundle, logging how long the download took.
    """
    start = time.time()
    # One client per invocation. That's OK because the client will be used
    # for many requests and a typical lambda invocation calls this only once.
    client = direct_access_client(num_workers=config.num_dss_workers)
    version, manifest, metadata_files = download_bundle_metadata(
        client=client,
        replica='aws',
        uuid=bundle_fqid.uuid,
        version=bundle_fqid.version,
        num_workers=config.num_dss_workers)
    bundle = DSSBundle.for_fqid(
        bundle_fqid,
        # FIXME: remove need for cast by fixing declaration in metadata API
        # https://github.com/DataBiosphere/hca-metadata-api/issues/13
        manifest=cast(MutableJSONs, manifest),
        metadata_files=cast(MutableJSON, metadata_files))
    assert version == bundle.version
    log.info("It took %.003fs to download bundle %s.%s",
             time.time() - start, bundle.uuid, bundle.version)
    return bundle
def get_bundle_metadata(uuid, version, dss_url, directurls=False):
    """Factory function to create a `humancellatlas.data.metadata.Bundle`
    object from bundle information and manifest.

    Args:
        uuid (str): The bundle uuid.
        version (str): The bundle version.
        dss_url (str): URL of the Data Storage System to query.
        directurls (bool): Whether to request direct URLs for the bundle's
            files (default: False).

    Returns:
        humancellatlas.data.metadata.Bundle: A bundle metadata object.
    """
    # The deployment stage is the second dot-separated component of the DSS
    # host name, e.g. 'dss.staging.data.humancellatlas.org' -> 'staging'
    dss_deployment = dss_url.split('.')[1]
    if dss_deployment not in ('dev', 'integration', 'staging'):
        # dss_client constructor defaults to the production deployment
        client = dss_client()
    else:
        client = dss_client(deployment=dss_deployment)
    version, manifest, metadata_files = download_bundle_metadata(
        client=client,
        replica='gcp',
        uuid=uuid,
        version=version,
        directurls=directurls)
    return Bundle(uuid=uuid,
                  version=version,
                  manifest=manifest,
                  metadata_files=metadata_files)
def to_json(fqid):
    """
    Download the bundle identified by the given FQID ('<uuid>.<version>')
    and return its JSON rendering via the metadata API.
    """
    uuid, _, version = fqid.partition('.')
    version, manifest, metadata_files = download_bundle_metadata(
        client, 'aws', uuid, version, num_workers=0)
    return as_json(Bundle(uuid, version, manifest, metadata_files))
def main(argv):
    """
    Parse the command line, download the specified bundle from the DSS and
    write its manifest, metadata and, optionally, the metadata API's JSON
    rendering of it to the output directory.

    :param argv: command line arguments, excluding the program name
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--dss-url', '-u',
                        default=config.dss_endpoint,
                        help='The URL of the DSS REST API endpoint from which to download '
                             'the bundle to be canned (default: %(default)s).')
    parser.add_argument('--replica', '-r',
                        default='aws',
                        # Fixed typo: 'donwload' -> 'download'
                        help='The replica from which to download the bundle to be canned '
                             '(default: %(default)s).')
    parser.add_argument('--uuid', '-b',
                        required=True,
                        help='The UUID of the bundle to can.')
    parser.add_argument('--version', '-v',
                        help='The version of the bundle to can (default: the latest version).')
    parser.add_argument('--output-dir', '-O',
                        default=os.path.join(config.project_root, 'test', 'indexer', 'data'),
                        help='The path to the output directory (default: %(default)s).')
    parser.add_argument('--api-json', '-A',
                        default=False,
                        action='store_true',
                        help="Dump the return value of metadata-api's as_json function "
                             "(default off).")
    args = parser.parse_args(argv)
    dss_client = azul.dss.direct_access_client(dss_endpoint=args.dss_url,
                                               num_workers=config.num_dss_workers)
    version, manifest, metadata_files = download_bundle_metadata(
        client=dss_client,
        replica=args.replica,
        uuid=args.uuid,
        version=args.version,
        num_workers=config.num_dss_workers)
    logger.info('Downloaded bundle %s version %s from replica %s.',
                args.uuid, version, args.replica)
    # Only compute the (potentially expensive) API rendering when requested
    api_json = as_json(Bundle(args.uuid, version, manifest, metadata_files)) \
        if args.api_json else None
    for obj, suffix in [(manifest, ".manifest.json"),
                        (metadata_files, '.metadata.json'),
                        *([(api_json, ".api.json")] if api_json else [])]:
        path = os.path.join(args.output_dir, args.uuid + suffix)
        # Atomic write so a failed run never leaves a truncated can behind
        with write_file_atomically(path) as f:
            json.dump(obj, f, indent=4)
        logger.info("Successfully wrote %s", path)
def test_one_bundle(self):
    """
    Download each of a set of known bundles via the metadata API and verify
    the structure of the resulting Bundle object: its project, root
    entities, sequencing inputs and sequencing outputs.
    """
    for deployment, replica, uuid, version, age_range in [
        # A v5 bundle
        (None, 'aws', 'b2216048-7eaa-45f4-8077-5a3fb4204953', None,
         AgeRange(min=3628800, max=7257600)),
        # A vx bundle with a cell_suspension as sequencing input
        ('integration', 'aws', '1e276fdd-d885-4a18-b5b8-df33f1347c1a',
         '2018-08-03T082009.272868Z', None),
        # A vx bundle with a specimen_from_organism as sequencing input
        ('integration', 'aws', '17ef531b-1bb7-425d-bbf7-32721242dde7',
         '2018-08-17T203538.886280Z', None),
    ]:
        with self.subTest(deployment=deployment, replica=replica, uuid=uuid, age_range=age_range):
            # deployment=None selects the client's default (production) deployment
            client = dss_client(deployment)
            version, manifest, metadata_files = download_bundle_metadata(client, replica, uuid, version)
            bundle = Bundle(uuid, version, manifest, metadata_files)
            self.assertEqual(str(bundle.uuid), uuid)
            self.assertEqual(bundle.version, version)
            # Exactly one project, and it must be a Project instance
            self.assertEqual(1, len(bundle.projects))
            self.assertEqual({Project}, {type(e) for e in bundle.projects.values()})
            # All root entities of these bundles are donor organisms
            root_entities = bundle.root_entities().values()
            self.assertEqual({DonorOrganism}, {type(e) for e in root_entities})
            root_entity = next(iter(root_entities))
            self.assertRegex(root_entity.address, 'donor_organism@.*')
            self.assertIsInstance(root_entity, DonorOrganism)
            self.assertEqual(root_entity.organism_age_in_seconds, age_range)
            self.assertTrue(root_entity.sex in ('female', 'unknown'))
            sequencing_input = bundle.sequencing_input
            self.assertGreater(len(sequencing_input), 0,
                               "There should be at least one sequencing input")
            self.assertEqual(len(set(si.document_id for si in sequencing_input)),
                             len(sequencing_input),
                             "Sequencing inputs should be distinct entities")
            self.assertEqual(len(set(si.biomaterial_id for si in sequencing_input)),
                             len(sequencing_input),
                             "Sequencing inputs should have distinct biomaterial IDs")
            self.assertTrue(all(isinstance(si, Biomaterial) for si in sequencing_input),
                            "All sequencing inputs should be instances of Biomaterial")
            sequencing_input_schema_names = set(si.schema_name for si in sequencing_input)
            self.assertTrue({'cell_suspension', 'specimen_from_organism'}.issuperset(
                sequencing_input_schema_names),
                "The sequencing inputs in the test bundle are of specific schemas")
            sequencing_output = bundle.sequencing_output
            self.assertGreater(len(sequencing_output), 0,
                               "There should be at least one sequencing output")
            self.assertEqual(len(set(so.document_id for so in sequencing_output)),
                             len(sequencing_output),
                             "Sequencing outputs should be distinct entities")
            self.assertTrue(all(isinstance(so, SequenceFile) for so in sequencing_output),
                            "All sequencing outputs should be instances of SequenceFile")
            self.assertTrue(all(so.manifest_entry.name.endswith('.fastq.gz')
                                for so in sequencing_output),
                            "All sequencing outputs in the test bundle are fastq files.")
            # Dump the full JSON rendering for manual inspection of failures
            print(json.dumps(as_json(bundle), indent=4))
            self.assertEqual({SpecimenFromOrganism}, {type(s) for s in bundle.specimens})