Exemplo n.º 1
0
 def test_bad_content_type(self):
     """A metadata file served with a non-JSON content type must be rejected."""
     bundle_uuid = 'bad1bad1-bad1-bad1-bad1-bad1bad1bad1'
     file_uuid = 'b2216048-7eaa-45f4-8077-5a3fb4204953'
     file_version = '2018-09-20T232924.687620Z'
     client = self._mock_get_bundle(file_uuid=file_uuid,
                                    file_version=file_version,
                                    content_type='bad')
     # noinspection PyTypeChecker
     with self.assertRaises(NotImplementedError) as cm:
         download_bundle_metadata(client, 'aws', bundle_uuid)
     expected_message = (f"Expecting file {file_uuid}.{file_version} "
                         "to have content type 'application/json', not 'bad'")
     self.assertEqual(expected_message, cm.exception.args[0])
Exemplo n.º 2
0
 def test_bad_content(self):
     """A metadata file whose content is not a JSON object must raise TypeError."""
     bundle_uuid = 'bad1bad1-bad1-bad1-bad1-bad1bad1bad1'
     file_uuid = 'b2216048-7eaa-45f4-8077-5a3fb4204953'
     file_version = '2018-09-20T232924.687620Z'
     client = self._mock_get_bundle(file_uuid=file_uuid,
                                    file_version=file_version,
                                    content_type='application/json')
     # Mock returns raw bytes instead of a parsed JSON object
     client.get_file.return_value = b'{}'
     with self.assertRaises(TypeError) as cm:
         download_bundle_metadata(client, 'aws', bundle_uuid)
     pattern = ("Expecting file .* to contain a JSON object "
                + re.escape("(<class 'dict'>), not <class 'bytes'>"))
     self.assertRegex(cm.exception.args[0], pattern)
Exemplo n.º 3
0
 def test_large_bundle(self):
     """A known large production bundle downloads with a complete manifest."""
     result = download_bundle_metadata(
         client=dss_client('prod'),
         replica='aws',
         uuid='82164816-64d4-4975-a248-b66c4fdad6f8',
         version='2019-09-26T054644.254919Z')
     # Only the manifest (second element of the returned triple) is checked
     manifest = result[1]
     self.assertEqual(755, len(manifest))
Exemplo n.º 4
0
    def _test_dss_client(self, direct: bool, query: JSON,
                         dss_client: DSSClient, replica: str, fallback: bool):
        """
        Search the DSS for a bundle, download its metadata with the given
        client, and assert that the direct-access code emitted the expected
        sequence of log calls.

        :param direct: whether the client is expected to attempt direct
            (bucket-level) access
        :param query: the Elasticsearch query used to select a test bundle
        :param dss_client: the client under test
        :param replica: the DSS replica to use ('aws' or 'gcp')
        :param fallback: whether direct access is expected to fail and fall
            back to the DSS REST API
        """
        with self.subTest(direct=direct, replica=replica, fallback=fallback):
            response = dss_client.post_search(es_query=query,
                                              replica=replica,
                                              per_page=10)
            # A bundle FQID has the form '<uuid>.<version>'; split at the
            # first dot only, since the version itself contains dots
            bundle_uuid, _, bundle_version = response['results'][0][
                'bundle_fqid'].partition('.')
            with mock.patch('azul.dss.logger') as captured_log:
                _, manifest, metadata = download_bundle_metadata(
                    client=dss_client,
                    replica=replica,
                    uuid=bundle_uuid,
                    version=bundle_version,
                    num_workers=config.num_dss_workers)
            log.info('Captured log calls: %r', captured_log.mock_calls)
            self.assertGreater(len(metadata), 0)
            # Proper-superset check: the manifest must name every metadata
            # file plus at least one additional (non-metadata) file
            self.assertGreater(set(f['name'] for f in manifest),
                               set(metadata.keys()))
            for f in manifest:
                self.assertIn('s3_etag', f)
            # Extract the log method name and the first three words of log
            # message logged. Note that the PyCharm debugger will call
            # certain dunder methods on the variable, leading to failed
            # assertions.
            actual = [(m, ' '.join(re.split(r'[\s,]', a[0])[:3]))
                      for m, a, k in captured_log.mock_calls]
            if direct:
                if replica == 'aws':
                    if fallback:
                        # Direct access fails once for the bundle and once per
                        # file, each failure logged before the REST fallback
                        expected = [('debug', 'Loading bundle %s'),
                                    ('debug', 'Loading object %s'),
                                    ('warning', 'Error accessing bundle'),
                                    ('warning', 'Failed getting bundle')] + [
                                        ('debug', 'Loading file %s'),
                                        ('debug', 'Loading object %s'),
                                        ('warning', 'Error accessing file'),
                                        ('warning', 'Failed getting file')
                                    ] * len(metadata)
                    else:
                        expected = [('debug', 'Loading bundle %s'),
                                    ('debug', 'Loading object %s')] + [
                                        ('debug', 'Loading file %s'),
                                        ('debug', 'Loading object %s'),  # file
                                        ('debug', 'Loading object %s')  # blob
                                    ] * len(metadata)

                else:
                    # On `gcp` the precondition check fails right away, preventing any attempts of direct access
                    expected = [
                        ('warning', 'Failed getting bundle')
                    ] + [('warning', 'Failed getting file')] * len(metadata)
            else:
                expected = []
            # Compare sorted: with num_workers > 1 the download is concurrent,
            # so the relative order of log calls is not deterministic
            self.assertSequenceEqual(sorted(expected), sorted(actual))
Exemplo n.º 5
0
 def _load_bundle(self, uuid, version, replica='aws', deployment='prod'):
     """
     Return the manifest and metadata files of the specified canned bundle,
     downloading and canning the bundle first if it wasn't canned before.
     """
     manifest, metadata_files = self._canned_bundle(deployment, uuid,
                                                    version)
     if manifest is None:  # pragma: no cover
         # Cache miss: fetch the bundle from the DSS, can it, then reload
         # it from the can so the returned data went through the same path
         client = dss_client(deployment)
         actual_version, manifest, metadata_files = download_bundle_metadata(
             client, replica, uuid, version)
         assert actual_version == version
         self._can_bundle(os.path.join(deployment), uuid, version, manifest,
                          metadata_files)
         manifest, metadata_files = self._canned_bundle(
             deployment, uuid, version)
     return manifest, metadata_files
Exemplo n.º 6
0
 def fetch_bundle(self, bundle_fqid: BundleFQID) -> Bundle:
     """
     Download the manifest and metadata for the given bundle FQID and wrap
     them in a DSSBundle, logging how long the download took.
     """
     start = time.time()
     # One client per invocation. That's OK because the client will be used
     # for many requests and a typical lambda invocation calls this only once.
     dss_client = direct_access_client(num_workers=config.num_dss_workers)
     version, manifest, metadata_files = download_bundle_metadata(
         client=dss_client,
         replica='aws',
         uuid=bundle_fqid.uuid,
         version=bundle_fqid.version,
         num_workers=config.num_dss_workers)
     bundle = DSSBundle.for_fqid(
         bundle_fqid,
         # FIXME: remove need for cast by fixing declaration in metadata API
         #        https://github.com/DataBiosphere/hca-metadata-api/issues/13
         manifest=cast(MutableJSONs, manifest),
         metadata_files=cast(MutableJSON, metadata_files))
     assert version == bundle.version
     elapsed = time.time() - start
     log.info("It took %.003fs to download bundle %s.%s",
              elapsed, bundle.uuid, bundle.version)
     return bundle
Exemplo n.º 7
0
def get_bundle_metadata(uuid, version, dss_url, directurls=False):
    """Create a `humancellatlas.data.metadata.Bundle` object from bundle information and manifest.

    Args:
        uuid (str): The bundle uuid.
        version (str): The bundle version.
        dss_url (str): Url of Data Storage System to query.
        directurls (bool): Forwarded to `download_bundle_metadata`.

    Returns:
        humancellatlas.data.metadata.Bundle: A bundle metadata object.
    """
    # The deployment stage is the second dot-separated component of the host
    dss_deployment = dss_url.split('.')[1]
    if dss_deployment in ('dev', 'integration', 'staging'):
        client = dss_client(deployment=dss_deployment)
    else:
        # dss_client constructor defaults to the production deployment
        client = dss_client()
    version, manifest, metadata_files = download_bundle_metadata(
        client=client, replica='gcp', uuid=uuid, version=version, directurls=directurls
    )
    return Bundle(
        uuid=uuid, version=version, manifest=manifest, metadata_files=metadata_files
    )
Exemplo n.º 8
0
 def to_json(fqid):
     """Download the bundle identified by ``fqid`` and render it as JSON."""
     # FQIDs are '<uuid>.<version>'; the version itself contains dots, so
     # split at the first dot only
     bundle_uuid, _, bundle_version = fqid.partition('.')
     version, manifest, metadata_files = download_bundle_metadata(
         client, 'aws', bundle_uuid, bundle_version, num_workers=0)
     return as_json(Bundle(bundle_uuid, version, manifest, metadata_files))
Exemplo n.º 9
0
def main(argv):
    """Download ('can') a bundle from the DSS, writing its manifest and
    metadata (and optionally its metadata-API JSON form) to the output
    directory.

    Args:
        argv: Command-line arguments, excluding the program name.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--dss-url',
        '-u',
        default=config.dss_endpoint,
        help=
        'The URL of the DSS REST API endpoint from which to download the bundle to be canned '
        '(default: %(default)s).')
    parser.add_argument(
        '--replica',
        '-r',
        default='aws',
        # Fixed typo in help text: 'donwload' -> 'download'
        help=
        "The replica from which to download the bundle to be canned (default: %(default)s)."
    )
    parser.add_argument('--uuid',
                        '-b',
                        required=True,
                        help='The UUID of the bundle to can.')
    parser.add_argument(
        '--version',
        '-v',
        # Fixed doubled space before '(default: ...)'
        help='The version of the bundle to can (default: the latest version).'
    )
    parser.add_argument(
        '--output-dir',
        '-O',
        default=os.path.join(config.project_root, 'test', 'indexer', 'data'),
        help='The path to the output directory (default: %(default)s).')
    parser.add_argument(
        '--api-json',
        '-A',
        default=False,
        action='store_true',
        help=
        "Dump the return value of metadata-api's as_json function (default off)."
    )
    args = parser.parse_args(argv)

    dss_client = azul.dss.direct_access_client(
        dss_endpoint=args.dss_url, num_workers=config.num_dss_workers)
    # A version of None asks the DSS for the latest version; the resolved
    # version is returned and used for logging and canning below
    version, manifest, metadata_files = download_bundle_metadata(
        client=dss_client,
        replica=args.replica,
        uuid=args.uuid,
        version=args.version,
        num_workers=config.num_dss_workers)
    logger.info('Downloaded bundle %s version %s from replica %s.', args.uuid,
                version, args.replica)

    # Optionally also dump the fully parsed metadata-API representation
    api_json = as_json(Bundle(args.uuid, version, manifest,
                              metadata_files)) if args.api_json else None

    for obj, suffix in [(manifest, ".manifest.json"),
                        (metadata_files, '.metadata.json'),
                        *([(api_json, ".api.json")] if api_json else [])]:
        path = os.path.join(args.output_dir, args.uuid + suffix)
        # write_file_atomically should ensure a failed run does not leave a
        # truncated file behind
        with write_file_atomically(path) as f:
            json.dump(obj, f, indent=4)
        logger.info("Successfully wrote %s", path)
Exemplo n.º 10
0
    def test_one_bundle(self):
        """
        Download a few known bundles and sanity-check the metadata-API object
        graph built from them: projects, root entities, and the sequencing
        inputs and outputs.
        """
        # Each tuple is (deployment, replica, uuid, version, expected donor
        # age range). A deployment of None selects the client's default.
        for deployment, replica, uuid, version, age_range in [
                # A v5 bundle
            (None, 'aws', 'b2216048-7eaa-45f4-8077-5a3fb4204953', None,
             AgeRange(min=3628800, max=7257600)),
                # A vx bundle with a cell_suspension as sequencing input
            ('integration', 'aws', '1e276fdd-d885-4a18-b5b8-df33f1347c1a',
             '2018-08-03T082009.272868Z', None),
                # A vx bundle with a specimen_from_organism as sequencing input
            ('integration', 'aws', '17ef531b-1bb7-425d-bbf7-32721242dde7',
             '2018-08-17T203538.886280Z', None),
        ]:
            with self.subTest(deployment=deployment,
                              replica=replica,
                              uuid=uuid,
                              age_range=age_range):
                client = dss_client(deployment)
                # NOTE(review): `version` is rebound here to the version
                # returned by the download — presumably the resolved version
                # when the canned one is None; confirm against the API
                version, manifest, metadata_files = download_bundle_metadata(
                    client, replica, uuid, version)
                bundle = Bundle(uuid, version, manifest, metadata_files)
                self.assertEqual(str(bundle.uuid), uuid)
                self.assertEqual(bundle.version, version)
                self.assertEqual(1, len(bundle.projects))
                self.assertEqual({Project},
                                 {type(e)
                                  for e in bundle.projects.values()})
                # All root entities of these test bundles are donor organisms
                root_entities = bundle.root_entities().values()
                self.assertEqual({DonorOrganism},
                                 {type(e)
                                  for e in root_entities})
                root_entity = next(iter(root_entities))
                self.assertRegex(root_entity.address, 'donor_organism@.*')
                self.assertIsInstance(root_entity, DonorOrganism)
                self.assertEqual(root_entity.organism_age_in_seconds,
                                 age_range)
                self.assertTrue(root_entity.sex in ('female', 'unknown'))

                sequencing_input = bundle.sequencing_input
                self.assertGreater(
                    len(sequencing_input), 0,
                    "There should be at least one sequencing input")
                self.assertEqual(
                    len(set(si.document_id for si in sequencing_input)),
                    len(sequencing_input),
                    "Sequencing inputs should be distinct entities")
                self.assertEqual(
                    len(set(si.biomaterial_id for si in sequencing_input)),
                    len(sequencing_input),
                    "Sequencing inputs should have distinct biomaterial IDs")
                self.assertTrue(
                    all(
                        isinstance(si, Biomaterial)
                        for si in sequencing_input),
                    "All sequencing inputs should be instances of Biomaterial")
                sequencing_input_schema_names = set(si.schema_name
                                                    for si in sequencing_input)
                self.assertTrue({
                    'cell_suspension', 'specimen_from_organism'
                }.issuperset(
                    sequencing_input_schema_names
                ), "The sequencing inputs in the test bundle are of specific schemas"
                                )

                sequencing_output = bundle.sequencing_output
                self.assertGreater(
                    len(sequencing_output), 0,
                    "There should be at least one sequencing output")
                self.assertEqual(
                    len(set(so.document_id for so in sequencing_output)),
                    len(sequencing_output),
                    "Sequencing outputs should be distinct entities")
                self.assertTrue(
                    all(
                        isinstance(so, SequenceFile)
                        for so in sequencing_output),
                    "All sequencing outputs should be instances of SequenceFile"
                )
                self.assertTrue(
                    all(
                        so.manifest_entry.name.endswith('.fastq.gz')
                        for so in sequencing_output),
                    "All sequencing outputs in the test bundle are fastq files."
                )

                # Dump the full bundle for manual inspection of failures
                print(json.dumps(as_json(bundle), indent=4))

                self.assertEqual({SpecimenFromOrganism},
                                 {type(s)
                                  for s in bundle.specimens})