def test_get_dataset_metadata(self):
    import pbtestdata
    md = get_dataset_metadata(pbtestdata.get_file("subreads-xml"))
    assert md.metatype == "PacBio.DataSet.SubreadSet"
    from pbcore.io import SubreadSet
    ds = SubreadSet(pbtestdata.get_file("subreads-xml"))
    assert md.uuid == ds.uuid
    with pytest.raises(Exception):
        get_dataset_metadata(None)
def dataset_to_datastore(dataset_file, datastore_file,
                         source_id="dataset_to_datastore"):
    """Copied from pbcoretools.tasks.barcoding"""
    # FIXME: replace barcoding
    dsmd = get_dataset_metadata(dataset_file)
    ds_file = DataStoreFile(dsmd.uuid, source_id, dsmd.metatype, dataset_file)
    ds_out = DataStore([ds_file])
    ds_out.write_json(datastore_file)
    return 0
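# A minimal usage sketch, not from the original source: the input and output
# paths and the helper name are hypothetical, assuming a valid dataset XML
# exists at "movie.subreadset.xml".
def example_dataset_to_datastore():
    # Wrap a single dataset XML in a one-entry DataStore JSON that records
    # the dataset's UUID and metatype.
    rcode = dataset_to_datastore("movie.subreadset.xml",
                                 "movie.datastore.json",
                                 source_id="example_source")
    assert rcode == 0  # the function returns 0 on success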
def gather_report(json_files, output_file, dataset_xml=None):
    """
    Combine statistics (usually raw counts) stored as JSON files.
    Data models: pbcommand.models.report
    """
    reports = [load_report_from_json(fn) for fn in json_files]
    merged = Report.merge(reports)
    if dataset_xml is not None:
        ds_md = get_dataset_metadata(dataset_xml)
        merged._dataset_uuids = [ds_md.uuid]
    with open(output_file, "w") as writer:
        writer.write(merged.to_json())
    return output_file
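# A minimal usage sketch, not from the original source: the report JSON and
# dataset XML paths below are hypothetical. Merging two per-chunk count
# reports and tagging the result with the dataset UUID would look like:
def example_gather_report():
    merged_path = gather_report(
        ["counts.chunk0.json", "counts.chunk1.json"],  # per-chunk reports
        "counts.merged.json",
        dataset_xml="movie.subreadset.xml")  # optional UUID provenance
    assert merged_path == "counts.merged.json"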
def test_get_dataset_metadata(self):
    try:
        import pbcore.io
        import pbcore.data
    except ImportError:
        raise unittest.SkipTest("pbcore not available, skipping")
    else:
        ds = pbcore.io.SubreadSet(pbcore.data.getUnalignedBam())
        ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        ds.write(ds_file)
        md = get_dataset_metadata(ds_file)
        self.assertEqual(md.metatype, "PacBio.DataSet.SubreadSet")
        self.assertEqual(md.uuid, ds.uuid)
def test_get_dataset_metadata(self):
    try:
        import pbtestdata
    except ImportError:
        raise unittest.SkipTest("pbtestdata not available, skipping")
    else:
        md = get_dataset_metadata(pbtestdata.get_file("subreads-xml"))
        self.assertEqual(md.metatype, "PacBio.DataSet.SubreadSet")
        try:
            from pbcore.io import SubreadSet
        except ImportError:
            raise unittest.SkipTest("pbcore not available, skipping")
        else:
            ds = SubreadSet(pbtestdata.get_file("subreads-xml"))
            self.assertEqual(md.uuid, ds.uuid)
def run_import_local_dataset(self, path):
    """Import a dataset file from a filesystem local to where the
    services are running.

    Returns a JobResult instance

    :rtype: JobResult
    """
    dataset_meta_type = get_dataset_metadata(path)
    result = self.get_dataset_by_uuid(dataset_meta_type.uuid)
    if result is None:
        log.info("Importing dataset {p}".format(p=path))
        return self.run_import_dataset_by_type(dataset_meta_type.metatype,
                                               path)
    else:
        log.debug("{f} already imported. Skipping import. {r}".format(
            r=result, f=dataset_meta_type.metatype))
        # need to clean this up
        return JobResult(self.get_job_by_id(result['jobId']), 0, "")
def run_import_local_dataset(self, path):
    """Import a dataset file from a filesystem local to where the
    services are running.

    Returns a JobResult instance

    :rtype: JobResult
    """
    dataset_meta_type = get_dataset_metadata(path)

    def _verify_dataset_in_list():
        file_type = FileTypes.ALL()[dataset_meta_type.metatype]
        ds_endpoint = _get_endpoint_or_raise(file_type)
        # all datasets for a specific type
        datasets = self._get_datasets_by_type(ds_endpoint)
        uuids = {ds['uuid'] for ds in datasets}
        if dataset_meta_type.uuid not in uuids:
            raise JobExeError(("Dataset {u} was imported but does not " +
                               "appear in the dataset list; this may " +
                               "indicate XML schema errors.").format(
                                   u=dataset_meta_type.uuid))

    result = self.get_dataset_by_uuid(dataset_meta_type.uuid,
                                      ignore_errors=True)
    if result is None:
        log.info("Importing dataset {p}".format(p=path))
        job_result = self.run_import_dataset_by_type(
            dataset_meta_type.metatype, path)
        log.info("Confirming database update")
        # validation 1: attempt to retrieve dataset info
        result_new = self.get_dataset_by_uuid(dataset_meta_type.uuid)
        if result_new is None:
            raise JobExeError(("Dataset {u} was imported but could " +
                               "not be retrieved; this may indicate " +
                               "XML schema errors.").format(
                                   u=dataset_meta_type.uuid))
        # validation 2: make sure it shows up in the listing
        _verify_dataset_in_list()
        return job_result
    else:
        log.info("{f} already imported. Skipping import. {r}".format(
            r=result, f=dataset_meta_type.metatype))
        _verify_dataset_in_list()
        # need to clean this up
        return JobResult(self.get_job_by_id(result['jobId']), 0, "")
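# A minimal usage sketch, not from the original source. The client object and
# dataset path are hypothetical placeholders for whatever service-access
# class defines run_import_local_dataset; the call is effectively idempotent,
# since an already-imported UUID short-circuits to the existing import job.
def example_import(client, path="movie.subreadset.xml"):
    job_result = client.run_import_local_dataset(path)
    # JobResult wraps the underlying import job plus an exit code and message.
    return job_result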