def test_integration(self):
    """End-to-end check of make_trimmed_dataset on barcoded CCS data."""
    def _tmp_path(suffix):
        # NamedTemporaryFile is discarded immediately; we only want a
        # unique path for the tool to write to.
        return tempfile.NamedTemporaryFile(suffix=suffix).name

    barcoded = pbtestdata.get_file("ccs-barcoded")
    datastore_json = _tmp_path(".datastore.json")
    lima_xml = _tmp_path(".consensusreadset.xml")
    input_xml = _tmp_path(".consensusreadset.xml")
    # Write two variants of the same dataset: the filtered input and the
    # lima output referenced by the datastore.
    with ConsensusReadSet(barcoded) as tmp_ds:
        tmp_ds.name = "My Data (filtered)"
        tmp_ds.tags = "ccs,filtered"
        tmp_ds.write(input_xml)
        tmp_ds.name = "lima out"
        tmp_ds.write(lima_xml)
    store = DataStore([
        DataStoreFile(uuid.uuid4(), "lima", FileTypes.DS_CCS.file_type_id,
                      lima_xml)
    ])
    store.write_json(datastore_json)
    self._check_call([
        "python3", "-m", "pbcoretools.tasks.make_trimmed_dataset",
        datastore_json, input_xml
    ])
    # The tool writes its output to the current working directory.
    with ConsensusReadSet("trimmed.consensusreadset.xml",
                          trustCounts=True) as result:
        assert result.numRecords > 0
        assert result.name == "My Data (trimmed)"
        assert result.tags == "ccs"
def run_args(args):
    """Merge the datastore's CCS files into one trimmed dataset.

    Reads the lima output paths from ``args.datastore``, combines them
    into a single ConsensusReadSet named after ``args.ccs_in`` (with
    " (filtered)" replaced by " (trimmed)"), and writes the result to
    ``trimmed.consensusreadset.xml`` in the working directory.
    """
    datastore = DataStore.load_from_json(os.path.realpath(args.datastore))
    input_ds = ConsensusReadSet(args.ccs_in, trustCounts=True)
    trimmed_paths = [f.path for f in datastore.files.values()]
    trimmed = ConsensusReadSet(*trimmed_paths, trustCounts=True)
    sanitize_dataset_tags(trimmed, remove_hidden=True)
    base_name = input_ds.name.replace(" (filtered)", "")
    trimmed.name = base_name + " (trimmed)"
    # Drop sub-datasets inherited from the inputs
    trimmed.subdatasets = []
    trimmed.write("trimmed.consensusreadset.xml")
    return 0
def test_ccs_barcodes_table_asymmetric(self):
    """Barcode table should list asymmetric (forward--reverse) pairs."""
    ds_path = op.join(ROOT_DATA_DIR, "ccs", "asym_barcodes",
                      "ccs.consensusreadset.xml")
    report = to_report(ConsensusReadSet(ds_path), tempfile.mkdtemp())
    expected = ['F5--R5', 'F8--R8', 'F20--R20', 'F29--R29', 'F30--R30']
    self.assertEqual(report.tables[1].columns[0].values, expected)
def test_ccs_mulitple_movies_single_bam(self):
    """
    Check that the report doesn't crash when a single BAM file contains
    reads from multiple movies
    """
    dataset = ConsensusReadSet(self.CCS_BAM)
    # Only checking that report generation completes without raising
    to_report(dataset, tempfile.mkdtemp())
def test_ccs_barcodes_table(self):
    """Verify barcode table contents for a symmetric barcoded dataset."""
    dataset = ConsensusReadSet(pbtestdata.get_file("ccs-barcoded"))
    report = to_report(dataset, tempfile.mkdtemp())
    barcode_table = report.tables[1]
    expected = [["lbc1--lbc1", "lbc3--lbc3"], [1, 1], [1958, 1954],
                [1958, 1954]]
    self.assertEqual([c.values for c in barcode_table.columns[0:4]], expected)
    # Fifth column holds per-barcode quality; compare to 4 decimal places
    quality = barcode_table.columns[4].values
    self.assertAlmostEqual(quality[0], 0.9724, places=4)
    self.assertAlmostEqual(quality[1], 0.9926, places=4)
def test_get_bio_sample_name(self):
    """Check get_bio_sample_name for single, merged, and CCS datasets.

    The original version used bare ``==`` comparisons whose results were
    discarded, so the test never actually verified anything; these are
    now real assertions.
    """
    filename = pbtestdata.get_file("subreads-sequel")
    ds1 = SubreadSet(filename)
    assert get_bio_sample_name(ds1) == "Narwhale"
    filename = pbtestdata.get_file("subreads-biosample-2")
    ds2 = SubreadSet(filename)
    assert get_bio_sample_name(ds2) == "UnnamedSample"
    # Merging datasets with different bio samples yields a sentinel value
    ds3 = ds1 + ds2
    assert get_bio_sample_name(ds3) == "Multiple"
    filename = pbtestdata.get_file("rsii-ccs-multi-cell")
    ds4 = ConsensusReadSet(filename)
    assert get_bio_sample_name(ds4) == "Multiple"
    filename = pbtestdata.get_file("ccs-sequel")
    ds5 = ConsensusReadSet(filename)  # was rebound to ds4 in the original
    assert get_bio_sample_name(ds5) == "NarwhalCcs"
def run_dev_ccs_report(rtc):
    """Write a minimal JSON report summarizing a ConsensusReadSet.

    Reads the dataset from the first task input, refreshes its counts,
    and writes record-count and total-length attributes to the first
    task output.  Always returns 0.
    """
    from pbcore.io import ConsensusReadSet
    with ConsensusReadSet(rtc.task.input_files[0]) as dataset:
        # Ensure numRecords/totalLength reflect the underlying BAMs
        dataset.updateCounts()
        attributes = [
            Attribute("number_of_records", value=dataset.numRecords),
            Attribute("total_length", value=dataset.totalLength),
        ]
        report = Report("ccs_report",
                        title="ConsensusReadSet XML Report",
                        attributes=attributes)
        report.write_json(rtc.task.output_files[0])
    return 0
def add_bash5(self, filename):
    """Add a bas.h5/ccs.h5/ccs.bam to cacher.

    Dispatches on the filename suffix and registers a reader wrapper in
    ``self.bas_files`` keyed by movie name.  NOTE: the order of the
    ``elif`` branches matters — multi-part suffixes (e.g. '.1.ccs.h5')
    must be tested before their plain counterparts (e.g. '.ccs.h5').
    Raises IOError for unrecognized suffixes.
    """
    basename = os.path.basename(filename)
    if filename.endswith('.bax.h5'):
        # assumes the name is '<movie>.N.bax.h5' — strip 9 chars
        # ('.N.bax.h5') to recover the movie name; TODO confirm plain
        # '<movie>.bax.h5' inputs never occur
        movie = basename[:-9]
        if movie not in self.bas_files:
            self.bas_files[movie] = smrt_wrapper(filename[:-9], suffix='.bax.h5')
    elif filename.endswith('.1.ccs.h5') or \
            filename.endswith('.2.ccs.h5') or \
            filename.endswith('.3.ccs.h5'):
        # multi-part CCS h5: strip '.N.ccs.h5' (9 chars)
        movie = basename[:-9]
        if movie not in self.bas_files:
            self.bas_files[movie] = smrt_wrapper(filename[:-9])
    elif filename.endswith('.ccs.h5'):
        # a single .ccs.h5 (post 150k runs), treat the same as .bas.h5
        movie = basename[:-7]
        # every lookup key maps to this one file
        self.bas_files[movie] = defaultdict(lambda: filename)
    elif filename.endswith('.1.subreads.bam') or \
            filename.endswith('.2.subreads.bam') or \
            filename.endswith('.3.subreads.bam'):
        # multi-part subreads BAM: strip '.N.subreads.bam' (15 chars)
        movie = basename[:-15]
        if movie not in self.bas_files:
            self.bas_files[movie] = smrt_wrapper(filename[:-15])
    elif filename.endswith('subreads.bam'):
        # single-file subreads BAM is deliberately unsupported
        raise NotImplementedError(
            "%s add_bash5 *.subreads.bam not implemented." %
            (self.__class__.__name__))
    elif filename.endswith('.1.ccs.bam') or \
            filename.endswith('.2.ccs.bam') or \
            filename.endswith('.3.ccs.bam'):
        # multi-part CCS BAM: strip '.N.ccs.bam' (10 chars); note a
        # plain '<movie>.ccs.bam' falls through to the IOError below
        movie = basename[:-10]
        if movie not in self.bas_files:
            self.bas_files[movie] = smrt_wrapper(filename[:-10])
    elif filename.endswith('.bas.h5'):
        # strip '.bas.h5' (7 chars); one file serves all parts
        movie = basename[:-7]
        self.bas_files[movie] = defaultdict(lambda: filename)
    elif filename.endswith(".consensusreadset.xml"):
        # dataset XML may reference several BAMs/movies; register the
        # dataset wrapper under every movie found in the read groups
        ds = ConsensusReadSet(filename)
        for rr in ds.resourceReaders():
            for rg in rr.readGroupTable:
                self.bas_files[rg.MovieName] = dataset_wrapper(filename)
    else:
        raise IOError("Unsupported file format: %s" % filename)