def test_copy(self):
    ds1 = DataSet(data.getXml())
    ds2 = ds1.copy()
    self.assertFalse(ds1 == ds2)
    self.assertFalse(ds1.uuid == ds2.uuid)
    self.assertFalse(ds1 is ds2)
    self.assertTrue(ds1.name == ds2.name)
    self.assertTrue(ds1.externalResources == ds2.externalResources)
    # The name and UniqueId are different:
    self.assertFalse(ds1.objMetadata == ds2.objMetadata)
    self.assertTrue(ds1.filters == ds2.filters)
    self.assertTrue(ds1.subdatasets == ds2.subdatasets)
    self.assertTrue(len(ds1.subdatasets) == 2)
    self.assertTrue(len(ds2.subdatasets) == 2)
    assert not any(ds1d is ds2d
                   for ds1d in ds1.subdatasets
                   for ds2d in ds2.subdatasets)
    # TODO: once simulated files are indexable, turn on strict:
    ds1 = SubreadSet(data.getXml(no=10), strict=False)
    self.assertEqual(type(ds1.metadata).__name__, 'SubreadSetMetadata')
    ds2 = ds1.copy()
    self.assertEqual(type(ds2.metadata).__name__, 'SubreadSetMetadata')
    # Let's try casting
    ds1 = DataSet(data.getBam())
    self.assertEqual(type(ds1).__name__, 'DataSet')
    ds1 = ds1.copy(asType='SubreadSet')
    self.assertEqual(type(ds1).__name__, 'SubreadSet')
    # Let's do some illicit casting
    with self.assertRaises(TypeError):
        ds1 = ds1.copy(asType='ReferenceSet')
    # Let's try not having to cast
    ds1 = SubreadSet(data.getBam())
    self.assertEqual(type(ds1).__name__, 'SubreadSet')
def test_update_barcoded_sample_metadata(self):
    datastore_tmp = tempfile.NamedTemporaryFile(
        suffix=".datastore.json").name
    barcodes = pbtestdata.get_file("barcodeset")
    ds = split_barcoded_dataset(self.SUBREADS)
    ds.write_json(datastore_tmp)
    base_dir = tempfile.mkdtemp()
    datastore = update_barcoded_sample_metadata(base_dir, datastore_tmp,
                                                self.SUBREADS, barcodes)
    validate_barcoded_datastore_files(self, self.SUBREADS, datastore)
    # now with use_barcode_uuids=False
    datastore = update_barcoded_sample_metadata(base_dir, datastore_tmp,
                                                self.SUBREADS, barcodes,
                                                use_barcode_uuids=False)
    validate_barcoded_datastore_files(self, self.SUBREADS, datastore,
                                      use_barcode_uuids=False)
    # test that it doesn't break with no collection metadata
    ss = SubreadSet(self.SUBREADS)
    ss.metadata.collections = None
    ss_tmp = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ss.write(ss_tmp)
    ds = split_barcoded_dataset(ss_tmp)
    ds.write_json(datastore_tmp)
    base_dir = tempfile.mkdtemp()
    datastore = update_barcoded_sample_metadata(base_dir, datastore_tmp,
                                                self.SUBREADS, barcodes)
    validate_barcoded_datastore_files(self, self.SUBREADS, datastore,
                                      have_collection_metadata=False,
                                      number_of_expected_collections=0)
def test_discard_bio_samples(self):
    ds = SubreadSet(self.SUBREADS)
    discard_bio_samples(ds, "lbc1--lbc1")
    coll = ds.metadata.collections[0]
    bioSamples = coll.wellSample.bioSamples
    assert len(bioSamples) == 1
    assert bioSamples[0].name == "Alice"
    # No matching BioSample records
    ds = SubreadSet(self.SUBREADS)
    ds.metadata.collections[0].wellSample.bioSamples.pop(1)
    ds.metadata.collections[0].wellSample.bioSamples.pop(1)
    bioSample = ds.metadata.collections[0].wellSample.bioSamples[0]
    while len(bioSample.DNABarcodes) > 0:
        bioSample.DNABarcodes.pop(0)
    assert len(ds.metadata.collections[0].wellSample.bioSamples) == 1
    discard_bio_samples(ds, "lbc1--lbc1")
    assert len(ds.metadata.collections[0].wellSample.bioSamples) == 1
    assert ds.metadata.collections[0].wellSample.bioSamples[
        0].name == "lbc1--lbc1"
    assert ds.metadata.collections[0].wellSample.bioSamples[0].DNABarcodes[
        0].name == "lbc1--lbc1"
    # no BioSample records
    ds = SubreadSet(pbtestdata.get_file("subreads-sequel"))
    assert len(ds.metadata.collections[0].wellSample.bioSamples) == 0
    discard_bio_samples(ds, "lbc1--lbc1")
    assert len(ds.metadata.collections[0].wellSample.bioSamples) == 1
    assert ds.metadata.collections[0].wellSample.bioSamples[
        0].name == "lbc1--lbc1"
    assert ds.metadata.collections[0].wellSample.bioSamples[0].DNABarcodes[
        0].name == "lbc1--lbc1"
def test_get_barcode_sample_mappings(self):
    with SubreadSet(self._subreads) as ds:
        # just double-checking that the XML defines more samples than are
        # actually present in the BAM
        assert len(ds.metadata.collections[0].wellSample.bioSamples) == 3
    samples = get_barcode_sample_mappings(SubreadSet(self._subreads))
    assert samples == {'lbc3--lbc3': 'Charles', 'lbc1--lbc1': 'Alice'}
def setup_class(cls):
    bam_files = []
    with SubreadSet(pbtestdata.get_file("barcoded-subreadset")) as ds_in:
        for er in ds_in.externalResources:
            bam_files.append(er.bam)
    with SubreadSet(*bam_files, strict=True) as ds_out:
        ds_out.write(cls.INPUT_FILE)
def to_reports(subreads, output_dir):
    output_files = []
    log.info("Loading {f}".format(f=subreads))
    ds = SubreadSet(subreads)
    ds.loadStats()
    for base, module in [("filter_stats_xml", filter_stats_xml),
                         ("adapter_xml", adapter_xml),
                         ("loading_xml", loading_xml),
                         ("control", control)]:
        constants = getattr(module, "Constants")
        task_id = constants.TOOL_ID
        to_report = getattr(module, "to_report_impl")
        try:
            rpt_output_dir = os.path.join(output_dir, base)
            os.mkdir(rpt_output_dir)
            file_name = os.path.join(rpt_output_dir,
                                     "{b}.json".format(b=base))
            report = to_report(ds, rpt_output_dir)
            log.info("Writing {f}".format(f=file_name))
            report.write_json(file_name)
            output_files.append(DataStoreFile(
                uuid=report.uuid,
                source_id=task_id,
                type_id=FileTypes.REPORT.file_type_id,
                path=file_name,
                is_chunked=False,
                name=base))
        except InvalidStatsError as e:
            log.error("This dataset lacks some required statistics: %s", e)
            log.error("Skipping generation of {b} report".format(b=base))
    datastore = DataStore(output_files)
    return datastore
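# A minimal, hypothetical driver for to_reports(); the subreadset path and
# output directory are placeholders, and DataStore.write_json is assumed to
# behave as elsewhere in this codebase (see write_json usage above).
def example_to_reports():
    store = to_reports("movie.subreadset.xml", "reports_out")
    # persist the datastore manifest next to the per-report JSON files
    store.write_json("reports_out/datastore.json")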
def _generateSubreadSet(output_bam_file):
    sset = SubreadSet(output_bam_file, generateIndices=True)
    # strip the 12-character 'subreads.bam' suffix, keep the trailing dot,
    # and append the dataset XML suffix
    sset_output_name = output_bam_file[:-12] + 'subreadset.xml'
    sset.name = sset_output_name.split('.')[0]
    sset.write(sset_output_name)
    return sset_output_name
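# Usage sketch for _generateSubreadSet(); 'movie.subreads.bam' is a
# placeholder path, not a file shipped with this code.
def example_generate_subreadset():
    sset_xml = _generateSubreadSet("movie.subreads.bam")
    # the XML lands next to the BAM, named after the movie
    assert sset_xml == "movie.subreadset.xml"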
def test_subreadset_from_bam(self):
    # DONE control experiment for bug 28698
    bam = upstreamData.getUnalignedBam()
    ds1 = SubreadSet(bam, strict=False)
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    log.debug(fn)
    ds1.write(fn)
def test_de_novo(self):
    ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    log.info(ofn)
    ss = SubreadSet(data.getXml(10))
    col = CollectionMetadata()
    self.assertFalse(ss.metadata.collections)
    ss.metadata.collections.append(col)
    self.assertTrue(ss.metadata.collections)
    col.cellIndex = 1
    self.assertEqual(ss.metadata.collections[0].cellIndex, 1)
    col.instrumentName = "foo"
    self.assertEqual(ss.metadata.collections[0].instrumentName, "foo")
    col.context = 'bar'
    self.assertEqual(ss.metadata.collections[0].context, 'bar')
    ss.metadata.collections[0].runDetails.name = 'foo'
    self.assertEqual('foo', ss.metadata.collections[0].runDetails.name)
    ss.metadata.collections[0].wellSample.name = 'bar'
    self.assertEqual('bar', ss.metadata.collections[0].wellSample.name)
    ss.metadata.collections[0].wellSample.wellName = 'baz'
    self.assertEqual('baz', ss.metadata.collections[0].wellSample.wellName)
    ss.metadata.collections[0].wellSample.concentration = 'baz'
    self.assertEqual('baz',
                     ss.metadata.collections[0].wellSample.concentration)
    ss.write(ofn, validate=False)
def test_dataset_create_set_sample_names(self):
    sample_args = ("--well-sample-name WELLSAMPLE "
                   "--bio-sample-name BIOSAMPLE").split()
    outfile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    cmd = " ".join(["dataset", "create", "--force", outfile,
                    pbtestdata.get_file("subreads-bam")] + sample_args)
    self._run_cmd_with_output(cmd, outfile)
    with SubreadSet(outfile) as ds:
        assert len(ds.metadata.collections) == 1
        assert ds.metadata.collections[0].wellSample.name == "WELLSAMPLE"
        assert ds.metadata.collections[0].wellSample.bioSamples[
            0].name == "BIOSAMPLE"
        assert len(ds.metadata.collections[0].wellSample.bioSamples) == 1
    # now with existing samples
    outfile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    cmd = " ".join(["dataset", "create", "--force", outfile,
                    pbtestdata.get_file("barcoded-subreadset")] + sample_args)
    self._run_cmd_with_output(cmd, outfile)
    with SubreadSet(outfile) as ds:
        assert len(ds.metadata.collections) == 1
        assert ds.metadata.collections[0].wellSample.name == "WELLSAMPLE"
        biosamples = {s.name for s in
                      ds.metadata.collections[0].wellSample.bioSamples}
        assert biosamples == {"BIOSAMPLE"}
def test_subreadset_split_metadata_element_name(self):
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    log.debug(fn)
    sset = SubreadSet(data.getXml(10), data.getXml(13))
    chunks = sset.split(chunks=5, zmws=False, ignoreSubDatasets=True)
    self.assertEqual(len(chunks), 2)
    chunks[0].write(fn)
def split_dataset(subreadset, out_prefix):
    """
    Takes an input dataset, and for each entry generates one separate
    dataset file, while maintaining all the filters.
    Returns a list of the generated datasets.

    To create an example filtered dataset for testing:
    dataset create --type SubreadSet test.subreadset.xml subreads1.bam subreads2.bam
    dataset filter test.subreadset.xml test.filtered.subreadset.xml 'length>1000'
    """
    out_prefix_abs = os.path.abspath(out_prefix)
    dset = SubreadSet(subreadset, strict=True, skipCounts=True)
    fns = dset.toFofn()
    log.info('resources in {!r}:\n{}'.format(subreadset, '\n'.join(fns)))
    split_fns = []
    for i, bam_fn in enumerate(fns):
        out_fn = '{}.{:05}.subreadset.xml'.format(out_prefix_abs, i)
        new_dataset = SubreadSet(bam_fn, skipCounts=True)
        new_dataset.newUuid()
        new_dataset._filters = copy.deepcopy(dset._filters)
        new_dataset.write(out_fn)
        split_fns.append(out_fn)
    return split_fns
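# Usage sketch for split_dataset(); the paths are placeholders following
# the docstring's example above.
def example_split_dataset():
    xml_fns = split_dataset("test.filtered.subreadset.xml", "chunks/out")
    for fn in xml_fns:
        log.info("wrote %s", fn)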
def test_loadMetadata(self):
    aln = AlignmentSet(data.getXml(7))
    assert not aln.metadata.collections
    aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                     'SA3-Sequel/lambda/roche_SAT/'
                     'm54013_151205_032353.run.metadata.xml')
    assert aln.metadata.collections
    sset_fn = ('/pbi/dept/secondary/siv/testdata/'
               'SA3-Sequel/lambda/roche_SAT/'
               'm54013_151205_032353.subreadset.xml')
    sset = SubreadSet(sset_fn)
    orig_metadata = copy.deepcopy(sset.metadata)
    sset.metadata.collections = None
    assert not sset.metadata.collections
    sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                      'SA3-Sequel/lambda/roche_SAT/'
                      'm54013_151205_032353.run.metadata.xml')
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    sset.write(fn)
    validateFile(fn)
    validateFile(sset_fn)
    assert sset.metadata == orig_metadata
    # load the wrong thing...
    sset_fn = ('/pbi/dept/secondary/siv/testdata/'
               'SA3-Sequel/lambda/roche_SAT/'
               'm54013_151205_032353.subreadset.xml')
    sset = SubreadSet(sset_fn)
    orig_metadata = copy.deepcopy(sset.metadata)
    sset.metadata.collections = None
    assert not sset.metadata.collections
    with pytest.raises(InvalidDataSetIOError):
        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                          'SA3-Sequel/lambda/roche_SAT/'
                          'm54013_151205_032353.sts.xml')
def get_sequencing_chemistry(entry_points, include_system_type=True):
    """
    Given a list of entry points (eid, path), extract the sequencing
    chemistry (and optionally system name) as a human-readable string.
    """
    chemistries = set()
    is_sequel = is_rsii = False
    for eid, path in entry_points:
        if eid == "eid_subread" and op.isfile(path):
            ds = SubreadSet(path)
            for bam in ds.resourceReaders():
                for rg in bam.readGroupTable:
                    chemistries.add(rg.SequencingChemistry)
                    if rg.SequencingChemistry.startswith("S"):
                        is_sequel = True
                    else:
                        is_rsii = True
    if len(chemistries) == 0:
        return "NA"
    chemistry_str = "; ".join(sorted(list(chemistries)))
    if include_system_type:
        fmt = "{s} ({c})"
        if is_sequel and is_rsii:
            return fmt.format(s="Mixed", c=chemistry_str)
        elif is_sequel:
            return fmt.format(s="Sequel", c=chemistry_str)
        elif is_rsii:
            return fmt.format(s="RSII", c=chemistry_str)
        else:
            raise ValueError("Can't determine system type for {c}".format(
                c=chemistry_str))
    return chemistry_str
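# Hedged example for get_sequencing_chemistry(); the (eid, path) tuples
# mirror the entry-point structure the function expects, and both paths are
# placeholders. Non-subread entry points are simply ignored.
def example_get_sequencing_chemistry():
    entry_points = [("eid_subread", "movie.subreadset.xml"),
                    ("eid_ref_dataset", "genome.referenceset.xml")]
    # returns e.g. 'Sequel (S/P2-C2/5.0)', or 'NA' if no subread entry
    # point resolves to a file on disk
    print(get_sequencing_chemistry(entry_points))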
def test_multi_movie_split_zmws(self):
    N_RECORDS = 1745161
    test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                   "2372215/0007/Analysis_Results/m150404_101626_42"
                   "267_c100807920800000001823174110291514_s1_p0.al"
                   "l.subreadset.xml")
    test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                   "2590980/0008/Analysis_Results/m141115_075238_et"
                   "han_c100699872550000001823139203261572_s1_p0.al"
                   "l.subreadset.xml")
    ds1 = SubreadSet(test_file_1, test_file_2)
    # used to get total:
    # self.assertEqual(sum(1 for _ in ds1), N_RECORDS)
    self.assertEqual(len(ds1), N_RECORDS)
    dss = ds1.split(chunks=1, zmws=True)
    self.assertEqual(len(dss), 1)
    self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
    dss = ds1.split(chunks=12, zmws=True)
    self.assertEqual(len(dss), 12)
    self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
    self.assertEqual(
        dss[0].zmwRanges,
        [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
          7, 22099)])
    self.assertEqual(
        dss[-1].zmwRanges,
        [('m141115_075238_ethan_c100699872550000001823139203261572_s1_p0',
          127819, 163468)])
def test_movie_split(self):
    N_RECORDS = 1745161
    N_RECORDS_1 = 959539
    N_RECORDS_2 = 785622
    test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                   "2372215/0007/Analysis_Results/m150404_101626_42"
                   "267_c100807920800000001823174110291514_s1_p0.al"
                   "l.subreadset.xml")
    test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                   "2590980/0008/Analysis_Results/m141115_075238_et"
                   "han_c100699872550000001823139203261572_s1_p0.al"
                   "l.subreadset.xml")
    ds1 = SubreadSet(test_file_1, test_file_2)
    # used to get total:
    # self.assertEqual(sum(1 for _ in ds1), N_RECORDS)
    self.assertEqual(len(ds1), N_RECORDS)
    dss = ds1.split_movies(1)
    self.assertEqual(len(dss), 1)
    self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
    self.assertEqual(len(ds1), N_RECORDS)
    self.assertFalse(ds1.filters)
    dss = ds1.split_movies(12)
    self.assertEqual(len(dss), 2)
    self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
    self.assertEqual(len(set(dss[0].index.qId)), 1)
    self.assertEqual(len(set(dss[-1].index.qId)), 1)
    self.assertEqual(
        dss[0].qid2mov[list(set(dss[0].index.qId))[0]],
        'm150404_101626_42267_c100807920800000001823174110291514_s1_p0')
    self.assertEqual(len(dss[0]), N_RECORDS_1)
    self.assertEqual(
        dss[-1].qid2mov[list(set(dss[-1].index.qId))[0]],
        'm141115_075238_ethan_c100699872550000001823139203261572_s1_p0')
    self.assertEqual(len(dss[-1]), N_RECORDS_2)
def test_loadMetadata(self):
    aln = AlignmentSet(data.getXml(no=8))
    self.assertFalse(aln.metadata.collections)
    aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                     'SA3-Sequel/lambda/roche_SAT/'
                     'm54013_151205_032353.run.metadata.xml')
    self.assertTrue(aln.metadata.collections)
    sset_fn = ('/pbi/dept/secondary/siv/testdata/'
               'SA3-Sequel/lambda/roche_SAT/'
               'm54013_151205_032353.subreadset.xml')
    sset = SubreadSet(sset_fn)
    orig_metadata = copy.deepcopy(sset.metadata)
    sset.metadata.collections = None
    self.assertFalse(sset.metadata.collections)
    sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                      'SA3-Sequel/lambda/roche_SAT/'
                      'm54013_151205_032353.run.metadata.xml')
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    sset.write(fn)
    validateFile(fn)
    validateFile(sset_fn)
    self.assertEqual(sset.metadata, orig_metadata)
    # load the wrong thing...
    sset_fn = ('/pbi/dept/secondary/siv/testdata/'
               'SA3-Sequel/lambda/roche_SAT/'
               'm54013_151205_032353.subreadset.xml')
    sset = SubreadSet(sset_fn)
    orig_metadata = copy.deepcopy(sset.metadata)
    sset.metadata.collections = None
    self.assertFalse(sset.metadata.collections)
    with self.assertRaises(InvalidDataSetIOError):
        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                          'SA3-Sequel/lambda/roche_SAT/'
                          'm54013_151205_032353.sts.xml')
def test_multi_movie_split_zmws_with_existing_movie_filter(self):
    # TODO: test with three movies and two chunks
    N_RECORDS = 959539
    test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                   "2372215/0007/Analysis_Results/m150404_101626_42"
                   "267_c100807920800000001823174110291514_s1_p0.al"
                   "l.subreadset.xml")
    test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                   "2590980/0008/Analysis_Results/m141115_075238_et"
                   "han_c100699872550000001823139203261572_s1_p0.al"
                   "l.subreadset.xml")
    ds1 = SubreadSet(test_file_1, test_file_2)
    dss = ds1.split_movies(2)
    self.assertEqual(len(dss), 2)
    ds1 = dss[0]
    # used to get total:
    # self.assertEqual(sum(1 for _ in ds1), N_RECORDS)
    self.assertEqual(len(ds1), N_RECORDS)
    dss = ds1.split(chunks=1, zmws=True)
    self.assertEqual(len(dss), 1)
    self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
    dss = ds1.split(chunks=12, zmws=True)
    self.assertEqual(len(dss), 12)
    self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
    for ds in dss:
        self.assertEqual(
            ds.zmwRanges[0][0],
            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0')
def test_isBarcoded(self):
    empty = upstreamdata.getEmptyBam()
    nonempty = ('/pbi/dept/secondary/siv/testdata/'
                'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
                '.tiny.subreadset.xml')
    # One empty, one non-empty
    sset = SubreadSet(nonempty, empty, skipMissing=True)
    self.assertTrue(sset.isBarcoded)
    # Just non-empty
    sset = SubreadSet(nonempty, skipMissing=True)
    self.assertEqual(len(sset), 15133)
    self.assertTrue(sset.isBarcoded)
    # Just empty
    # This is crazy, the pbi must be out of date:
    sset = SubreadSet(empty)
    self.assertEqual(len(sset), 0)
    self.assertTrue(sset.isBarcoded)
    # To confirm current behavior, regenerate the pbi with a current
    # pbindex:
    efn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    log.info("Copying to {}".format(efn))
    sset.copyTo(efn)
    sset.induceIndices(force=True)
    self.assertFalse(sset.isBarcoded)
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    if score_mode not in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    new_prefix = re.sub(r"\.subreadset\.xml$", "", output_file_name)
    args = [
        "bam2bam",
        "-j", str(nproc),
        "-b", str(nproc),
        "-o", new_prefix,
        "--barcodes", barcode_set_file,
        "--scoreMode", score_mode,
        subread_set_file
    ]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    assert op.isfile(output_file_name)
    tmp_out = op.join(op.dirname(output_file_name),
                      "tmp_" + op.basename(output_file_name))
    shutil.move(output_file_name, tmp_out)
    with SubreadSet(tmp_out, strict=True) as ds:
        with SubreadSet(subread_set_file) as ds_in:
            ds.metadata = ds_in.metadata
            ds.name = ds_in.name + " (barcoded)"
        ds.updateCounts()
        ds.newUuid()
        ds.write(output_file_name)
    return 0
def run_bax_to_bam(input_file_name, output_file_name):
    with HdfSubreadSet(input_file_name) as ds_in:
        movies = set()
        for rr in ds_in.resourceReaders():
            movies.add(rr.movieName)
        if len(movies) > 1:
            out_dir = os.path.dirname(output_file_name)
            ds_out_files = []
            for bax_file in ds_in.toExternalFiles():
                output_file_name_tmp = os.path.join(out_dir, ".".join(
                    os.path.basename(bax_file).split(".")[:-2]) +
                    ".hdfsubreadset.xml")
                rc = _run_bax_to_bam(bax_file, output_file_name_tmp)
                if rc != 0:
                    log.error("bax2bam failed")
                    return rc
                ds_out_files.append(output_file_name_tmp)
            ds = SubreadSet(*ds_out_files)
            ds.name = ds_in.name
            if 'Description' in ds_in.objMetadata:
                ds.objMetadata['Description'] = ds_in.objMetadata['Description']
                ds.metadata.merge(ds_in.metadata)
            ds.write(output_file_name)
        else:
            return _run_bax_to_bam(input_file_name, output_file_name)
    return 0
def setUpClass(cls):
    super(TestToolContract, cls).setUpClass()
    ds = SubreadSet(BAM_FILE, strict=True)
    ds.write(cls.INPUT_FILES[0])
    with FastaWriter(cls.INPUT_FILES[1]) as fa_out:
        for i in range(1010):
            fa_out.writeRecord("%04d_Forward" % i, "A" * 16)
def test_len(self):
    # AlignmentSet
    aln = AlignmentSet(data.getXml(8), strict=True)
    self.assertEqual(len(aln), 92)
    self.assertEqual(aln._length, (92, 123588))
    self.assertEqual(aln.totalLength, 123588)
    self.assertEqual(aln.numRecords, 92)
    aln.totalLength = -1
    aln.numRecords = -1
    self.assertEqual(aln.totalLength, -1)
    self.assertEqual(aln.numRecords, -1)
    aln.updateCounts()
    self.assertEqual(aln.totalLength, 123588)
    self.assertEqual(aln.numRecords, 92)
    self.assertEqual(sum(1 for _ in aln), 92)
    self.assertEqual(sum(len(rec) for rec in aln), 123588)

    # AlignmentSet with filters
    aln = AlignmentSet(data.getXml(15), strict=True)
    self.assertEqual(len(aln), 40)
    self.assertEqual(aln._length, (40, 52023))
    self.assertEqual(aln.totalLength, 52023)
    self.assertEqual(aln.numRecords, 40)
    aln.totalLength = -1
    aln.numRecords = -1
    self.assertEqual(aln.totalLength, -1)
    self.assertEqual(aln.numRecords, -1)
    aln.updateCounts()
    self.assertEqual(aln.totalLength, 52023)
    self.assertEqual(aln.numRecords, 40)

    # SubreadSet
    sset = SubreadSet(data.getXml(10), strict=True)
    self.assertEqual(len(sset), 92)
    self.assertEqual(sset._length, (92, 124093))
    self.assertEqual(sset.totalLength, 124093)
    self.assertEqual(sset.numRecords, 92)
    sset.totalLength = -1
    sset.numRecords = -1
    self.assertEqual(sset.totalLength, -1)
    self.assertEqual(sset.numRecords, -1)
    sset.updateCounts()
    self.assertEqual(sset.totalLength, 124093)
    self.assertEqual(sset.numRecords, 92)
    self.assertEqual(sum(1 for _ in sset), 92)
    self.assertEqual(sum(len(rec) for rec in sset), 124093)

    # ReferenceSet
    sset = ReferenceSet(data.getXml(9), strict=True)
    self.assertEqual(len(sset), 59)
    self.assertEqual(sset.totalLength, 85774)
    self.assertEqual(sset.numRecords, 59)
    sset.totalLength = -1
    sset.numRecords = -1
    self.assertEqual(sset.totalLength, -1)
    self.assertEqual(sset.numRecords, -1)
    sset.updateCounts()
    self.assertEqual(sset.totalLength, 85774)
    self.assertEqual(sset.numRecords, 59)
def test_output_subreadset_name(self):
    """
    Verify that the output SubreadSet name is identical to the input name
    plus ' (barcoded)'.
    """
    with SubreadSet(self.entrypoints.data['eid_subread']) as ds_in:
        with SubreadSet(self._get_subreadset_out()) as ds_out:
            self.assertEqual(ds_out.name, ds_in.name + " (barcoded)")
def test_bam2fastx_filtered(self):
    input_file = pbtestdata.get_file("subreads-xml")
    ds = SubreadSet(input_file, strict=True)
    ds.filters.addRequirement(length=[('>=', 1000)])
    input_tmp = get_temp_file(suffix=".subreadset.xml")
    ds.write(input_tmp)
    nrecords_expected = 13
    self.run_and_check_fastx(input_tmp, nrecords_expected)
def _make_dataset(file_name=None, barcodes=None):
    if file_name is None:
        file_name = tempfile.NamedTemporaryFile(
            suffix=".subreadset.xml").name
    ds = SubreadSet(BAM_FILE, strict=True)
    if barcodes is not None:
        for er in ds.externalResources:
            er.barcodes = barcodes
    ds.write(file_name)
    return file_name
def to_report(stats_xml, output_dir, dpi=72):
    """Main point of entry

    :type stats_xml: str
    :type output_dir: str
    :type dpi: int
    :rtype: Report
    """
    log.info("Analyzing XML {f}".format(f=stats_xml))
    # stats_xml should be a dataset:
    dset = SubreadSet(stats_xml)
    dataset_uuids = [dset.uuid]
    # but if it isn't, no problem:
    if not dset.metadata.summaryStats:
        dset.loadStats(stats_xml)
        # an sts file was provided which will generate a new random uuid
        dataset_uuids = []
    if not dset.metadata.summaryStats.readLenDists:
        raise IOError("Pipeline Summary Stats (sts.xml) not found or missing "
                      "key distributions")
    # we want all of the length distributions in this report to look the
    # same, so we make the shaper here and pass it around:
    alldists = (dset.metadata.summaryStats.readLenDists[:] +
                dset.metadata.summaryStats.insertReadLenDists[:])
    len_dist_shaper = continuous_dist_shaper(alldists, trim_excess=True)
    attr = to_read_stats_attributes(
        readLenDists=dset.metadata.summaryStats.readLenDists,
        readQualDists=dset.metadata.summaryStats.readQualDists)
    attr.extend(to_insert_stats_attributes(
        readLenDists=dset.metadata.summaryStats.insertReadLenDists,
        readQualDists=dset.metadata.summaryStats.insertReadQualDists))
    plot_groups = to_read_stats_plots(
        readLenDists=dset.metadata.summaryStats.readLenDists,
        readQualDists=dset.metadata.summaryStats.readQualDists,
        output_dir=output_dir,
        lenDistShaper=len_dist_shaper)
    plot_groups.extend(to_insert_stats_plots(
        readLenDists=dset.metadata.summaryStats.insertReadLenDists,
        readQualDists=dset.metadata.summaryStats.insertReadQualDists,
        output_dir=output_dir,
        lenDistShaper=len_dist_shaper))
    # build the report:
    report = Report(meta_rpt.id,
                    title=meta_rpt.title,
                    attributes=attr,
                    plotgroups=plot_groups,
                    dataset_uuids=dataset_uuids)
    return meta_rpt.apply_view(report)
def test_provenance_record_ordering(self):
    import pbtestdata
    ds = SubreadSet(pbtestdata.get_file("subreads-sequel"), strict=True)
    ds.metadata.addParentDataSet(uuid.uuid4(), ds.datasetType,
                                 createdBy="AnalysisJob", timeStampedName="")
    tmp_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds.write(tmp_out)
    ds = SubreadSet(tmp_out, strict=True)
    tags = [r['tag'] for r in ds.metadata.record['children']]
    self.assertEqual(tags, ['TotalLength', 'NumRecords', 'Provenance',
                            'Collections', 'SummaryStats'])
def setUpClass(cls):
    tmp_bam = tempfile.NamedTemporaryFile(suffix=".subreads.bam").name
    shutil.copyfile(pbcore.data.getUnalignedBam(), tmp_bam)
    shutil.copyfile(pbcore.data.getUnalignedBam() + ".pbi", tmp_bam + ".pbi")
    ds = SubreadSet(tmp_bam, pbcore.data.getUnalignedBam(), strict=True)
    ds.write(cls.INPUT_FILES[0])
    _write_fasta_or_contigset(cls.INPUT_FILES[1], make_faidx=True,
                              ds_class=BarcodeSet)
    super(TestScatterSubreadBAMs, cls).setUpClass()
def test_subreadset_split_metadata_element_name(self):
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    log.debug(fn)
    sset = SubreadSet("/pbi/dept/secondary/siv/testdata/"
                      "SA3-Sequel/phi29/315/3150101/"
                      "r54008_20160219_002905/1_A01/"
                      "m54008_160219_003234.subreadset.xml")
    chunks = sset.split(chunks=5, zmws=False, ignoreSubDatasets=True)
    chunks[0].write(fn)
def _set_up_basic(self):
    input_file = get_temp_file(suffix=".subreadset.xml")
    ds = SubreadSet(data.getXml(9), strict=True)
    ds.metadata.addParentDataSet(uuid.uuid4(), ds.datasetType,
                                 createdBy="AnalysisJob", timeStampedName="")
    ds.write(input_file)
    return input_file, len(ds)
def test_file_arg(self):
    fn = tempfile.NamedTemporaryFile(suffix="filterVals.txt").name
    log.debug(fn)
    sset = SubreadSet(data.getXml(9))
    assert len(sset) == 92
    size = 10
    qn = [r.qName for r in sset[:size]]
    with open(fn, 'w') as ofh:
        for q in qn:
            ofh.write(q)
            ofh.write('\n')
    good_qn = [('=', fn)]
    sset.filters.addRequirement(qname=good_qn)
    assert size == sum(1 for _ in sset)
    assert size == len(sset)
    og = set(qn)
    for r in sset:
        og.discard(r.qName)
    assert len(og) == 0

    fn = tempfile.NamedTemporaryFile(suffix="filterVals.txt").name
    log.debug(fn)
    sset = SubreadSet(data.getXml(9))
    assert len(sset) == 92
    size = 10
    qn = [r.qName for r in sset[:size]]
    with open(fn, 'w') as ofh:
        for q in qn:
            ofh.write(q)
            ofh.write('\n')
    good_qn = [('=', fn)]
    sset.filters.addRequirement(qname_file=good_qn)
    assert size == sum(1 for _ in sset)
    assert size == len(sset)
    og = set(qn)
    for r in sset:
        og.discard(r.qName)
    assert len(og) == 0

    fn = tempfile.NamedTemporaryFile(suffix="filterVals.txt").name
    log.debug(fn)
    sset = SubreadSet(data.getXml(9))
    assert len(sset) == 92
    size = 4
    hn = [r for r in sorted(list(set(sset.index.holeNumber)))[:size]]
    with open(fn, 'w') as ofh:
        for h in hn:
            ofh.write(str(h))
            ofh.write('\n')
    good_hn = [('=', fn)]
    sset.filters.addRequirement(zm=good_hn)
    assert size == len(set(sset.index.holeNumber))
    og = set(hn)
    for r in sset:
        og.discard(r.holeNumber)
    assert len(og) == 0
def setUp(self):
    BAM_IN = pbcore.data.getUnalignedBam()
    ds = SubreadSet(BAM_IN, strict=True)
    chunks = ds.split(zmws=True, chunks=2, targetSize=2)
    assert len(chunks) == 2
    self.zmw_range = chunks[CHUNK_INDEX].zmwRanges[0][1:3]
    logging.info("zmwRanges[CHUNK_INDEX] = {r}".format(
        r=str(chunks[CHUNK_INDEX].zmwRanges)))
    logging.info("SubreadSet = {f}".format(f=self.INPUT_FILES[0]))
    chunks[CHUNK_INDEX].write(self.INPUT_FILES[0])
def test_get_dataset_uuid(self):
    ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
    ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds.write(ds_file)
    uuid = getDataSetUuid(ds_file)
    assert uuid == ds.uuid
    with open(ds_file, "w") as out:
        out.write("hello world!")
    uuid = getDataSetUuid(ds_file)
    assert uuid is None
def test_get_dataset_uuid(self):
    ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
    ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds.write(ds_file)
    uuid = getDataSetUuid(ds_file)
    self.assertEqual(uuid, ds.uuid)
    with open(ds_file, "w") as out:
        out.write("hello world!")
    uuid = getDataSetUuid(ds_file)
    self.assertIsNone(uuid)
def main(parser):
    args = parser.parse_args()
    filt = Filters()
    dset = SubreadSet(args.inXml)
    names = nameGen(args.inFile, fileType='list' if args.list else 'fasta')
    if args.subreads:
        if args.inverted:
            for name in names:
                filt.addRequirement(QNAME=[('!=', name)])
        else:
            filt.addRequirement(QNAME=[('=', name) for name in names])
    else:
        assert len(dset.movieIds) == 1, \
            ('This method only works for single-movie subreadsets. '
             'Use the --subreads option for multi-movie subreadsets.')
        uniqHn = set(map(getZmw, names))
        if args.inverted:
            for hn in uniqHn:
                filt.addRequirement(zm=[('!=', hn)])
        else:
            filt.addRequirement(zm=[('=', hn) for hn in uniqHn])
    dset.addFilters(filt)
    if args.newUuid:
        dset.newUuid()
    if args.name:
        dset.name = args.name
    dset.write(args.outXml)
def test_split_zmws_around_read_groups(self):
    ds1 = pbtestdata.get_file("subreads-xml")
    ds2 = pbtestdata.get_file("subreads-sequel")
    ds = SubreadSet(ds1, ds2)
    assert len(ds) == 137
    # this is still the default behavior
    chunks = list(ds.split(chunks=2, zmws=True, breakReadGroups=True))
    assert len(chunks[0]) == 72
    assert len(chunks[1]) == 65
    # don't break up movies
    chunks = list(ds.split(chunks=2, zmws=True, breakReadGroups=False))
    assert len(chunks[0]) == 20
    assert len(chunks[1]) == 117
    assert np.all(chunks[0].index.qId == -2081539485)
    assert np.all(chunks[1].index.qId == -1197849594)
    chunks = list(ds.split(chunks=4, targetSize=1, zmws=True,
                           breakReadGroups=False))
    assert [len(c) for c in chunks] == [8, 12, 54, 63]
    assert np.all(chunks[0].index.qId == -2081539485)
    assert np.all(chunks[1].index.qId == -2081539485)
    assert np.all(chunks[2].index.qId == -1197849594)
    assert np.all(chunks[3].index.qId == -1197849594)
    # control: single-movie dataset
    ds = SubreadSet(ds1)
    chunks1 = list(ds.split(chunks=4, zmws=True, breakReadGroups=False))
    chunks2 = list(ds.split(chunks=4, zmws=True, breakReadGroups=True))
    assert [len(x) for x in chunks1] == [len(y) for y in chunks2]
def test_de_novo(self):
    ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    log.info(ofn)
    ss = SubreadSet(data.getXml(10))
    col = CollectionMetadata()
    self.assertFalse(ss.metadata.collections)
    ss.metadata.collections.append(col)
    self.assertTrue(ss.metadata.collections)
    col.cellIndex = 1
    self.assertEqual(ss.metadata.collections[0].cellIndex, 1)
    col.instrumentName = "foo"
    self.assertEqual(ss.metadata.collections[0].instrumentName, "foo")
    col.context = 'bar'
    self.assertEqual(ss.metadata.collections[0].context, 'bar')
    ss.metadata.collections[0].runDetails.name = 'foo'
    self.assertEqual('foo', ss.metadata.collections[0].runDetails.name)
    ss.metadata.collections[0].wellSample.name = 'bar'
    self.assertEqual('bar', ss.metadata.collections[0].wellSample.name)
    ss.metadata.collections[0].wellSample.wellName = 'baz'
    self.assertEqual('baz', ss.metadata.collections[0].wellSample.wellName)
    ss.metadata.collections[0].wellSample.concentration = 'baz'
    self.assertEqual('baz',
                     ss.metadata.collections[0].wellSample.concentration)
    # There are no existing biosamples:
    self.assertFalse(
        'BioSamples' in ss.metadata.collections[0].wellSample.tags)
    # Therefore the metadata is falsy
    self.assertFalse(ss.metadata.collections[0].wellSample.bioSamples)
    ss.metadata.collections[0].wellSample.bioSamples.addSample('Clown')
    self.assertEqual(
        'Clown', ss.metadata.collections[0].wellSample.bioSamples[0].name)
    ss.metadata.collections[0].wellSample.bioSamples[
        0].DNABarcodes.addBarcode('Dentist')
    self.assertEqual(
        'Dentist', ss.metadata.collections[0].wellSample.bioSamples[0].
        DNABarcodes[0].name)
    # check that we are adding one additional BioSamples element:
    self.assertEqual(
        Counter(ss.metadata.collections[0].wellSample.tags)['BioSamples'],
        1)
    # Therefore the metadata is truthy
    self.assertTrue(ss.metadata.collections[0].wellSample.bioSamples)
    ss.write(ofn, validate=False)
def test_subreads_parent_dataset(self):
    ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
    assert (ds1.metadata.provenance.parentDataSet.uniqueId ==
            "f81cf391-b3da-41f8-84cb-a0de71f460f4")
    ds2 = SubreadSet(ds1.externalResources[0].bam, skipMissing=True)
    assert ds2.metadata.provenance.parentDataSet.uniqueId is None
    ds2.metadata.addParentDataSet("f81cf391-b3da-41f8-84cb-a0de71f460f4",
                                  "PacBio.DataSet.SubreadSet",
                                  "timestamped_name")
    assert (ds2.metadata.provenance.parentDataSet.uniqueId ==
            "f81cf391-b3da-41f8-84cb-a0de71f460f4")
    ds_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds2.write(ds_out, validate=False)
def get_data_stats(entry_points):
    """
    Get basic metrics for input dataset (assumed to be a SubreadSet).
    """
    for eid, path in entry_points:
        if eid == "eid_subread" and op.isfile(path):
            ds = SubreadSet(path)
            n_zmws = 0
            for bam in ds.resourceReaders():
                n_zmws += len(set(bam.pbi.holeNumber))
            return data_stats(n_zmws, ds.numRecords, ds.totalLength)
    return data_stats("NA", "NA", "NA")
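# 'data_stats' is not defined in this snippet; below is a namedtuple
# stand-in consistent with how get_data_stats() populates it. This is an
# assumption: the original record type and field names may differ.
from collections import namedtuple
data_stats = namedtuple("data_stats", ["n_zmws", "n_reads", "total_length"])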
def test_reports_with_fixed_bins(self):
    # TODO readQualDists are currently unpopulated, turn back on when
    # they're repopulated
    # for dist_name, nbins in zip(['medianInsertDists', 'readLenDists',
    #                              'readQualDists'], [200, 200, 50]):
    for dist_name, nbins in zip(["medianInsertDists", "readLenDists"],
                                [200, 200]):
        ss = SubreadSet()
        ss.loadStats(get_fixed_bin_sts())
        ss2 = SubreadSet()
        ss2.loadStats(get_fixed_bin_sts())
        # shift ss2
        mdist = getattr(ss2.metadata.summaryStats, dist_name)[0].bins
        mdist = [0, 0, 0] + mdist[:-3]
        getattr(ss2.metadata.summaryStats, dist_name)[0].bins = mdist
        ss3 = ss + ss2
        ss4 = SubreadSet()
        ss4.loadStats(get_fixed_bin_sts())
        # shift ss4
        mdist = getattr(ss4.metadata.summaryStats, dist_name)[0].bins
        mdist = [0 for _ in mdist]
        getattr(ss4.metadata.summaryStats, dist_name)[0].bins = mdist
        dists = getattr(ss4.metadata.summaryStats, dist_name)
        self.assertEqual(len(dists), 1)
        for n in [0, 1, 2, 10, 40, 41, 49, 50, 51, 200, 500]:
            ds = continuous_dist_shaper(dists, nbins=n)
            fixed_dists = [ds(dist) for dist in dists]
            self.assertEqual(len(dists[0].bins), nbins)
            self.assertEqual(len(fixed_dists[0].bins), nbins)
            self.assertEqual(sum(dists[0].bins), sum(fixed_dists[0].bins))
        sss = [ss, ss2, ss3]
        for sset in sss:
            dists = getattr(sset.metadata.summaryStats, dist_name)
            self.assertEqual(len(dists), 1)
            # 0, requested nbins > numBins fails back to no-op
            ops = [1, 2, 3, 4, 7, 10, 40, 41, 49, 50, 51, 200, 500]
            no_ops = [0]
            for n in no_ops:
                ds = continuous_dist_shaper(dists, nbins=n)
                fixed_dists = [ds(dist) for dist in dists]
                self.assertEqual(len(dists[0].bins), nbins)
                self.assertEqual(len(fixed_dists[0].bins), nbins)
                self.assertEqual(sum(dists[0].bins),
                                 sum(fixed_dists[0].bins))
            for n in ops:
                ds = continuous_dist_shaper(dists, nbins=n)
                fixed_dists = [ds(dist) for dist in dists]
                self.assertEqual(len(dists[0].bins), nbins)
                self.assertEqual(len(fixed_dists[0].bins), n)
                self.assertEqual(sum(dists[0].bins),
                                 sum(fixed_dists[0].bins))
def test_subreads_parent_dataset(self):
    ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
    self.assertEqual(ds1.metadata.provenance.parentDataSet.uniqueId,
                     "f81cf391-b3da-41f8-84cb-a0de71f460f4")
    ds2 = SubreadSet(ds1.externalResources[0].bam, skipMissing=True)
    self.assertEqual(ds2.metadata.provenance.parentDataSet.uniqueId, None)
    ds2.metadata.addParentDataSet("f81cf391-b3da-41f8-84cb-a0de71f460f4",
                                  "PacBio.DataSet.SubreadSet",
                                  "timestamped_name")
    self.assertEqual(ds2.metadata.provenance.parentDataSet.uniqueId,
                     "f81cf391-b3da-41f8-84cb-a0de71f460f4")
    ds_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds2.write(ds_out, validate=False)
def test_de_novo(self):
    ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    log.info(ofn)
    ss = SubreadSet(data.getXml(10))
    col = CollectionMetadata()
    self.assertFalse(ss.metadata.collections)
    ss.metadata.collections.append(col)
    self.assertTrue(ss.metadata.collections)
    col.cellIndex = 1
    self.assertEqual(ss.metadata.collections[0].cellIndex, 1)
    col.instrumentName = "foo"
    self.assertEqual(ss.metadata.collections[0].instrumentName, "foo")
    col.context = 'bar'
    self.assertEqual(ss.metadata.collections[0].context, 'bar')
    ss.metadata.collections[0].runDetails.name = 'foo'
    self.assertEqual('foo', ss.metadata.collections[0].runDetails.name)
    ss.metadata.collections[0].wellSample.name = 'bar'
    self.assertEqual('bar', ss.metadata.collections[0].wellSample.name)
    ss.metadata.collections[0].wellSample.wellName = 'baz'
    self.assertEqual('baz', ss.metadata.collections[0].wellSample.wellName)
    ss.metadata.collections[0].wellSample.concentration = 'baz'
    self.assertEqual('baz',
                     ss.metadata.collections[0].wellSample.concentration)
    # There are no existing biosamples:
    self.assertFalse('BioSamples' in ss.metadata.tags)
    # Therefore the metadata is falsy
    self.assertFalse(ss.metadata.bioSamples)
    ss.metadata.bioSamples.addSample('Clown')
    self.assertEqual('Clown', ss.metadata.bioSamples[0].name)
    ss.metadata.bioSamples[0].DNABarcodes.addBarcode('Dentist')
    self.assertEqual('Dentist',
                     ss.metadata.bioSamples[0].DNABarcodes[0].name)
    # check that we are adding one additional BioSamples element:
    self.assertEqual(Counter(ss.metadata.tags)['BioSamples'], 1)
    # Therefore the metadata is truthy
    self.assertTrue(ss.metadata.bioSamples)
    ss.write(ofn, validate=False)
def split_dataset(subreadset, out_prefix):
    """
    Takes an input dataset, and for each entry generates one separate
    dataset file, while maintaining all the filters.
    Returns a list of the generated dataset filenames (suitable for a FOFN).

    To create an example filtered dataset for testing:
    dataset create --type SubreadSet test.subreadset.xml subreads1.bam subreads2.bam
    dataset filter test.subreadset.xml test.filtered.subreadset.xml 'length>1000'
    """
    out_prefix_abs = os.path.abspath(out_prefix)
    dset = SubreadSet(subreadset, strict=True)
    fns = dset.toFofn()
    log.info('resources in {!r}:\n{}'.format(subreadset, '\n'.join(fns)))
    fofn = []
    for i, bam_fn in enumerate(fns):
        out_fn = '{}.{:05}.subreadset.xml'.format(out_prefix_abs, i)
        new_dataset = SubreadSet(bam_fn)
        new_dataset.newUuid()
        new_dataset._filters = copy.deepcopy(dset._filters)
        new_dataset.write(out_fn)
        fofn.append(out_fn)
    return fofn
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1):
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError(
            "Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        # TODO(nechols)(2016-03-15): replace with BarcodedSubreadSet
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(op.dirname(output_file_name),
                                 re.sub(".subreads.bam", "_barcoded",
                                        op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                                       subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file),
                                     scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                subreads_bam, scraps_bam
            ]
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(
                f=subreads_bam)
            # FIXME we need a more general method for this
            ext_res_new = ExternalResource()
            ext_res_new.resourceId = subreads_bam
            ext_res_new.metaType = 'PacBio.SubreadFile.SubreadBamFile'
            ext_res_new.addIndices([subreads_bam + ".pbi"])
            ext_res_inner = ExternalResources()
            ext_res_scraps = ExternalResource()
            ext_res_scraps.resourceId = scraps_bam
            ext_res_scraps.metaType = 'PacBio.SubreadFile.ScrapsBamFile'
            ext_res_scraps.addIndices([scraps_bam + ".pbi"])
            ext_res_inner.append(ext_res_scraps)
            ext_res_new.append(ext_res_inner)
            ds_new.externalResources.append(ext_res_new)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.updateCounts()
        ds_new.write(output_file_name)
    return 0
def to_zmw_chunked_subreadset_files(subreadset_path, max_total_nchunks,
                                    chunk_key, dir_name, base_name, ext):
    """Identical to to_chunked_subreadset_files, but chunks subreads by
    ZMW ranges for input to pbccs."""
    dset = SubreadSet(subreadset_path, strict=True)
    dset_chunks = dset.split(chunks=max_total_nchunks, zmws=True)
    d = {}
    for i, dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        dset.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        c = PipelineChunk(chunk_id, **d)
        yield c
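# Usage sketch for to_zmw_chunked_subreadset_files(); every argument value
# here is a placeholder, including the '$chunk.'-prefixed chunk key, which
# follows the convention used by pbcommand pipeline chunking.
def example_zmw_chunking():
    chunks = list(to_zmw_chunked_subreadset_files(
        "movie.subreadset.xml", max_total_nchunks=8,
        chunk_key="$chunk.subreadset_id", dir_name="chunks",
        base_name="chunk_subreadset", ext="subreadset.xml"))
    for c in chunks:
        log.info("created chunk %s", c.chunk_id)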
def to_report(stats_xml, output_dir, dpi=72):
    """Main point of entry

    :type stats_xml: str
    :type output_dir: str
    :type dpi: int
    :rtype: Report
    """
    log.info("Analyzing XML {f}".format(f=stats_xml))
    # stats_xml should be a dataset:
    dset = SubreadSet(stats_xml)
    dataset_uuids = [dset.uuid]
    # but if it isn't, no problem:
    if not dset.metadata.summaryStats:
        dset.loadStats(stats_xml)
        # an sts file was provided which will generate a new random uuid
        dataset_uuids = []
    if not dset.metadata.summaryStats.readLenDists:
        raise IOError("Pipeline Summary Stats (sts.xml) not found or missing "
                      "key distributions")
    attr = to_read_stats_attributes(
        readLenDists=dset.metadata.summaryStats.readLenDists,
        readQualDists=dset.metadata.summaryStats.readQualDists)
    attr.extend(to_insert_stats_attributes(
        readLenDists=dset.metadata.summaryStats.insertReadLenDists,
        readQualDists=dset.metadata.summaryStats.insertReadQualDists))
    plot_groups = to_read_stats_plots(
        readLenDists=dset.metadata.summaryStats.readLenDists,
        readQualDists=dset.metadata.summaryStats.readQualDists,
        output_dir=output_dir)
    plot_groups.extend(to_insert_stats_plots(
        readLenDists=dset.metadata.summaryStats.insertReadLenDists,
        readQualDists=dset.metadata.summaryStats.insertReadQualDists,
        output_dir=output_dir))
    # build the report:
    report = Report("raw_data_report",
                    title="Raw Data Report",
                    attributes=attr,
                    plotgroups=plot_groups,
                    dataset_uuids=dataset_uuids)
    return report
def run(subreadset, fofn):
    dir_name = os.getcwd()
    maxChunks = 0
    dset = SubreadSet(subreadset, strict=True)
    fns = dset.toFofn()
    import pprint
    log.info('resources in {!r}:\n{}'.format(subreadset,
                                             pprint.pformat(fns)))
    nrecs = len(dset)
    # HG with 70x coverage => 200G bases total
    ts = 50000   # @ 20k/read => 1G bases, ~300MB .gz => ~200 chunks for Human
    ts = 500000  # @ 20k/read => 10G bases, ~3GB .gz => ~20 chunks for Human
    # and we expect about 7-10min per chunk.
    chunks = nrecs // ts
    log.info('num_chunks={:g} ({:g} / {:g})'.format(chunks, nrecs, ts))
    log.info('Splitting with dset.split(zmws=False, chunks={}, '
             'ignoreSubDatasets=True, maxChunks={},)'.format(
                 chunks, maxChunks))
    dset_chunks = dset.split(zmws=False, chunks=chunks,
                             ignoreSubDatasets=True, maxChunks=maxChunks,
                             updateCounts=False,
                             # targetSize=1, breakContigs=True
                             )
    chunk_fns = []
    for i, dset in enumerate(dset_chunks):
        chunk_name = 'chunk_{:03d}.subreadset.xml'.format(i)  # TODO: 02
        chunk_fn = os.path.join(dir_name, chunk_name)
        dset.updateCounts()
        dset.write(chunk_fn, validate=False)  # relPaths=True
        chunk_fns.append(chunk_fn)
    with open(fofn, 'w') as ofs:
        for fn in chunk_fns:
            ofs.write('{}\n'.format(fn))
    log.info('Wrote {} chunks into "{}"'.format(len(dset_chunks), fofn))
def get_subread_ZMW_stats(subread_xml, report):
    """
    Fills a dict with:
    'numZMW' -- number of sequencing ZMWs
    'numSubread' -- number of subreads
    'avgZMWlen' -- approximated average ZMW length
    'avgSubreadlen' -- average subread length
    """
    subread_lens = []
    zmw_lens = defaultdict(int)
    ds = SubreadSet(subread_xml)
    for rr in ds.resourceReaders():
        for zmw, qStart, qEnd in zip(rr.holeNumber, rr.qStart, rr.qEnd):
            subread_lens.append(qEnd - qStart)
            zmw_lens[zmw] = max(zmw_lens[zmw], qEnd)
    report['numZMW'] = len(zmw_lens)
    report['numSubread'] = len(subread_lens)
    report['avgZMWlen'] = int(sum(zmw_lens.values()) * 1. / len(zmw_lens))
    report['avgSubreadlen'] = int(sum(subread_lens) * 1. /
                                  len(subread_lens))
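# Usage sketch for get_subread_ZMW_stats(); the function mutates the dict
# passed in. The input path is a placeholder.
def example_zmw_stats():
    report = {}
    get_subread_ZMW_stats("movie.subreadset.xml", report)
    print(report["numZMW"], report["numSubread"],
          report["avgZMWlen"], report["avgSubreadlen"])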
def test_subreadset_metadata_element_name(self):
    # without touching the element:
    sset = SubreadSet(data.getXml(10))
    log.debug(data.getXml(10))
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    log.debug(fn)
    sset.write(fn)
    f = ET.parse(fn)
    self.assertEqual(len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'SubreadSetMetadata')), 0)
    self.assertEqual(len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'DataSetMetadata')), 1)
    # with touching the element:
    sset = SubreadSet(data.getXml(10))
    sset.metadata.description = 'foo'
    fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    sset.write(fn, validate=False)
    f = ET.parse(fn)
    self.assertEqual(len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'SubreadSetMetadata')), 0)
    self.assertEqual(len(f.getroot().findall(
        '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
        'DataSetMetadata')), 1)
def test_merge_biosamples(self):
    import pbtestdata
    ds1 = pbtestdata.get_file("subreads-biosample-1")
    ds2 = pbtestdata.get_file("subreads-biosample-2")
    # Case 1: two biosamples
    ds = SubreadSet(ds1, ds2)
    samples = [bs.name for bs in ds.metadata.bioSamples]
    self.assertEqual(samples, ["Alice", "Bob"])
    # Case 2: same biosample in both files
    ds = SubreadSet(ds1, ds1)
    samples = [bs.name for bs in ds.metadata.bioSamples]
    self.assertEqual(samples, ["Alice"])
    self.assertEqual(len(ds.metadata.bioSamples[0].DNABarcodes), 1)
    # Case 3: same biosample, different barcodes
    dsTmp = SubreadSet(ds1)
    dsTmp.metadata.bioSamples[0].DNABarcodes[0].name = "F7--R7"
    tmpFile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    dsTmp.write(tmpFile)
    ds = SubreadSet(ds1, tmpFile)
    samples = [bs.name for bs in ds.metadata.bioSamples]
    self.assertEqual(samples, ["Alice"])
    bcs = [bc.name for bc in ds.metadata.bioSamples[0].DNABarcodes]
    self.assertEqual(bcs, ["F1--R1", "F7--R7"])