def ReadAdaptersFromScraps(bam):
    """Flag, per ZMW, whether any of its adapter records looks like a poly-A.

    ``bam`` is either a path ending in ``.scraps.bam`` (case-insensitive),
    which is opened directly, or anything ``openDataSet`` accepts, in which
    case the scraps file of every external resource is opened.  Resources
    whose scraps file cannot be opened are skipped.

    Only records whose ``scrapType`` is ``"A"`` are counted.  A record is
    counted as poly-A when the fraction of ``T`` bases in its sequence
    exceeds ``MIN_T``.

    Returns a dict mapping hole number to ``"T"`` (at least one poly-A
    record) or ``"F"``.  ZMWs with fewer than two counted records are
    omitted.
    """
    handles = []
    if bam.lower().endswith(".scraps.bam"):
        handles.append(IndexedBamReader(bam))
    else:
        # Iterate through each external resource, looking for scraps files
        ds = openDataSet(bam)
        for er in ds.externalResources:
            try:
                handle = IndexedBamReader(er.scraps)
            except Exception:
                # Best effort: a resource without a readable scraps file is
                # skipped.  Unlike a bare ``except:`` this no longer swallows
                # KeyboardInterrupt / SystemExit.
                continue
            handles.append(handle)

    adps = defaultdict(int)
    polyA = defaultdict(int)
    for handle in handles:
        # Release each reader as soon as it has been consumed
        with handle:
            for record in handle:
                if record.scrapType != "A":
                    continue
                hn = record.holeNumber
                seq = record.peer.seq
                adps[hn] += 1
                tFrac = sum(1 for b in seq if b == "T") / float(len(seq))
                if tFrac > MIN_T:
                    polyA[hn] += 1

    # Convert our counts into a T/F depending on whether there are polyAs.
    # ``items()`` works on both Python 2 and 3, unlike ``iteritems()``.
    res = {}
    for hn, v in adps.items():
        if v >= 2:
            res[hn] = "T" if polyA[hn] >= 1 else "F"
    return res
def __init__(self, subreadsFname, scrapsFname=None):
    """Open a subreads BAM together with its matching scraps BAM.

    ``subreadsFname`` must end in ``.subreads.bam``.  When ``scrapsFname``
    is not given it is derived by swapping the trailing ``subreads.bam``
    for ``scraps.bam``.

    Raises ``Exception`` if the subreads name has the wrong suffix, if the
    subreads file does not contain exactly one movie, or if the two files
    do not report the same movie names.
    """
    suffix = "subreads.bam"
    if not subreadsFname.endswith("." + suffix):
        # Call form of ``raise`` is valid on Python 2 and 3
        raise Exception("Expecting a subreads.bam")
    if scrapsFname is None:
        # Swap only the trailing suffix; ``str.replace`` would also rewrite
        # any directory component that happens to contain "subreads.bam".
        scrapsFname = subreadsFname[:-len(suffix)] + "scraps.bam"
    self.subreadsF = IndexedBamReader(subreadsFname)
    self.scrapsF = IndexedBamReader(scrapsFname)
    if (len(self.subreadsF.movieNames) != 1 or
            self.scrapsF.movieNames != self.subreadsF.movieNames):
        raise Exception("Requires single movie BAM file, and matching scraps")
def test_reheader_bam(self):
    """Reheader the test subreads BAM and validate the result against the input."""
    out_path = "subreads_out.bam"
    in_path = pbtestdata.get_file("subreads-bam")
    reheader_bam(in_path, out_path, self.BIOSAMPLE_NAME, self.LIBRARY_NAME)
    # Both the BAM and its .pbi index must exist afterwards
    assert op.isfile(out_path)
    assert op.isfile(out_path + ".pbi")
    with IndexedBamReader(out_path) as reheadered:
        self._validate_bam(reheadered)
        with IndexedBamReader(in_path) as source:
            self._validate_input_bam(source)
            self._validate_records(source, reheadered)
def ReadAdaptersFromScraps(bam, windows):
    """Classify the two adapters flanking each ZMW's selected alignment.

    ``bam`` is either a path ending in ``.scraps.bam`` (case-insensitive) or
    anything ``openDataSet`` accepts; in the latter case the scraps file of
    every external resource is read, skipping those that cannot be opened.

    ``windows`` maps hole number to a 7-item sequence.  The first two items
    are used here as the query start / end of the selected alignment and the
    sixth as its strand (0 sorts adapters ascending by position, anything
    else descending).

    Adapter records (``scrapType == "A"``) are kept when either of their
    ends touches the selected window.  Each is labelled ``"TC6"`` when the
    fraction of ``T`` bases in its sequence is below ``MIN_T`` and
    ``"POLYA"`` otherwise.

    Returns a dict mapping hole number to
    ``(leftTc6, rightTc6, leftPolyA, rightPolyA)``, each ``"T"`` or ``"F"``.
    """
    handles = []
    if bam.lower().endswith(".scraps.bam"):
        handles.append(IndexedBamReader(bam))
    else:
        # Iterate through each external resource, looking for scraps files
        ds = openDataSet(bam)
        for er in ds.externalResources:
            try:
                handle = IndexedBamReader(er.scraps)
            except Exception:
                # Best effort: skip resources without a readable scraps file
                continue
            handles.append(handle)

    adps = defaultdict(list)
    for handle in handles:
        with handle:
            for record in handle:
                if record.scrapType != "A":
                    continue
                hn = record.holeNumber
                # Skip records without alignments that passed QC
                try:
                    qS, qE, _, _, _, _, _ = windows[hn]
                except (KeyError, IndexError, TypeError, ValueError):
                    continue
                # Skip adapters that do not border the selected alignment
                if record.qStart not in [qS, qE] and record.qEnd not in [qS, qE]:
                    continue
                # If we made it this far, record the position and type
                seq = record.peer.seq
                tFrac = sum(1 for b in seq if b == "T") / float(len(seq))
                if tFrac < MIN_T:
                    adps[hn].append((record.qStart, "TC6"))
                else:
                    adps[hn].append((record.qStart, "POLYA"))

    results = {}
    for hn, adpData in adps.items():
        if len(adpData) != 2:
            # The original message referenced an undefined name (adpTypes)
            # and raised NameError instead of reporting the problem.
            print("ERROR! ERROR! {0} adps for hn #{1}".format(
                len(adpData), hn))
            if len(adpData) < 2:
                # No left/right pair to report; indexing below would fail
                continue
        # Using the strand, sort the adps left-to-right (by alignment)
        _, _, _, _, _, strand, _ = windows[hn]
        if strand == 0:
            adpData = sorted(adpData)
        else:
            adpData = sorted(adpData, reverse=True)
        # Now ordered we can record both ADP types and locations
        leftTc6 = "T" if adpData[0][1] == "TC6" else "F"
        rightTc6 = "T" if adpData[1][1] == "TC6" else "F"
        leftPolyA = "T" if adpData[0][1] == "POLYA" else "F"
        rightPolyA = "T" if adpData[1][1] == "POLYA" else "F"
        results[hn] = (leftTc6, rightTc6, leftPolyA, rightPolyA)
    return results
def test_empty_bam_reads_in_range(self):
    """A range query against the empty aligned BAM yields no reads."""
    empty_path = data.getEmptyAlignedBam()
    with IndexedBamReader(empty_path) as reader:
        hits = reader.readsInRange("lambda_NEB3011", 0, 50000,
                                   justIndices=True)
        assert len(hits) == 0
def __read_bam(fn):
    """Open ``fn`` with IndexedBamReader when ``fn + ".pbi"`` exists, else BamReader.

    NOTE(review): the reader is returned from inside its own ``with`` block,
    so the context manager has already exited by the time the caller gets
    it.  Confirm callers only need data that survives that exit.
    """
    reader_cls = IndexedBamReader if op.exists(fn + ".pbi") else BamReader
    with reader_cls(fn) as reader:
        return reader
def _verify_write_compare_ccs(testobj, inbamfns, zmws, outbamfn,
                              expected_movies, expected_len):
    """Round-trip selected CCS reads through ``outbamfn`` and compare them.

    Steps: check that every input BAM and its ``.pbi`` exist; open them as a
    ``BamCollection`` and check its movie names and length; write the
    ``ccsRead`` of each ZMW in ``zmws`` to ``outbamfn``; index the output;
    then check every output record equals the input record of the same
    ``readName`` and that the set of output ZMW names equals ``zmws``.

    All checks are reported through ``testobj.assertTrue``.
    """
    testobj.assertTrue(all(op.exists(fn) for fn in inbamfns))
    testobj.assertTrue(all(op.exists(fn + ".pbi") for fn in inbamfns))

    reader = BamCollection(*inbamfns)
    # verify movie names and length of reader.
    testobj.assertTrue(set(reader.movieNames) == set(expected_movies))
    testobj.assertTrue(
        len(reader) == expected_len,
        "%d != %d" % (len(reader), expected_len))

    # write ccs reads.
    with BamWriter(outbamfn, reader.header) as writer:
        for zmw in zmws:
            writer.write(reader[zmw].ccsRead)

    # make pbi and check
    make_pbi(outbamfn)
    testobj.assertTrue(op.exists(outbamfn + ".pbi"))

    # compare ccs reads in input and output; the context manager makes sure
    # the output reader is released even when an assertion fails.
    outzmws = []
    with IndexedBamReader(outbamfn) as reader2:
        for r in reader2:
            outzmws.append(r.zmwName)
            other = reader[r.readName]
            testobj.assertTrue(compareBamRecords(r, other))

    # compare ccs zmws in input and output
    testobj.assertTrue(set(zmws) == set(outzmws))
def readAlignments( alnFile, adps, minAlnLength=MIN_ALN_LENGTH, nReads=N_READS ):
    """Collect per-ZMW alignment data and the read sequence under each known adapter.

    Alignments whose reference span (``tEnd - tStart``) is shorter than
    ``minAlnLength`` are skipped.  When ``nReads`` is truthy, iteration stops
    once more than ``nReads`` alignments have been accepted.

    ``adps`` maps reference name to an iterable of
    ``(start, end, type)`` adapter entries.

    Returns ``(queryData, queryAdps)``: a dict of ``AlignmentData`` keyed by
    ``"movie/holeNumber"``, and a dict of ``AlignmentAdapter`` lists under
    the same keys.
    """
    perZmwAdapters = defaultdict(list)
    perZmwData = {}
    nAccepted = 0

    for aln in IndexedBamReader( alnFile ):
        if aln.tEnd - aln.tStart < minAlnLength:
            continue
        nAccepted += 1
        if nReads and nAccepted > nReads:
            break

        zmwName = "{0}/{1}".format(aln.movieName, aln.holeNumber)

        # Adapters on this reference that overlap the aligned window
        overlapping = []
        for adp in adps[aln.referenceName]:
            if adp[0] < aln.tEnd and adp[1] > aln.tStart:
                overlapping.append(adp)

        perZmwData[zmwName] = AlignmentData( aln )
        readSeq = aln.read(aligned=False, orientation="genomic")

        for adpStart, adpEnd, adpType in overlapping:
            clipped = aln.clippedTo(adpStart, adpEnd)
            # Skip adapters in SVs / large deletions, since we never had a chance
            if clipped.aStart == clipped.aEnd:
                continue
            # Offsets of the clipped span relative to the alignment start
            lo = clipped.aStart - aln.aStart
            hi = clipped.aEnd - aln.aStart
            fragment = readSeq[lo:hi]
            perZmwAdapters[zmwName].append(
                AlignmentAdapter(zmwName, clipped.aStart, clipped.aEnd,
                                 adpType, fragment) )

    return (perZmwData, perZmwAdapters)
def alignment_info_from_bam(bam_file_name):
    """
    Gather per-movie subread statistics from an indexed BAM file, reading
    the reader's array attributes rather than iterating BAM records.

    Returns a dict mapping movie name to a ``MovieAlignmentInfo`` whose
    ``datum``, ``max_subread`` and ``unrolled`` mappings have been filled in.
    """
    movies = {}
    prev_zmw = None
    with IndexedBamReader(bam_file_name) as bam:
        if len(bam) == 0:
            return movies
        identities = bam.identity
        lengths = bam.aEnd - bam.aStart
        for idx, rg_id in enumerate(bam.qId):
            movie = bam.readGroupInfo(rg_id).MovieName
            if movie not in movies:
                movies[movie] = MovieAlignmentInfo(bam_file_name, movie)
            info = movies[movie]

            hole = bam.holeNumber[idx]
            q_start, q_end = bam.qStart[idx], bam.qEnd[idx]
            a_start, a_end = bam.aStart[idx], bam.aEnd[idx]
            if (q_start, q_end) == (-1, -1):
                # XXX Only used to key subreads, so the exact value is not
                # important - still clumsy though
                q_start = 0
                q_end = a_end - a_start

            # Compound ids
            zmw_key = (movie, hole)
            subread_key = (movie, hole, q_start, q_end)

            row = (
                lengths[idx],
                identities[idx],
                bam.readQual[idx],
                1.0 if zmw_key != prev_zmw else 0.0,  # isFirst
                99999,  # modStart: no clear meaning, placeholder value
            )
            prev_zmw = zmw_key

            if subread_key in info.datum:
                warnings.warn("Duplicate subread %s" % str(subread_key))
            # No Z-score
            info.datum[subread_key] = row

            # Track the longest subread seen for this ZMW
            if (zmw_key not in info.max_subread or
                    lengths[idx] > info.max_subread[zmw_key][1]):
                info.max_subread[zmw_key] = (subread_key, lengths[idx])

            # Widen the [min start, max end] span recorded for this ZMW
            info.unrolled.setdefault(zmw_key, [99999, 0])
            info.unrolled[zmw_key][0] = min(info.unrolled[zmw_key][0], a_start)
            info.unrolled[zmw_key][1] = max(info.unrolled[zmw_key][1], a_end)
    return movies
def ReadAlignedBamFile(fns, tList):
    """Pick the best-covered alignment window per ZMW and summarise adapters.

    ``fns`` is an iterable of indexed BAM paths.  ``tList`` is an iterable of
    7-item target entries; the first item is used as the target name and the
    fifth / sixth as the target's start / end.

    Records with ``MapQV == 0`` and records for which ``ParseAdapterTypes``
    returns ``(None, None)`` are ignored.

    Returns ``(windowResults, adpResults)``:

    * ``windowResults`` - sorted list of ``(hn, tId, tStart, tEnd, target)``,
      one per ZMW, for the record with the largest ``tEnd - tStart``;
      ``target`` is ``"OFF"`` when no target is fully spanned.
    * ``adpResults`` - dict of hole number to ``"T"`` / ``"F"`` for ZMWs with
      at least two right-side adapters, ``"T"`` when any of them has type 1.
    """
    # Dictionaries for tracking adapter results
    adps = defaultdict(int)
    altAdps = defaultdict(int)

    # Dictionaries for tracking ZMW-level results
    cov = defaultdict(int)
    windows = {}

    for fn in fns:
        # Context manager releases each reader once it has been consumed
        with IndexedBamReader(fn) as reader:
            for record in reader:
                # Skip secondary alignments
                if record.MapQV == 0:
                    continue

                # We have nothing to learn from subreads with no adapters
                leftT, rightT = ParseAdapterTypes(record)
                if leftT is None and rightT is None:
                    continue

                hn = record.holeNumber

                # Adapter stats are duplicated between the subreads that
                # sandwich them, so we arbitrarily pick the right side here
                # to avoid double-counts
                if rightT is not None:
                    adps[hn] += 1
                    if rightT == 1:
                        altAdps[hn] += 1

                tId = record.tId
                tStart = record.tStart
                tEnd = record.tEnd
                tCov = tEnd - tStart

                # Search our target list for a target spanned by this subread.
                # NOTE(review): tTid is unpacked but never compared with tId,
                # so a target on another reference can match - confirm intent.
                target = "OFF"
                for tName, _, tTid, _, tRS, tRE, _ in tList:
                    if tStart < tRS and tEnd > tRE:
                        target = tName
                        break

                # Keep this subread if it covers more than anything seen so
                # far for this ZMW
                if tCov > cov[hn]:
                    cov[hn] = tCov
                    windows[hn] = (hn, tId, tStart, tEnd, target)

    # Convert our dictionary of windows to a flat, sorted list
    windowResults = sorted(windows.values())

    # Convert our adapter counts into a T/F.  ``items()`` works on both
    # Python 2 and 3, unlike ``iteritems()``.
    adpResults = {}
    for hn, v in adps.items():
        if v >= 2:
            adpResults[hn] = "T" if altAdps[hn] >= 1 else "F"

    # Return a tuple containing both our cleaned up Window and Adapter results
    return (windowResults, adpResults)
def test_alignment_identity(self):
    """
    The reader-level ``identity`` array must equal the ``identity`` values
    collected record by record from the same IndexedBamReader.
    """
    aligned_path = data.getAlignedBam()
    with IndexedBamReader(aligned_path) as reader:
        from_reader = reader.identity
        from_records = np.array([aln.identity for aln in reader])
        assert (from_records == from_reader).all()
def test_alignment_identity_unindexed(self):
    """
    Check that the value of the 'identity' property is the same whether
    or not the .pbi index was used to calculate it.
    """
    fn1 = data.getAlignedBam()
    # Keep the temporary file object alive for the whole test.  The original
    # took ``.name`` from an object that was discarded immediately, then
    # re-created the path with copyfile and never removed it.
    with tempfile.NamedTemporaryFile(suffix=".bam") as tmp_bam:
        fn2 = tmp_bam.name
        # Only the BAM is copied, so fn2 has no .pbi next to it
        shutil.copyfile(fn1, fn2)
        with IndexedBamReader(fn1) as bam_pbi:
            with BamReader(fn2) as bam_noindex:
                i1 = np.array([rec.identity for rec in bam_pbi])
                i2 = np.array([rec.identity for rec in bam_noindex])
                assert (i2 == i1).all()
def _iter_bam_files(input_file):
    """Yield BAM readers for ``input_file``.

    A path ending in ``.xml`` is opened with ``openDataFile`` and each of its
    resource readers is yielded (with a warning if it is not indexed).  Any
    other path yields a single reader: IndexedBamReader when
    ``input_file + ".pbi"`` exists, BamReader otherwise.
    """
    if not input_file.endswith(".xml"):
        if op.exists(input_file + ".pbi"):
            reader_cls = IndexedBamReader
        else:
            reader_cls = BamReader
        with reader_cls(input_file) as bam_in:
            yield bam_in
        return

    with openDataFile(input_file) as ds_in:
        if not ds_in.isIndexed:
            log.warning("Unindexed file(s), this may be very slow")
        for reader in ds_in.resourceReaders():
            yield reader
def test_split_bam(self):
    """split_bam must produce the expected chunk count and preserve record order."""
    bam_path = self._get_bam_path(self.DS1)
    requested = [1, 2, 3, 4]
    expected = [1, 2, 3, 3]
    for n_requested, n_expected in zip(requested, expected):
        n_made = split_bam(bam_path, n_requested)
        assert n_made == n_expected
        names_in = [rec.qName for rec in IndexedBamReader(bam_path)]
        # Concatenate the record names of every chunk, in chunk order
        names_out = []
        for chunk_idx in range(n_expected):
            chunk_reader = BamReader("reads.chunk%d.bam" % chunk_idx)
            for rec in chunk_reader:
                names_out.append(rec.qName)
        assert names_in == names_out
        self._remove_all()
def openIndexedAlignmentFile(fname, referenceFastaFname=None, sharedIndex=None):
    """
    Factory returning an IndexedBamReader for a BAM alignment file (the
    bam.pbi index is required).

    Names ending in ".h5" are handed to raise_no_h5(); any name that ends in
    neither ".h5" nor "bam" raises ValueError.

    The reference FASTA, if provided, must have a FASTA index (fasta.fai).
    """
    if fname.endswith(".h5"):
        raise_no_h5()
        return None
    if fname.endswith("bam"):
        reader_kwargs = {
            "referenceFastaFname": referenceFastaFname,
            "sharedIndex": sharedIndex,
        }
        return IndexedBamReader(fname, **reader_kwargs)
    raise ValueError("Invalid alignment file suffix")
def getMetrics(cls):
    """Populate ``cls.metric_dict`` with subread, ZMW and base counts.

    The first datastore file whose type id matches ``FileTypes.DS_SUBREADS``
    is opened as a SubreadSet.  Sets:

    * ``cls.subreads_file`` - path of that file (``None`` if none matched)
    * ``cls.zmws`` - set of hole numbers over all resource readers
    * ``metric_dict["n_subreads"]`` - ``len()`` of the dataset
    * ``metric_dict["n_reads"]`` - number of distinct hole numbers
    * ``metric_dict["n_bases"]`` - sum of ``qEnd - qStart`` over the BAM and
      scraps file of every external resource
    """
    cls.subreads_file = None
    # ``values()`` works on Python 2 and 3 (``iteritems()`` is Python 2
    # only) and the key was never used.
    for file_info in cls.datastore.get_file_dict().values():
        if file_info.file_type_id == FileTypes.DS_SUBREADS.file_type_id:
            cls.subreads_file = file_info.path
            break
    with SubreadSet(cls.subreads_file) as ds_in:
        cls.metric_dict["n_subreads"] = len(ds_in)
        cls.zmws = set()
        for bam in ds_in.resourceReaders():
            cls.zmws.update(bam.holeNumber)
        cls.metric_dict["n_reads"] = len(cls.zmws)
        n_bases = 0
        for er in ds_in.externalResources:
            for bam_file in [er.bam, er.scraps]:
                # Close each reader as soon as its lengths have been summed
                with IndexedBamReader(bam_file) as bam:
                    n_bases += int((bam.qEnd - bam.qStart).sum())
        cls.metric_dict["n_bases"] = n_bases
def __init__(self, subread_set_path, zmws=None, subsampleto=None):
    """Load burst and read tables from a SubreadSet and, if present, its scraps.

    ``subread_set_path`` is opened as a SubreadSet.  If its first external
    resource names a scraps file, that file is opened as ``self.scraps`` and
    processed as well.

    ``zmws`` is the ZMW selection to use; when ``None`` the selection comes
    from ``self._subsample_zmws()``.  ``subsampleto`` is stored as
    ``self.subsampleto``.

    ``self.zmws``, ``self.ppa_bursts`` and ``self.reads`` are only set when
    ``self._hasPpaBurstInfo(self.subread_set)`` is true.
    """
    self.subread_set_path = subread_set_path
    self.subread_set = SubreadSet(subread_set_path)
    first_reader = self.subread_set.resourceReaders()[0]
    self.framerate = first_reader.readGroupTable.FrameRate[0]
    self.subsampleto = subsampleto

    dsets = [(self.subread_set, 'subreads')]
    # grab path to scraps if available
    scraps_path = self.subread_set.externalResources[0].scraps
    if scraps_path:
        self.scraps = IndexedBamReader(scraps_path)
        dsets.append((self.scraps, 'scraps'))

    self.ppa_burst_dtypes = self._set_ppa_burst_dtypes()  # column info of burst table
    self.reads_dtypes = self._set_reads_dtypes()  # column info of reads table

    if self._hasPpaBurstInfo(self.subread_set):
        if zmws is None:
            self.zmws = self._subsample_zmws()
        else:
            self.zmws = zmws
        # Report the selection actually in use; ``len(zmws)`` raised
        # TypeError whenever the caller left ``zmws`` as None.
        log.info('Number of ZMWs ' + str(len(self.zmws)))

        results = []
        # if scraps info was present, scrape that for burst info, too.
        # reversed(): scraps (when present) are processed first.
        for dset in reversed(dsets):
            ppa_bursts, reads = self.retrieve_classifier_bursts(
                dset[0], dset[1])
            results.append((ppa_bursts, reads))

        if len(results) == 1:
            self.ppa_bursts = results[0][0]
            self.reads = results[0][1]
        elif len(results) == 2:
            # Stacking order is unchanged; the names now match the
            # processing order (scraps first, then subreads).
            scraps_ppa_bursts, scraps_reads = results[0]
            subread_ppa_bursts, subread_reads = results[1]
            self.ppa_bursts = np.hstack(
                (scraps_ppa_bursts, subread_ppa_bursts))
            self.reads = np.hstack((scraps_reads, subread_reads))
def openIndexedAlignmentFile(fname, referenceFastaFname=None, sharedIndex=None):
    """
    Factory function to get a handle to a reader for an alignment file
    (cmp.h5 or BAM), requiring index capability (built-in for cmp.h5;
    requires bam.pbi index for BAM).

    Names ending in "cmp.h5" return a CmpH5Reader, names ending in "bam" an
    IndexedBamReader; anything else raises ValueError.

    The reference FASTA, if provided, must have a FASTA index (fasta.fai).
    """
    if fname.endswith("cmp.h5"):
        return CmpH5Reader(fname, sharedIndex=sharedIndex)
    elif fname.endswith("bam"):
        return IndexedBamReader(fname,
                                referenceFastaFname=referenceFastaFname,
                                sharedIndex=sharedIndex)
    else:
        # Call form of ``raise`` is valid on Python 2 and 3; the
        # ``raise X, "msg"`` form is a syntax error on Python 3.
        raise ValueError("Invalid alignment file suffix")
def ReadAdaptersFromScraps(bam):
    """Flag, per ZMW, whether any of its adapter records looks like a poly-A.

    ``bam`` is opened with IndexedBamReader.  Only records whose
    ``scrapType`` is ``"A"`` are counted; a record is counted as poly-A when
    the fraction of ``T`` bases in its sequence exceeds ``MIN_T``.

    Returns a dict mapping hole number to ``"T"`` (at least one poly-A
    record) or ``"F"``.  ZMWs with fewer than two counted records are
    omitted.
    """
    adps = defaultdict(int)
    polyA = defaultdict(int)
    with IndexedBamReader(bam) as handle:
        for record in handle:
            if record.scrapType != "A":
                continue
            hn = record.holeNumber
            seq = record.peer.seq
            adps[hn] += 1
            tFrac = sum(1 for b in seq if b == "T") / float(len(seq))
            if tFrac > MIN_T:
                polyA[hn] += 1

    # Convert our counts into a T/F depending on whether there are polyAs.
    # ``items()`` works on both Python 2 and 3, unlike ``iteritems()``.
    res = {}
    for hn, v in adps.items():
        if v >= 2:
            res[hn] = "T" if polyA[hn] >= 1 else "F"
    return res
def test_combine_with_header(self):
    """Re-attaching the header to raw byte chunks must reproduce the input records."""
    bam_path = self._get_bam_path(self.DS1)
    total_size = op.getsize(bam_path)
    # Known byte boundaries for this particular input
    boundaries = [(396, 26575), (26575, 77209), (77209, total_size)]
    with open(bam_path, "rb") as raw_bam:
        with open("header.bam", "wb") as header_out:
            header_out.write(raw_bam.read(396))
        for idx, (lo, hi) in enumerate(boundaries):
            with open("tmp.chunk%d.bam" % idx, "wb") as chunk_out:
                raw_bam.seek(lo)
                chunk_out.write(raw_bam.read(hi - lo))
    for idx in range(3):
        combine_with_header("header.bam",
                            "tmp.chunk%d.bam" % idx,
                            "combined.chunk%d.bam" % idx)
    names_in = [rec.qName for rec in IndexedBamReader(bam_path)]
    names_out = []
    for idx in range(3):
        combined = BamReader("combined.chunk%d.bam" % idx)
        for rec in combined:
            names_out.append(rec.qName)
    assert names_in == names_out
def ReadAlignedBamFile(genome, fns):
    """Pick, per ZMW, the alignment with the largest reference span.

    ``genome.targetDictionary()`` is indexed by ``tId`` and must yield
    7-item target entries (name first, start / end fifth and sixth).
    Records with ``MapQV == 0`` are skipped.

    Returns ``(windows, adps)``, both keyed by hole number: ``windows`` holds
    ``(hn, tId, tStart, tEnd, target)`` with ``target`` set to ``"OFF"`` when
    no target is fully spanned, and ``adps`` holds the
    ``ParseAdapterTypes`` result for the same record.
    """
    targetsByRef = genome.targetDictionary()
    bestCov = defaultdict(int)
    adps = {}
    windows = {}

    for fn in fns:
        for rec in IndexedBamReader(fn):
            # Skip secondary alignments
            if rec.MapQV == 0:
                continue

            hn = rec.holeNumber
            tId = rec.tId
            tStart = rec.tStart
            tEnd = rec.tEnd
            span = tEnd - tStart
            adpTypes = ParseAdapterTypes(rec)

            # First target on this reference that the subread fully spans
            target = "OFF"
            for entry in targetsByRef[tId]:
                tName, _, tTid, _, tRS, tRE, _ = entry
                if tStart < tRS and tEnd > tRE:
                    target = tName
                    break

            # Nothing to update unless this beats the best span so far
            if not span > bestCov[hn]:
                continue
            bestCov[hn] = span
            windows[hn] = (hn, tId, tStart, tEnd, target)
            adps[hn] = adpTypes

    # Return a tuple containing both our Window and Adapter results
    return (windows, adps)
def main(parser):
    """Split a CCS BAM into one FASTQ file per (forward, reverse) barcode pair.

    Reads ``barcodeFasta``, ``ccsBAM``, ``outDir`` and ``noBcQual`` from the
    parsed arguments.  Barcode index -1 is named ``NoBC``.  Unless
    ``noBcQual`` is set, each record header gets a `` bq=<bcQual>`` suffix.
    """
    args = parser.parse_args()

    # Barcode index -> barcode name, in FASTA order
    bcNames = {}
    for idx, fastaRec in enumerate(FastaReader(args.barcodeFasta)):
        bcNames[idx] = fastaRec.name
    bcNames[-1] = 'NoBC'

    def makeFqName(bcPair):
        pairNames = [bcNames[i] for i in bcPair]
        return '{}/{}--{}.fastq'.format(args.outDir, *pairNames)

    bam = IndexedBamReader(args.ccsBAM)
    observedPairs = set(zip(bam.bcForward, bam.bcReverse))
    for bcPair in observedPairs:
        with FastqWriter(makeFqName(bcPair)) as writer:
            fwd, rev = bcPair[0], bcPair[1]
            selected = bam[(bam.bcForward == fwd) & (bam.bcReverse == rev)]
            for rec in selected:
                header = rec.readName
                if not args.noBcQual:
                    header = header + ' bq=%i' % rec.bcQual
                writer.writeRecord(header,
                                   rec.read(aligned=False),
                                   rec.peer.query_qualities)
def _verify_write_compare_subreads(testobj, inbamfns, zmws, outbamfn):
    """Round-trip the subreads of selected ZMWs through ``outbamfn`` and compare.

    Steps: check that every input BAM and its ``.pbi`` exist; write every
    subread of each ZMW in ``zmws`` to ``outbamfn``; index the output; then
    check each output record equals the input record of the same
    ``readName``.

    All checks are reported through ``testobj.assertTrue``.
    """
    # Verify that input.bam and input.bam.pbi exist
    testobj.assertTrue(all(op.exists(fn) for fn in inbamfns))
    testobj.assertTrue(all(op.exists(fn + ".pbi") for fn in inbamfns))

    reader = BamCollection(*inbamfns)
    # Context manager closes the writer even if a write fails part-way
    with BamWriter(outbamfn, reader.header) as writer:
        for zmw in zmws:
            for sr in reader[zmw].subreads:
                writer.write(sr)

    # make pbi for outbamfn
    make_pbi(outbamfn)
    testobj.assertTrue(op.exists(outbamfn + ".pbi"))

    # Read subreads from outbamfn and compare; the reader is released even
    # when an assertion fails.
    with IndexedBamReader(outbamfn) as reader2:
        for r in reader2:
            other = reader[r.readName]
            testobj.assertTrue(compareBamRecords(r, other))
class ZmwReadStitcher(object):
    """
    A reader class that enables viewing the read records corresponding
    to a given ZMW, as present in a paired subreads.bam and scraps.bam,
    as if they were a contiguous ZMW read record.
    """

    def __init__(self, subreadsFname, scrapsFname=None):
        """Open ``subreadsFname`` and its scraps file.

        ``subreadsFname`` must end in ``.subreads.bam``.  When
        ``scrapsFname`` is omitted it is derived by swapping the trailing
        ``subreads.bam`` for ``scraps.bam``.

        Raises ``Exception`` on a wrong suffix, when the subreads file does
        not contain exactly one movie, or when the two files report
        different movie names.
        """
        suffix = "subreads.bam"
        if not subreadsFname.endswith("." + suffix):
            # Call form of ``raise`` is valid on Python 2 and 3
            raise Exception("Expecting a subreads.bam")
        if scrapsFname is None:
            # Swap only the trailing suffix; ``str.replace`` would also
            # rewrite a directory component containing "subreads.bam".
            scrapsFname = subreadsFname[:-len(suffix)] + "scraps.bam"
        self.subreadsF = IndexedBamReader(subreadsFname)
        self.scrapsF = IndexedBamReader(scrapsFname)
        if (len(self.subreadsF.movieNames) != 1 or
                self.scrapsF.movieNames != self.subreadsF.movieNames):
            raise Exception(
                "Requires single movie BAM file, and matching scraps")

    @property
    def filename(self):
        """Filename reported by the subreads reader."""
        return self.subreadsF.filename

    @property
    def hasPulseFeatures(self):
        """True only when both the subreads and scraps readers report pulse features."""
        return (self.subreadsF.hasPulseFeatures() and
                self.scrapsF.hasPulseFeatures())

    @property
    @cached
    def sequencingZmws(self):
        """
        Hole numbers for which we have basecalls and an HQ region
        """
        return sorted(set(self.subreadsF.holeNumber))

    @property
    @cached
    def allSequencingZmws(self):
        """
        Hole numbers for which we have basecalls
        """
        return sorted(set.union(set(self.subreadsF.holeNumber),
                                set(self.scrapsF.holeNumber)))

    def __getitem__(self, holeNumber):
        """Return a StitchedZmw built from all subread and scrap records of
        ``holeNumber``, ordered by ``qStart``.

        Raises ``IndexError`` when the hole number appears in neither file.
        """
        if holeNumber not in self.allSequencingZmws:
            raise IndexError(
                "Requested hole number has no entry in this BAM file")
        subreads = self.subreadsF.readsByHoleNumber(holeNumber)
        scraps = self.scrapsF.readsByHoleNumber(holeNumber)
        combined = sorted(subreads + scraps, key=lambda x: x.qStart)
        return StitchedZmw(self, combined)

    @property
    @cached
    def featureDescs(self):
        """Map accessor name to feature descriptor for every entry of
        FEATURE_DESCS whose manifest name is a key in the ``DS`` field of the
        (single) read group of the subreads file."""
        rgs = self.subreadsF.peer.header["RG"]
        assert len(rgs) == 1
        rg = rgs[0]
        # DS is a ";"-separated list of "key=value" pairs; keep the keys
        dsEntries = set(pair.split("=")[0] for pair in rg["DS"].split(";"))
        manifestNames = dsEntries.intersection(_possibleFeatureManifestNames)
        return { desc.accessorName : desc for desc in FEATURE_DESCS
                 if desc.nameInManifest in manifestNames }

    @property
    @cached
    def frameRate(self):
        """``FrameRate`` of the first read-group-table row of the subreads file."""
        return self.subreadsF.readGroupTable[0].FrameRate

    @property
    @cached
    def movieName(self):
        """Name of the single movie in the subreads file."""
        mns = list(self.subreadsF.movieNames)
        assert len(mns) == 1
        return mns[0]
def test_empty_bam(self):
    """An IndexedBamReader over the empty BAM reports zero records."""
    empty_reader = IndexedBamReader(data.getEmptyBam())
    n_records = len(empty_reader)
    EQ(n_records, 0)
def test_empty_bam(self):
    """An IndexedBamReader over the empty BAM reports zero records."""
    empty_reader = IndexedBamReader(data.getEmptyBam())
    n_records = len(empty_reader)
    assert n_records == 0
def setup_class(cls):
    """Open ``cls.BAM_FILE`` once and store the reader as ``cls.f``."""
    reader = IndexedBamReader(cls.BAM_FILE)
    cls.f = reader
def test_read_lima_demultiplexed_bam(self):
    """The first record of the lima-demultiplexed BAM has the expected string form."""
    fn = "/pbi/dept/secondary/siv/testdata/pbcore-unittest/data/demultiplex.lbc1--lbc1.bam"
    expected = "Unmapped BAM record: m54008_160219_003234/74056024/1184_3910"
    reader = IndexedBamReader(fn)
    first_record = reader[0]
    assert str(first_record) == expected
def filter_reads(input_bam,
                 output_bam,
                 whitelist=None,
                 blacklist=None,
                 percentage=None,
                 count=None,
                 seed=None,
                 ignore_metadata=False,
                 relative=None,
                 anonymize=False,
                 use_barcodes=False,
                 sample_scraps=False,
                 keep_original_uuid=False,
                 use_subreads=False,
                 min_adapters=None):
    """Write the records of ``input_bam`` selected by a whitelist/blacklist.

    ``output_bam`` ends in ``.bam`` or in a dataset ``.xml`` name; the
    latter requires ``input_bam`` to be an ``.xml`` too, and the BAM name is
    then derived from the dataset type.  When ``percentage``, ``count`` or
    ``min_adapters`` is given, the whitelist is generated by
    ``_create_whitelist`` (optionally also sampling scraps).  Lists are
    interpreted as subreads when ``use_subreads`` is true and as ZMWs
    otherwise.

    For ``.subreadset.xml`` output with scraps present (and ``use_barcodes``
    false) a filtered scraps BAM is written next to the output BAM.
    ``pbindex`` is run on the outputs when available, and for dataset output
    an XML is written carrying over scraps, barcodes, sts.xml, metadata
    (unless ``ignore_metadata``), name and tags from the input.

    Raises ``UserError`` for unsupported input/output combinations, an
    input that is not a ReadSet, or an unindexed input; ``ValueError`` for
    an unknown dataset extension.

    Returns 0.
    """
    _validate_settings(output_bam, whitelist, blacklist, percentage, count,
                       min_adapters)
    output_bam = op.abspath(output_bam)
    if seed is not None:
        random.seed(seed)
    output_ds = base_name = None
    if output_bam.endswith(".xml"):
        if not input_bam.endswith(".xml"):
            raise UserError(
                "DataSet output only supported for DataSet inputs.")
        ds_type = output_bam.split(".")[-2]
        # dataset type -> file extension of the BAM written for it
        ext2 = OrderedDict([("subreadset", "subreads"),
                            ("alignmentset", "subreads"),
                            ("consensusreadset", "ccs"),
                            ("consensusalignmentset", "ccs"),
                            ("transcriptset", "transcripts"),
                            ("transcriptalignmentset", "transcripts")])
        if ds_type not in ext2:
            raise ValueError(
                "Invalid output file extension '{t}.xml'; valid extensions are:\n{e}"
                .format(t=ds_type,
                        e="\n".join([" %s.xml" % e for e in ext2.keys()])))
        output_ds = output_bam
        base_name = ".".join(output_ds.split(".")[:-2])
        output_bam = base_name + "." + ".".join([ext2[ds_type], "bam"])
    if output_bam == input_bam:
        raise UserError("Input and output files must not be the same path")
    elif not output_bam.endswith(".bam"):
        raise UserError("Output file name must end in either '.bam' or '.xml'")
    n_file_reads = 0
    have_zmws = set()
    scraps_bam = barcode_set = sts_xml = None
    with openDataFile(input_bam) as ds_in:
        if not isinstance(ds_in, ReadSet):
            raise UserError("{t} is not an allowed dataset type".format(
                t=type(ds_in).__name__))
        # TODO(nechols)(2016-03-11): refactor this to enable propagation of
        # filtered scraps
        if not ds_in.isIndexed:
            raise UserError("Input BAM must have accompanying .pbi index")
        for ext_res in ds_in.externalResources:
            if ext_res.barcodes is not None:
                assert barcode_set is None or barcode_set == ext_res.barcodes
                # Was ``barcode_set = barcode_set``: a self-assignment that
                # left it None, so barcodes never reached the output dataset.
                barcode_set = ext_res.barcodes
            if ext_res.sts is not None:
                if sts_xml is None:
                    sts_xml = ext_res.sts
                else:
                    log.warning("Multiple sts.xml files, will not propagate")
        f1 = ds_in.resourceReaders()[0]
        if percentage is not None or count is not None or min_adapters is not None:
            bam_readers = list(ds_in.resourceReaders())
            if sample_scraps:
                for ext_res in ds_in.externalResources:
                    if ext_res.scraps is not None:
                        scraps_in = IndexedBamReader(ext_res.scraps)
                        bam_readers.append(scraps_in)
            whitelist = _create_whitelist(bam_readers, percentage, count,
                                          min_adapters)
        # convert these to Python sets
        if use_subreads:
            _whitelist = _process_subread_list(whitelist)
            _blacklist = _process_subread_list(blacklist)
        else:
            _whitelist = _process_zmw_list(whitelist)
            _blacklist = _process_zmw_list(blacklist)
        # The first scraps file only supplies the header template below
        scraps_in = None
        if output_ds is not None and output_ds.endswith(".subreadset.xml"):
            for ext_res in ds_in.externalResources:
                if ext_res.scraps is not None:
                    if use_barcodes:
                        log.warning("Scraps BAM is present but lacks " +
                                    "barcodes - will not be propagated " +
                                    "to output SubreadSet")
                    else:
                        scraps_in = IndexedBamReader(ext_res.scraps)
                    break
        with AlignmentFile(output_bam, 'wb', template=f1.peer) as bam_out:
            for bam_in in ds_in.resourceReaders():
                n_records, have_zmws_ = _process_bam_whitelist(
                    bam_in,
                    bam_out,
                    whitelist=_whitelist,
                    blacklist=_blacklist,
                    use_barcodes=use_barcodes,
                    anonymize=anonymize,
                    use_subreads=use_subreads,
                    qid2mov=ds_in.qid2mov)
                n_file_reads += n_records
                have_zmws.update(have_zmws_)
        if scraps_in is not None:
            scraps_bam = re.sub("subreads.bam$", "scraps.bam", output_bam)
            with AlignmentFile(scraps_bam, 'wb',
                               template=scraps_in.peer) as scraps_out:
                for ext_res in ds_in.externalResources:
                    if ext_res.scraps is not None:
                        # Close each scraps reader once it has been filtered
                        with IndexedBamReader(ext_res.scraps) as scraps_in_:
                            n_records, have_zmws_ = _process_bam_whitelist(
                                scraps_in_,
                                scraps_out,
                                _whitelist,
                                _blacklist,
                                use_barcodes=use_barcodes,
                                anonymize=anonymize,
                                use_subreads=use_subreads)
                        # Scraps records are deliberately not added to
                        # n_file_reads, matching the original behaviour
                        have_zmws.update(have_zmws_)
    if n_file_reads == 0:
        # Logger.warn is a deprecated alias of Logger.warning
        log.warning("No reads written")
    else:
        log.info("{n} records from {z} ZMWs written".format(n=n_file_reads,
                                                            z=len(have_zmws)))

    def _run_pbindex(bam_file):
        # Best effort: a missing pbindex executable only produces a warning
        try:
            subprocess.call(["pbindex", bam_file])
        except OSError as e:
            if e.errno == 2:  # ENOENT: executable not found
                log.warning("pbindex not present, will not create .pbi file")
            else:
                raise

    _run_pbindex(output_bam)
    if output_ds is not None:
        with openDataSet(input_bam) as ds_in:
            ds_out = ds_in.__class__(output_bam)
            if scraps_bam is not None:
                _run_pbindex(scraps_bam)
                ds_out.externalResources[0].scraps = scraps_bam
                # XXX it doesn't pick up the .pbi file - sort of annoying
                # but since the pbcore API doesn't provide a read for the
                # scraps automatically anyway, the impact is minimal
            if barcode_set is not None:
                ds_out.externalResources[0].barcodes = barcode_set
            if sts_xml is not None:
                sts_xml_out = base_name + ".sts.xml"
                log.info("Copying {s} to {d}".format(s=sts_xml, d=sts_xml_out))
                shutil.copyfile(sts_xml, sts_xml_out)
                ds_out.externalResources[0].sts = sts_xml_out
            if not ignore_metadata:
                ds_out.metadata = ds_in.metadata
                ds_out.updateCounts()
            ds_out.name = ds_in.name + " (bamsieve)"
            ds_out.tags = ds_in.tags
            if relative:
                ds_out.makePathsRelative(op.dirname(output_ds))
            if keep_original_uuid:
                log.warning("Keeping input UUID {u}".format(u=ds_in.uuid))
                ds_out.objMetadata["UniqueId"] = ds_in.uuid
            ds_out.write(output_ds)
            log.info("wrote {t} XML to {x}".format(t=ds_out.__class__.__name__,
                                                   x=output_ds))
    return 0
class ZmwReadStitcher(object):
    """
    A reader class that enables viewing the read records corresponding
    to a given ZMW, as present in a paired subreads.bam and scraps.bam,
    as if they were a contiguous ZMW read record.
    """

    def __init__(self, subreadsFname, scrapsFname=None):
        """Open ``subreadsFname`` and its scraps file.

        ``subreadsFname`` must end in ``.subreads.bam``.  When
        ``scrapsFname`` is omitted it is derived by swapping the trailing
        ``subreads.bam`` for ``scraps.bam``.

        Raises ``Exception`` on a wrong suffix, when the subreads file does
        not contain exactly one movie, or when the two files report
        different movie names.
        """
        suffix = "subreads.bam"
        if not subreadsFname.endswith("." + suffix):
            # Call form of ``raise`` is valid on Python 2 and 3
            raise Exception("Expecting a subreads.bam")
        if scrapsFname is None:
            # Swap only the trailing suffix; ``str.replace`` would also
            # rewrite a directory component containing "subreads.bam".
            scrapsFname = subreadsFname[:-len(suffix)] + "scraps.bam"
        self.subreadsF = IndexedBamReader(subreadsFname)
        self.scrapsF = IndexedBamReader(scrapsFname)
        if (len(self.subreadsF.movieNames) != 1 or
                self.scrapsF.movieNames != self.subreadsF.movieNames):
            raise Exception(
                "Requires single movie BAM file, and matching scraps")

    @property
    def filename(self):
        """Filename reported by the subreads reader."""
        return self.subreadsF.filename

    @property
    def hasPulseFeatures(self):
        """True only when both the subreads and scraps readers report pulse features."""
        return (self.subreadsF.hasPulseFeatures() and
                self.scrapsF.hasPulseFeatures())

    @property
    @cached
    def sequencingZmws(self):
        """
        Hole numbers for which we have basecalls and an HQ region
        """
        return sorted(set(self.subreadsF.holeNumber))

    @property
    @cached
    def allSequencingZmws(self):
        """
        Hole numbers for which we have basecalls
        """
        return sorted(
            set.union(set(self.subreadsF.holeNumber),
                      set(self.scrapsF.holeNumber)))

    def __getitem__(self, holeNumber):
        """Return a StitchedZmw built from all subread and scrap records of
        ``holeNumber``, ordered by ``qStart``.

        Raises ``IndexError`` when the hole number appears in neither file.
        """
        if holeNumber not in self.allSequencingZmws:
            raise IndexError(
                "Requested hole number has no entry in this BAM file")
        subreads = self.subreadsF.readsByHoleNumber(holeNumber)
        scraps = self.scrapsF.readsByHoleNumber(holeNumber)
        combined = sorted(subreads + scraps, key=lambda x: x.qStart)
        return StitchedZmw(self, combined)

    @property
    @cached
    def featureDescs(self):
        """Map accessor name to feature descriptor for every entry of
        FEATURE_DESCS whose manifest name is a key in the ``DS`` field of the
        (single) read group of the subreads file."""
        rgs = self.subreadsF.peer.header["RG"]
        assert len(rgs) == 1
        rg = rgs[0]
        # DS is a ";"-separated list of "key=value" pairs; keep the keys
        dsEntries = set(pair.split("=")[0] for pair in rg["DS"].split(";"))
        manifestNames = dsEntries.intersection(_possibleFeatureManifestNames)
        return {
            desc.accessorName: desc
            for desc in FEATURE_DESCS
            if desc.nameInManifest in manifestNames
        }

    @property
    @cached
    def frameRate(self):
        """``FrameRate`` of the first read-group-table row of the subreads file."""
        return self.subreadsF.readGroupTable[0].FrameRate

    @property
    @cached
    def movieName(self):
        """Name of the single movie in the subreads file."""
        mns = list(self.subreadsF.movieNames)
        assert len(mns) == 1
        return mns[0]
def __init__(self):
    """Open ``self.BAM_FILE`` as ``self.f``; raise SkipTest when the file is absent."""
    bam_path = self.BAM_FILE
    if op.isfile(bam_path):
        self.f = IndexedBamReader(self.BAM_FILE)
    else:
        raise SkipTest("Testdata not present")