def bam_innerdist(bam1, bam2, summaryout=None):
    """Tally inner distances and strand orientations for reads paired across two BAMs.

    Both inputs are walked in lockstep through ``bam_iter``. Within each
    input, consecutive records that repeat the previous read name are
    skipped, so only the first record per name is used.

    Args:
        bam1: source of first reads, handed to ``bam_iter``.
        bam2: source of second reads, handed to ``bam_iter`` (quiet mode).
        summaryout: optional writable object; when given, every computed
            distance is written to it, one per line.

    Returns:
        A tuple ``(total, proper, mean, stdev, orientation_count)`` where
        ``total`` is the number of name-matched pairs seen, ``proper`` the
        number with both reads mapped to the same reference, ``mean`` and
        ``stdev`` come from ``counts_mean_stdev`` over the distance
        histogram, and ``orientation_count`` maps ``"+/-"``-style keys to
        pair counts.

    Raises:
        ValueError: if the two inputs disagree on the read name at the
            same position in the stream.
    """
    iter1 = bam_iter(bam1)
    iter2 = bam_iter(bam2, quiet=True)

    distances = {}
    total = 0
    proper = 0
    orientation_count = {"+/-": 0, "-/+": 0, "+/+": 0, "-/-": 0}

    read1_last = None
    read2_last = None
    read1 = None
    read2 = None

    while True:
        try:
            # The next() builtin works on Python 2.6+ and Python 3 iterators,
            # unlike the Python 2-only ``iterator.next()`` method.
            while not read1 or read1_last == read1.qname:
                read1 = next(iter1)
            while not read2 or read2_last == read2.qname:
                read2 = next(iter2)
        except StopIteration:
            # Either input running dry ends the comparison.
            break

        if read1.qname != read2.qname:
            raise ValueError("Error: BAM files aren't properly paired! (%s, %s)\n" % (read1.qname, read2.qname))

        read1_last = read1.qname
        read2_last = read2.qname

        total += 1

        if read1.is_unmapped or read2.is_unmapped or read1.tid != read2.tid:
            continue

        proper += 1

        # Distance runs from the end of the leftmost read to the start of
        # the other one.
        if read1.pos < read2.pos:
            dist = read2.pos - read1.aend
        else:
            dist = read1.pos - read2.aend

        if summaryout:
            summaryout.write("%s\n" % dist)

        distances[dist] = distances.get(dist, 0) + 1

        orientation = "%s/%s" % ("-" if read1.is_reverse else "+", "-" if read2.is_reverse else "+")
        orientation_count[orientation] += 1

    mean, stdev = counts_mean_stdev(distances)
    return total, proper, mean, stdev, orientation_count
def bam_renamepair(infile, outfile, delim='/'):
    """Copy a BAM file, passing every read through ``read_renamepair``.

    Args:
        infile: path of the BAM file to read.
        outfile: path of the BAM file to write; it is opened with the
            input as its template.
        delim: delimiter forwarded to ``read_renamepair``.
    """
    bam = pysam.Samfile(infile, "rb")
    out = None
    try:
        out = pysam.Samfile(outfile, "wb", template=bam)
        for read in bam_iter(bam):
            read_renamepair(read, delim)
            out.write(read)
    finally:
        # Release both handles even when reading or writing fails.
        bam.close()
        if out is not None:
            out.close()
def bam_filter(infile, outfile, criteria, failedfile=None, verbose=False):
    """Write the reads of ``infile`` that satisfy every criterion to ``outfile``.

    Args:
        infile: path of the BAM file to read.
        outfile: path of the BAM file that receives the kept reads.
        criteria: iterable of objects exposing ``filter(bamfile, read)``
            and ``close()``. It is iterated several times, so it must be
            re-iterable (for example a list).
        failedfile: optional path; when given, one ``qname<TAB>criterion``
            line is written there for every rejected read.
        verbose: when true, the settings are echoed to stderr first.

    A read is rejected by the first criterion whose ``filter`` returns a
    false value; later criteria are not consulted for that read. The
    kept/failed totals are written to stdout, after which ``close()`` is
    called on every criterion.
    """
    if verbose:
        sys.stderr.write('Input file : %s\n' % infile)
        sys.stderr.write('Output file : %s\n' % outfile)
        if failedfile:
            sys.stderr.write('Failed reads: %s\n' % failedfile)
        sys.stderr.write('Criteria:\n')
        for criterion in criteria:
            sys.stderr.write(' %s\n' % criterion)
        sys.stderr.write('\n')

    passed = 0
    failed = 0

    bamfile = pysam.Samfile(infile, "rb")
    kept_out = None
    failed_out = None
    try:
        kept_out = pysam.Samfile(outfile, "wb", template=bamfile)
        if failedfile:
            failed_out = open(failedfile, 'w')

        for read in bam_iter(bamfile, quiet=True):
            for criterion in criteria:
                if not criterion.filter(bamfile, read):
                    failed += 1
                    if failed_out:
                        failed_out.write('%s\t%s\n' % (read.qname, criterion))
                    break
            else:
                # No criterion rejected the read (also the case when
                # ``criteria`` is empty).
                passed += 1
                kept_out.write(read)
    finally:
        # Close every handle that was opened, even if filtering raised.
        bamfile.close()
        if kept_out is not None:
            kept_out.close()
        if failed_out:
            failed_out.close()

    sys.stdout.write("%s kept\n%s failed\n" % (passed, failed))

    for criterion in criteria:
        criterion.close()
def bam_export(bam, mapped=True, unmapped=True, whitelist=None, blacklist=None, fields=None, out=sys.stdout, quiet=False):
    """Send selected reads from ``bam`` to ``export_read``.

    Args:
        bam: read source handed to ``bam_iter`` and ``export_read``.
        mapped: export reads that are mapped.
        unmapped: export reads that are unmapped.
        whitelist: optional container of read names; when non-empty, only
            reads named in it are considered.
        blacklist: optional container of read names to skip.
        fields: forwarded unchanged to ``export_read``.
        out: forwarded unchanged to ``export_read``.
        quiet: forwarded to ``bam_iter``.

    An ``IOError`` raised while exporting ends the loop silently.
    """
    for record in bam_iter(bam, quiet=quiet):
        name_allowed = not whitelist or record.qname in whitelist
        if not name_allowed:
            continue
        if blacklist and record.qname in blacklist:
            continue

        try:
            # Pick the switch that governs this read's mapping state.
            wanted = unmapped if record.is_unmapped else mapped
            if wanted:
                export_read(bam, record, fields, out)
        except IOError:
            # NOTE(review): presumably the output was closed early (e.g. a
            # closed pipe) - confirm; the original stops quietly here too.
            break
def bam_filter(infile, outfile, criteria, failedfile=None, verbose=False):
    """Write the reads of ``infile`` that satisfy every criterion to ``outfile``.

    Args:
        infile: path of the BAM file to read.
        outfile: path of the BAM file that receives the kept reads.
        criteria: iterable of objects exposing ``filter(bamfile, read)``
            and ``close()``. It is iterated several times, so it must be
            re-iterable (for example a list).
        failedfile: optional path; when given, one ``qname<TAB>criterion``
            line is written there for every rejected read.
        verbose: when true, the settings are echoed to stderr first.

    A read is rejected by the first criterion whose ``filter`` returns a
    false value; later criteria are not consulted for that read. The
    kept/failed totals are written to stdout, after which ``close()`` is
    called on every criterion.
    """
    if verbose:
        sys.stderr.write('Input file : %s\n' % infile)
        sys.stderr.write('Output file : %s\n' % outfile)
        if failedfile:
            sys.stderr.write('Failed reads: %s\n' % failedfile)
        sys.stderr.write('Criteria:\n')
        for criterion in criteria:
            sys.stderr.write(' %s\n' % criterion)
        sys.stderr.write('\n')

    passed = 0
    failed = 0

    bamfile = pysam.Samfile(infile, "rb")
    kept_out = None
    failed_out = None
    try:
        kept_out = pysam.Samfile(outfile, "wb", template=bamfile)
        if failedfile:
            failed_out = open(failedfile, 'w')

        for read in bam_iter(bamfile, quiet=True):
            for criterion in criteria:
                if not criterion.filter(bamfile, read):
                    failed += 1
                    if failed_out:
                        failed_out.write('%s\t%s\n' % (read.qname, criterion))
                    break
            else:
                # No criterion rejected the read (also the case when
                # ``criteria`` is empty).
                passed += 1
                kept_out.write(read)
    finally:
        # Close every handle that was opened, even if filtering raised.
        bamfile.close()
        if kept_out is not None:
            kept_out.close()
        if failed_out:
            failed_out.close()

    sys.stdout.write("%s kept\n%s failed\n" % (passed, failed))

    for criterion in criteria:
        criterion.close()
def bam_cleancigar(infile, outfile):
    """Copy a BAM file, applying ``read_cleancigar`` to every read.

    Args:
        infile: path of the BAM file to read.
        outfile: path of the BAM file to write; opened with the input as
            its template.

    Every read is written to the output. Reads for which
    ``read_cleancigar`` returns a true value are counted as altered, and
    the written/altered totals are reported on stderr.
    """
    total = 0
    count = 0

    bam = pysam.Samfile(infile, "rb")
    out = None
    try:
        out = pysam.Samfile(outfile, "wb", template=bam)
        for read in bam_iter(bam):
            if read_cleancigar(read):
                count += 1
            total += 1
            out.write(read)
    finally:
        # Release both handles even when processing fails part-way.
        bam.close()
        if out is not None:
            out.close()

    sys.stderr.write('Wrote: %s reads\nAltered: %s\n' % (total, count))
def bam_split(infile, out_template, read_count=1000000, reference=False, quiet=False):
    """Split a BAM file into several smaller BAM files.

    Args:
        infile: path of the BAM file to read.
        out_template: prefix for the output names.
        read_count: in count mode, start a new file once the current one
            holds this many reads.
        reference: when true, split by reference instead: a new file named
            ``<out_template>.<reference name>.bam`` is started whenever the
            reference id changes. Reads with a negative reference id are
            not written anywhere.
        quiet: suppress the final summary on stderr.

    In count mode the outputs are named ``<out_template>.<n>.bam`` with
    ``n`` starting at 1. The summary reports the number of files actually
    opened.
    """
    bamfile = pysam.Samfile(infile, "rb")
    outfile = None
    file_count = 0
    count = 0
    first = True
    lastref = -1

    try:
        for read in bam_iter(bamfile):
            if reference:
                rollover = first or read.tid != lastref
            else:
                rollover = outfile is None or count >= read_count
            first = False

            if rollover:
                if outfile is not None:
                    outfile.close()
                    # Forget the closed handle so it cannot be closed twice
                    # if opening the next file fails.
                    outfile = None
                count = 0

                if reference:
                    fname = None
                    if read.tid >= 0:
                        fname = '%s.%s.bam' % (out_template, bamfile.getrname(read.tid))
                else:
                    fname = '%s.%s.bam' % (out_template, file_count + 1)

                if fname:
                    outfile = pysam.Samfile(fname, "wb", template=bamfile)
                    # Count only files that were really created; reads
                    # without a reference must not inflate the total.
                    file_count += 1

            if outfile is not None:
                outfile.write(read)
                count += 1

            lastref = read.tid
    finally:
        bamfile.close()
        if outfile is not None:
            outfile.close()

    if not quiet:
        sys.stderr.write("Split into %s files" % (file_count))
def bam_tofastx(fname, colorspace=False, show_mapped=True, show_unmapped=True, fastq=True, read1=True, read2=True, proper=False):
    """Emit the reads of a BAM file through ``write_fastq`` or ``write_fasta``.

    Args:
        fname: passed to ``bam_open`` to obtain the read source.
        colorspace: forwarded to the writer function.
        show_mapped: include mapped reads.
        show_unmapped: include unmapped reads.
        fastq: use ``write_fastq`` when true, ``write_fasta`` otherwise.
        read1: when false, skip reads flagged as first-in-pair.
        read2: when false, skip reads flagged as second-in-pair.
        proper: when true, skip reads not flagged as a proper pair.

    A read whose (name, sequence) equals that of the most recently written
    read is skipped. Nothing happens when both ``show_mapped`` and
    ``show_unmapped`` are exactly ``False``.
    """
    if show_mapped is False and show_unmapped is False:
        return

    # NOTE(review): the handle returned by bam_open is never closed here -
    # confirm whether it needs an explicit close().
    sam = bam_open(fname)
    emit = write_fastq if fastq else write_fasta

    previous = None
    for read in bam_iter(sam):
        if (not read1 and read.is_read1) or (not read2 and read.is_read2) or (proper and not read.is_proper_pair):
            continue

        key = (read.qname, read.seq)
        if previous == key:
            continue

        # Pick the switch that governs this read's mapping state.
        wanted = show_unmapped if read.is_unmapped else show_mapped
        if not wanted:
            continue

        emit(read, colorspace=colorspace)
        previous = key
def bam_junction_count(bam, ref=None, start=None, end=None, out=sys.stdout, quiet=False):
    """Count distinct read names supporting the first splice gap of each read.

    Args:
        bam: read source handed to ``bam_iter``; ``bam.references`` is
            used to turn reference ids into names.
        ref, start, end: region restriction forwarded to ``bam_iter``.
        out: writable object receiving ``junction<TAB>count`` lines.
        quiet: forwarded to ``bam_iter``.

    For every mapped read the CIGAR is walked up to the first skipped
    region (op 3); the junction is reported as ``ref:start-end`` of that
    gap. Only the first gap of a read is recorded. Counts are flushed
    whenever the reference id changes and once more at the end.
    """
    def _emit(table):
        # One line per junction: name, number of distinct read names.
        for name in table:
            out.write('%s\t%s\n' % (name, len(table[name])))

    last_tid = None
    junctions = {}

    for read in bam_iter(bam, ref=ref, start=start, end=end, quiet=quiet):
        if read.is_unmapped:
            continue

        if read.tid != last_tid and junctions:
            _emit(junctions)
            junctions = {}
        last_tid = read.tid

        pos = read.pos
        gap_end = None
        for op, size in read.cigar or ():
            if op in (0, 2, 7, 8):
                # M, D, = and X consume reference bases. Insertions (1)
                # and clipping (4, 5) do not, so they must not move pos.
                pos += size
            elif op == 3:
                gap_end = pos + size
                break

        if gap_end is None:
            continue

        junction = '%s:%s-%s' % (bam.references[read.tid], pos, gap_end)
        junctions.setdefault(junction, set()).add(read.qname)

    _emit(junctions)
def bam_removeclipping(infile, outfile):
    """Copy a BAM file, applying ``read_removeclipping`` to every read.

    Args:
        infile: path of the BAM file to read.
        outfile: path of the BAM file to write; opened with the input as
            its template.

    Every read is written to the output. A return code of 1 from
    ``read_removeclipping`` is tallied as "Unmapped" and a code of 2 as
    "Altered"; the totals are reported on stderr.
    """
    total = 0
    count = 0
    unmapped = 0

    bam = pysam.Samfile(infile, "rb")
    out = None
    try:
        out = pysam.Samfile(outfile, "wb", template=bam)
        for read in bam_iter(bam):
            code = read_removeclipping(read)
            if code == 1:
                unmapped += 1
            elif code == 2:
                count += 1
            total += 1
            out.write(read)
    finally:
        # Release both handles even when processing fails part-way.
        bam.close()
        if out is not None:
            out.close()

    sys.stderr.write('Wrote: %s reads\nAltered: %s\nUnmapped: %s\n' % (total, count, unmapped))
def get_counts(self, bam, ref=None, start=None, end=None, quiet=False):
    """Feed every selected mapped read of ``bam`` to ``self._add_read``.

    Args:
        bam: read source handed to ``bam_iter``; ``bam.lengths`` and
            ``bam.references`` are indexed by reference id.
        ref, start, end: region restriction forwarded to ``bam_iter``.
        quiet: forwarded to ``bam_iter``.

    Unmapped reads are skipped. When ``self.strand`` is ``'+'`` or
    ``'-'``, reads on the other strand are skipped too. Each time the
    reference id changes, non-empty pending counts are flushed and
    ``self.pos_counts`` is replaced by a zero-filled unsigned-int array
    with one slot per base of the new reference. ``self.flush()`` is
    called once more after the last read.
    """
    def _progress(read):
        # Progress text handed to bam_iter: chromosome, position, buffer size.
        return '%s:%s (%s)' % (self.cur_chrom, read.pos, len(self.pos_counts) if self.pos_counts else 0)

    for read in bam_iter(bam, ref=ref, start=start, end=end, quiet=quiet, callback=_progress):
        if read.is_unmapped:
            continue

        if self.strand:
            if self.strand == '+' and read.is_reverse:
                continue
            elif self.strand == '-' and not read.is_reverse:
                continue

        if self.cur_tid is None or read.tid != self.cur_tid:
            if self.pos_counts:
                self.flush()

            # Repeating a one-element array gives the same zero-filled
            # result without first building a reference-length Python list.
            self.pos_counts = array('I', [0]) * bam.lengths[read.tid]
            self.cur_tid = read.tid
            self.cur_chrom = bam.references[read.tid]

        self._add_read(read)

    self.flush()
def bam_split(infile, out_template, read_count=1000000, reference=False, quiet=False):
    """Split a BAM file into several smaller BAM files.

    Args:
        infile: path of the BAM file to read.
        out_template: prefix for the output names.
        read_count: in count mode, start a new file once the current one
            holds this many reads.
        reference: when true, split by reference instead: a new file named
            ``<out_template>.<reference name>.bam`` is started whenever the
            reference id changes. Reads with a negative reference id are
            not written anywhere.
        quiet: suppress the final summary on stderr.

    In count mode the outputs are named ``<out_template>.<n>.bam`` with
    ``n`` starting at 1. The summary reports the number of files actually
    opened.
    """
    bamfile = pysam.Samfile(infile, "rb")
    outfile = None
    file_count = 0
    count = 0
    first = True
    lastref = -1

    try:
        for read in bam_iter(bamfile):
            if reference:
                rollover = first or read.tid != lastref
            else:
                rollover = outfile is None or count >= read_count
            first = False

            if rollover:
                if outfile is not None:
                    outfile.close()
                    # Forget the closed handle so it cannot be closed twice
                    # if opening the next file fails.
                    outfile = None
                count = 0

                if reference:
                    fname = None
                    if read.tid >= 0:
                        fname = '%s.%s.bam' % (out_template, bamfile.getrname(read.tid))
                else:
                    fname = '%s.%s.bam' % (out_template, file_count + 1)

                if fname:
                    outfile = pysam.Samfile(fname, "wb", template=bamfile)
                    # Count only files that were really created; reads
                    # without a reference must not inflate the total.
                    file_count += 1

            if outfile is not None:
                outfile.write(read)
                count += 1

            lastref = read.tid
    finally:
        bamfile.close()
        if outfile is not None:
            outfile.close()

    if not quiet:
        sys.stderr.write("Split into %s files" % (file_count))
def pcrdup_mark(inbam, outbam, fragment=False, countfile=None):
    """Group reads by position and hand each group to ``__flush_cur_reads``.

    Args:
        inbam: read source iterated with ``bam_iter``; also forwarded to
            ``__flush_cur_reads``.
        outbam: optional writer; unmapped reads and reads that cannot be
            evaluated as pairs are written to it directly.
        fragment: when true, reads are grouped by (reference, position)
            only; otherwise the insert size is part of the key.
        countfile: forwarded to ``__flush_cur_reads``.

    Reads sharing a key with an earlier read at the current position are
    counted as duplicates. Totals (reads, unique, duplicates) are written
    to stdout at the end.
    """
    cur_pos = None
    cur_reads = {}

    total = 0
    unique = 0
    duplicates = 0

    dup_list = set()

    def callback(read):
        return '%s, %s, %s - %s' % (total, unique, duplicates, read.qname)

    # Iterate the BAM that was passed in; the previous code referenced an
    # undefined name here.
    for read in bam_iter(inbam, callback=callback):
        if not read.is_paired or read.is_read1:
            total += 1

        if read.is_unmapped:
            # NOTE(review): cur_reads is not reset after this flush, so the
            # same group may be flushed again later - confirm against
            # __flush_cur_reads.
            __flush_cur_reads(cur_reads, outbam, inbam, countfile)
            if outbam:
                outbam.write(read)
            continue

        start_pos = (read.tid, read.pos)

        if fragment:
            dup_pos = (read.tid, read.pos, '')
        else:
            # isize is the insert length, which if this is the first read, will
            # be the right most part of the second read. If the ends of the reads
            # are trimmed for QC reasons, only the 5' pos of the first read and the 3'
            # pos of the second read will be accurate.
            dup_pos = (read.tid, read.pos, read.isize)

        if not cur_pos or start_pos != cur_pos:
            __flush_cur_reads(cur_reads, outbam, inbam, countfile)

            cur_pos = start_pos
            cur_reads = {}
            idx = 0

        if not fragment and (read.mate_is_unmapped or not read.is_paired or not read.is_proper_pair or read.isize < 0):
            # this is a paired file, but the mate isn't paired or proper or mapped
            # just write it out, no flags to set.
            if read.qname in dup_list:
                read.is_duplicate = True
                dup_list.remove(read.qname)

            if outbam:
                outbam.write(read)
        elif dup_pos in cur_reads:
            duplicates += 1
            if not fragment:
                # Remember the name so the mate can be flagged when it
                # shows up in the branch above.
                dup_list.add(read.qname)
            cur_reads[dup_pos].append((read.mapq, -idx, read))
        else:
            unique += 1
            cur_reads[dup_pos] = [(read.mapq, -idx, read), ]

        idx += 1

    __flush_cur_reads(cur_reads, outbam, inbam, countfile)

    sys.stdout.write('Total reads:\t%s\n' % total)
    sys.stdout.write('Unique reads:\t%s\n' % unique)
    sys.stdout.write('PCR duplicates:\t%s\n' % duplicates)
def _gen2():
    """Yield every read of ``self.bam`` unchanged, reporting progress to ``bam_iter``."""
    def _progress(read):
        # Current read location, buffer size, then the current region.
        parts = (
            self.bam.getrname(read.tid),
            read.pos,
            len(self.buffer),
            self.cur_chrom,
            self.cur_start,
            self.cur_end,
        )
        return '%s:%s (%s) %s:%s-%s' % parts

    reads = bam_iter(self.bam, quiet=self.quiet, callback=_progress)
    for record in reads:
        yield record
def bam_tobed(fname, out=sys.stdout):
    """Pass every read of a BAM file to ``write_read`` with its reference name.

    Args:
        fname: path of the BAM file to read.
        out: forwarded unchanged to ``write_read``.
    """
    bamfile = pysam.Samfile(fname, "rb")
    try:
        for read in bam_iter(bamfile):
            write_read(read, bamfile.getrname(read.rname), out)
    finally:
        # Release the handle even when writing fails part-way.
        bamfile.close()
def filter(self):
    """Yield every read of ``self.bamfile``, then close the file.

    The file is closed in a ``finally`` clause, so it is also released
    when iteration raises or when the consumer stops early and the
    generator is closed.
    """
    try:
        for read in bam_iter(self.bamfile):
            yield read
    finally:
        self.bamfile.close()
def filter(self, bam):
    """Yield, unchanged, every read that ``bam_iter`` produces for ``bam``."""
    reads = bam_iter(bam)
    for record in reads:
        yield record
def _foo2():
    """Yield, unchanged, every read that ``bam_iter`` produces for ``bamfile``."""
    reads = bam_iter(bamfile)
    for record in reads:
        yield record
def bam_innerdist(bam1, bam2, summaryout=None):
    """Tally inner distances and strand orientations for reads paired across two BAMs.

    Both inputs are walked in lockstep through ``bam_iter``. Within each
    input, consecutive records that repeat the previous read name are
    skipped, so only the first record per name is used.

    Args:
        bam1: source of first reads, handed to ``bam_iter``.
        bam2: source of second reads, handed to ``bam_iter`` (quiet mode).
        summaryout: optional writable object; when given, every computed
            distance is written to it, one per line.

    Returns:
        A tuple ``(total, proper, mean, stdev, orientation_count)`` where
        ``total`` is the number of name-matched pairs seen, ``proper`` the
        number with both reads mapped to the same reference, ``mean`` and
        ``stdev`` come from ``counts_mean_stdev`` over the distance
        histogram, and ``orientation_count`` maps ``'+/-'``-style keys to
        pair counts.

    Raises:
        ValueError: if the two inputs disagree on the read name at the
            same position in the stream.
    """
    iter1 = bam_iter(bam1)
    iter2 = bam_iter(bam2, quiet=True)

    distances = {}
    total = 0
    proper = 0
    orientation_count = {
        '+/-': 0,
        '-/+': 0,
        '+/+': 0,
        '-/-': 0,
    }

    read1_last = None
    read2_last = None
    read1 = None
    read2 = None

    while True:
        try:
            # The next() builtin works on Python 2.6+ and Python 3 iterators,
            # unlike the Python 2-only ``iterator.next()`` method.
            while not read1 or read1_last == read1.qname:
                read1 = next(iter1)
            while not read2 or read2_last == read2.qname:
                read2 = next(iter2)
        except StopIteration:
            # Either input running dry ends the comparison.
            break

        if read1.qname != read2.qname:
            raise ValueError(
                "Error: BAM files aren't properly paired! (%s, %s)\n" %
                (read1.qname, read2.qname))

        read1_last = read1.qname
        read2_last = read2.qname

        total += 1

        if read1.is_unmapped or read2.is_unmapped or read1.tid != read2.tid:
            continue

        proper += 1

        # Distance runs from the end of the leftmost read to the start of
        # the other one.
        if read1.pos < read2.pos:
            dist = read2.pos - read1.aend
        else:
            dist = read1.pos - read2.aend

        if summaryout:
            summaryout.write('%s\n' % dist)

        distances[dist] = distances.get(dist, 0) + 1

        orientation = '%s/%s' % ('-' if read1.is_reverse else '+',
                                 '-' if read2.is_reverse else '+')
        orientation_count[orientation] += 1

    mean, stdev = counts_mean_stdev(distances)
    return total, proper, mean, stdev, orientation_count