def bam_find_regions(bam_name, merge_distance=10, min_read_count=2, only_uniq_starts=False, nostrand=False, out=sys.stdout):
    bamfile = bam_open(bam_name)
    region_plus = None
    region_minus = None

    for pileup in bam_pileup_iter(bamfile, mask=1540):
        chrom = bamfile.getrname(pileup.tid)

        for read in pileup.pileups:
            if read.is_del:
                continue

            if nostrand or not read.alignment.is_reverse:
                # plus strand (or strand ignored): start a new region if none is
                # open, the chromosome changed, or the gap since the last covered
                # position exceeds merge_distance
                if not region_plus or region_plus.chrom != chrom or (region_plus.end + merge_distance) < pileup.pos:
                    if region_plus and region_plus.read_count >= min_read_count:
                        region_plus.write(out)
                    region_plus = ExpressedRegion(chrom, only_uniq_starts)
                region_plus.add_column(read, pileup.pos)
            else:
                # minus strand
                if not region_minus or region_minus.chrom != chrom or (region_minus.end + merge_distance) < pileup.pos:
                    if region_minus and region_minus.read_count >= min_read_count:
                        region_minus.write(out)
                    region_minus = ExpressedRegion(chrom, only_uniq_starts)
                region_minus.add_column(read, pileup.pos)

    # flush the last open region on each strand
    if region_plus and region_plus.read_count >= min_read_count:
        region_plus.write(out)
    if region_minus and region_minus.read_count >= min_read_count:
        region_minus.write(out)

    bamfile.close()
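# Minimal usage sketch (hypothetical file names and parameter values); assumes the
# helpers used above (bam_open, bam_pileup_iter, ExpressedRegion) are in scope:
#
#     with open('expressed_regions.txt', 'w') as out:
#         bam_find_regions('sample.bam', merge_distance=20, min_read_count=5, out=out)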
def bam_tofastx(fname, colorspace=False, show_mapped=True, show_unmapped=True, fastq=True, read1=True, read2=True, proper=False):
    if show_mapped is False and show_unmapped is False:
        return

    sam = bam_open(fname)
    last_key = None

    for read in bam_iter(sam):
        if not read1 and read.is_read1:
            continue
        if not read2 and read.is_read2:
            continue
        if proper and not read.is_proper_pair:
            continue

        # skip consecutive records with the same name and sequence
        k = (read.qname, read.seq)
        if last_key == k:
            continue

        show = False
        if show_mapped and not read.is_unmapped:
            show = True
        if show_unmapped and read.is_unmapped:
            show = True

        if not show:
            continue

        if fastq:
            write_fastq(read, colorspace=colorspace)
        else:
            write_fasta(read, colorspace=colorspace)

        last_key = k
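# Hypothetical example: export only unmapped, first-in-pair reads as FASTQ on
# stdout ('sample.bam' is a placeholder file name):
#
#     bam_tofastx('sample.bam', show_mapped=False, show_unmapped=True,
#                 fastq=True, read1=True, read2=False)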
def usage():
    print __doc__
    print """\
Usage: bamutils peakheight {options} bamfile peaks.bed
"""
    sys.exit(1)


if __name__ == "__main__":
    bam_fname = None
    bed_fname = None

    for arg in sys.argv[1:]:
        if arg == '-h':
            usage()
        elif not bam_fname and os.path.exists(arg):
            bam_fname = arg
        elif not bed_fname and os.path.exists(arg):
            bed_fname = arg
        else:
            print 'Unknown argument: %s' % arg
            usage()

    if not bam_fname or not bed_fname:
        usage()

    bam = bam_open(bam_fname)
    with open(bed_fname) as f:
        bam_peakheight(bam, f)
    bam.close()
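# Example invocation, following the usage string above (file names are placeholders):
#
#     bamutils peakheight sample.bam peaks.bed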
def bam_stats(infiles, gtf_file=None, region=None, delim=None, tags=[], show_all=False, fillin_stats=True):
    if gtf_file:
        gtf = GTF(gtf_file)
    else:
        gtf = None

    sys.stderr.write('Calculating Read stats...\n')
    stats = [BamStats(bam_open(x), gtf, region, delim, tags, show_all=show_all) for x in infiles]

    sys.stdout.write('\t')
    for fname, stat in zip(infiles, stats):
        sys.stdout.write('%s\t\t' % fname)
    sys.stdout.write('\n')

    sys.stdout.write('Reads:\t')
    for stat in stats:
        sys.stdout.write('%s\t\t' % stat.total)
    sys.stdout.write('\n')

    sys.stdout.write('Mapped:\t')
    for stat in stats:
        sys.stdout.write('%s\t\t' % stat.mapped)
    sys.stdout.write('\n')

    sys.stdout.write('Unmapped:\t')
    for stat in stats:
        sys.stdout.write('%s\t\t' % stat.unmapped)
    sys.stdout.write('\n')

    sys.stdout.write('\nFlag distribution\n')
    validflags = set()
    maxsize = 0
    for flag in flag_descriptions:
        for stat in stats:
            if stat.flag_counts.counts[flag] > 0:
                validflags.add(flag)
                maxsize = max(maxsize, len(flag_descriptions[flag]))

    for flag in sorted(validflags):
        sys.stdout.write("[0x%03x] %-*s" % (flag, maxsize, flag_descriptions[flag]))
        for stat in stats:
            sys.stdout.write('\t%s\t%0.2f%%' % (stat.flag_counts.counts[flag], (float(stat.flag_counts.counts[flag]) * 100 / stat.total)))
        sys.stdout.write('\n')
    sys.stdout.write('\n')

    if stats[0].tlen_counts:
        sys.stdout.write('Template length:')
        for stat in stats:
            mean, stdev = counts_mean_stdev(stat.tlen_counts)
            sys.stdout.write('\t%0.2f\t+/- %0.2f' % (mean, stdev))
        sys.stdout.write('\n')
    sys.stdout.write('\n')

    stat_tags = {}
    for tag in stats[0].tagbins:
        stat_tags[tag] = []
        for stat in stats:
            stat_tags[tag].append(stat.tagbins[tag])

    for tag in stat_tags:
        asc = stats[0].tagbins[tag].asc

        sys.stdout.write("Ave %s:" % tag)
        for i, tagbin in enumerate(stat_tags[tag]):
            sys.stdout.write('\t%s' % tagbin.mean)
            if i != len(stats):
                sys.stdout.write('\t')
        sys.stdout.write('\n')

        sys.stdout.write("Max %s:" % tag)
        for i, tagbin in enumerate(stat_tags[tag]):
            sys.stdout.write('\t%s' % tagbin.max)
            if i != len(stats):
                sys.stdout.write('\t')
        sys.stdout.write('\n')

        sys.stdout.write('%s distribution:\n' % tag)

        gens = []
        gen_vals = []
        last_pcts = []

        for stat in stats:
            gens.append(stat.distribution_gen(tag))
            gen_vals.append(None)
            last_pcts.append(0.0)

        good = True
        last = None

        while good:
            good = False

            for i, stat in enumerate(stats):
                if not gen_vals[i]:
                    try:
                        gen_vals[i] = gens[i].next()
                    except StopIteration:
                        pass

            vals = [tup[0] for tup in gen_vals if tup]
            if not vals:
                continue

            if asc:
                minval = min(vals)
            else:
                minval = max(vals)

            if last and type(last) == int and fillin_stats:
                if asc:
                    last += 1
                    # fill in missing values
                    while last < minval:
                        sys.stdout.write('%s' % last)
                        for i, stat in enumerate(stats):
                            sys.stdout.write('\t0\t%s' % last_pcts[i])
                        sys.stdout.write('\n')
                        last += 1
                else:
                    last -= 1
                    # fill in missing values
                    while last > minval:
                        sys.stdout.write('%s' % last)
                        for i, stat in enumerate(stats):
                            sys.stdout.write('\t0\t%s' % last_pcts[i])
                        sys.stdout.write('\n')
                        last -= 1

            last = minval

            sys.stdout.write(str(minval))
            for i, tup in enumerate(gen_vals):
                if tup and tup[0] == minval:
                    sys.stdout.write('\t%s\t%s' % (tup[1], tup[2]))
                    last_pcts[i] = tup[2]
                    gen_vals[i] = None
                    good = True
                else:
                    sys.stdout.write('\t0\t%s' % (last_pcts[i]))
            sys.stdout.write('\n')
        sys.stdout.write('\n')

    sys.stdout.write('Reference counts')
    for stat in stats:
        sys.stdout.write('\tcount\t')
    sys.stdout.write('\n')

    for k in sorted([x for x in stats[0].refs]):
        sys.stdout.write('%s' % k)
        for stat in stats:
            sys.stdout.write('\t%s\t' % stat.refs[k])
        sys.stdout.write('\n')

    if gtf_file:
        sys.stdout.write('Mapping regions')
        for stat in stats:
            sys.stdout.write('\tcount\tCPM')
        sys.stdout.write('\n')

        sorted_keys = [x for x in stats[0].regiontagger.counts]
        sorted_keys.sort()
        for k in sorted_keys:
            sys.stdout.write('%s' % k)
            for stat in stats:
                # CPM = counts per million mapped reads
                sys.stdout.write('\t%s\t%s' % (stat.regiontagger.counts[k], float(stat.regiontagger.counts[k]) * 1000000 / stat.mapped))
            sys.stdout.write('\n')
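# Hypothetical usage sketch: compare read stats for two BAM files against a GTF
# annotation. File and tag names are placeholders; the CPM column above is
# computed as count * 1,000,000 / mapped reads.
#
#     bam_stats(['sample1.bam', 'sample2.bam'], gtf_file='genes.gtf', tags=['NM'])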
                usage()
            elif not ref and os.path.exists(arg):
                # the reference FASTA must have a matching .fai index
                if os.path.exists('%s.fai' % arg):
                    ref = arg
                else:
                    print "Missing FAI index on %s" % arg
                    usage()
            elif not regions:
                regions = BedFile(region=arg)
            else:
                print "Unknown option or missing index: %s" % arg
                usage()
        except Exception, e:
            print e
            usage()

    if not bam:
        usage()
    else:
        bamobj = bam_open(bam)
        if profile:
            import cProfile

            def func():
                bam_basecall(bamobj, ref, min_qual, min_count, regions, mask, quiet, showgaps, showstrand, minorpct, altfreq, variants, TimedProfiler())

            sys.stderr.write('Profiling...\n')
            cProfile.run('func()', profile)
        else:
            bam_basecall(bamobj, ref, min_qual, min_count, regions, mask, quiet, showgaps, showstrand, minorpct, altfreq, variants, None)
        bamobj.close()
Region should be: chr:start-end (start 1-based)
"""
    sys.exit(1)


if __name__ == "__main__":
    fname = None
    ref = None
    start = None
    end = None

    for arg in sys.argv[1:]:
        if arg == '-h':
            usage()
        elif not fname:
            if os.path.exists(arg):
                fname = arg
            else:
                usage("%s doesn't exist!" % arg)
        else:
            # region argument: chr:start-end (1-based start converted to 0-based)
            ref, se = arg.split(':')
            start, end = [int(x) for x in se.split('-')]
            start = start - 1

    if not fname:
        usage()

    bamfile = bam_open(fname)
    bam_junction_count(bamfile, ref, start, end)
    bamfile.close()
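# Worked example of the region parsing above: 'chr1:1000-2000' (1-based) yields
# ref='chr1', start=999, end=2000, i.e. the start is shifted to a 0-based
# coordinate while the end is left unchanged.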
            usage()
        elif not bamfile:
            if not os.path.exists(arg):
                usage("Missing or non-existent bamfile: %s" % arg)
            if not os.path.exists("%s.bai" % arg):
                usage("Missing bam index (bai) file: %s" % arg)
            bamfile = arg

    if not model or not model_arg:
        usage("Missing model! Must include one of: %s" % ", ".join(count.models))
    elif not bamfile:
        usage("Missing BAM file!")

    modelobj = count.models[model](model_arg)

    bam = bam_open(bamfile)
    modelobj.count(
        bam,
        stranded,
        coverage,
        uniq_only,
        fpkm,
        norm,
        multiple,
        whitelist,
        blacklist,
        rev_read2=rev_read2,
        start_only=startonly,
    )
    bam.close()
            last = arg
        elif arg in ['-norm', '-multiple', '-whitelist', '-blacklist', '-library']:
            last = arg
        elif arg == '-startonly':
            startonly = True
        elif arg == '-coverage':
            coverage = True
        elif arg == '-fpkm':
            fpkm = True
        elif arg == '-uniq':
            uniq_only = True
        elif arg == '-h':
            usage()
        elif not bamfile:
            if not os.path.exists(arg):
                usage('Missing or non-existent bamfile: %s' % arg)
            if not os.path.exists('%s.bai' % arg):
                usage('Missing bam index (bai) file: %s' % arg)
            bamfile = arg

    if not model or not model_arg:
        usage('Missing model! Must include one of: %s' % ', '.join(count.models))
    elif not bamfile:
        usage('Missing BAM file!')

    modelobj = count.models[model](model_arg)

    bam = bam_open(bamfile)
    modelobj.count(bam, library_type, coverage, uniq_only, fpkm, norm, multiple, whitelist, blacklist, start_only=startonly)
    bam.close()