def get_raw_signal(arguments):
    (mpbs_region, reads_file, organism, window_size,
     forward_shift, reverse_shift) = arguments
    bam = Samfile(reads_file, "rb")
    signal = np.zeros(window_size)
    for region in mpbs_region:
        mid = (region.final + region.initial) // 2
        p1 = mid - window_size // 2
        p2 = mid + window_size // 2
        if p1 <= 0:
            continue
        # Fetch raw signal
        for read in bam.fetch(region.chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    signal[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    signal[cut_site - p1] += 1.0
    return signal
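# A minimal sketch (hypothetical values) of the cut-site arithmetic used above:
# forward-strand reads are shifted downstream from read.pos, reverse-strand
# reads upstream from read.aend, and only cut sites inside [p1, p2) are counted.
def _example_cut_site(pos, aend, is_reverse, forward_shift=5, reverse_shift=-4):
    if not is_reverse:
        return pos + forward_shift
    return aend + reverse_shift - 1

assert _example_cut_site(pos=100, aend=136, is_reverse=False) == 105
assert _example_cut_site(pos=100, aend=136, is_reverse=True) == 131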
def callbase(bamfile, snpsites, out):
    BF = Samfile(bamfile, 'rb')  # open your bam file
    SF = open(snpsites, 'r')  # the file containing SNP sites info
    RF = open(out, 'w')  # result file
    RF.write('ref_name\tpos\tRbase\tAbase\tA\tT\tC\tG\tN\tothers\n')
    for i in SF:
        if i.startswith('#'):
            continue
        else:
            line = ParseSNPsitesLine(i)
            vcf_pos = line.pos - 1  # change 1-based to 0-based
            vcf_refname = line.chrom
            print 'processing: %s %s...' % (vcf_refname, str(vcf_pos))
            At, Tt, Ct, Gt, Nt, othert = 0, 0, 0, 0, 0, 0
            for i in BF.pileup(vcf_refname, vcf_pos, vcf_pos + 1):
                if i.pos == vcf_pos:
                    vcf_Rbase = line.Rbase
                    vcf_Abase = line.Abase
                    for j in i.pileups:
                        yourbase = j.alignment.seq[j.qpos]
                        if yourbase == 'A':
                            At += 1
                        elif yourbase == 'T':
                            Tt += 1
                        elif yourbase == 'C':
                            Ct += 1
                        elif yourbase == 'G':
                            Gt += 1
                        elif yourbase == 'N':
                            Nt += 1
                        else:
                            othert += 1
            RF.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                vcf_refname, str(vcf_pos + 1), vcf_Rbase, vcf_Abase,
                str(At), str(Tt), str(Ct), str(Gt), str(Nt), str(othert)))
    BF.close()
def single_end_sam_parsing(sam_list, cov, identity_threshold):
    match = {}
    to_process = []
    if sam_list[0] is None:
        print "The end-to-end mapping of SE data produced an error."
    else:
        to_process.append(sam_list[0])
    if sam_list[1] is None:
        print "The local mapping mode of SE data produced an error."
    else:
        to_process.append(sam_list[1])
    for single_sam in to_process:
        sam = Samfile(single_sam)
        for align in sam:
            if align.tid != -1:
                query_name, query_len, ref_name = align.qname, float(align.rlen), sam.getrname(align.tid)
                if align.cigar is not None:
                    align_len, query_aligned_len = cigar_parsing(align.cigar)
                    nm = -1
                    if (query_aligned_len / query_len) * 100 >= cov:
                        for coppia in align.tags:
                            if coppia[0] == "NM":
                                nm = float(coppia[1])
                        if align_len != 0 and nm >= 0:
                            paired_perc_id = ((align_len - nm) / align_len) * 100
                            if paired_perc_id >= identity_threshold:
                                match.setdefault(query_name, set())
                                match[query_name].add(ref_name)
        sam.close()
    return match
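# A worked example (hypothetical numbers) of the two filters applied above:
# an alignment is kept when the aligned fraction of the query meets `cov` and
# the percent identity derived from the NM tag meets `identity_threshold`.
query_len, query_aligned_len = 150.0, 140.0
align_len, nm = 140.0, 3.0
coverage = (query_aligned_len / query_len) * 100          # ~93.3
percent_identity = ((align_len - nm) / align_len) * 100   # ~97.9
assert coverage >= 90 and percent_identity >= 95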
def __init__(self, file_name):
    """
    Initializes GenomicSignal.
    """
    self.file_name = file_name
    self.sg_coefs = None
    self.bam = Samfile(file_name, "rb")
def test_process_bam_mismatches(): tbam = os.path.join(DATA, "tmp.bam") bam = os.path.join(DATA, "ordered_umi.bam") if os.path.exists(tbam): os.remove(tbam) with captured_output() as (out, err): process_bam(bam, tbam, mismatches=1) assert os.path.exists(tbam) it = iter(out.getvalue().split("\n")) assert it.next().strip() == "1\t9\t10\t4\t2" assert it.next().strip() == "1\t11\t12\t2\t1" assert it.next().strip() == "1\t29\t30\t2\t1" bam_reader = Samfile(tbam) it = iter(bam_reader) r = it.next() assert r.pos == 4 assert r.qname == "read8:UMI_ATTCAGGG" r = it.next() assert r.pos == 9 assert r.qname == "read1:UMI_AAAAAGGG" r = it.next() assert r.pos == 9 assert r.qname == "read4:UMI_AAAGGGGG" r = it.next() assert r.pos == 11 assert r.qname == "read5:UMI_ATTTAGGG" bam_reader.close() os.remove(tbam)
def test_against_fixtures(): # load fixtures from numpy array bampath = "fixture/test.bam" fastapath = "fixture/ref.fa" archive = "fixture/regression.npz" testset = np.load(archive) for q in stats_types: if q in stats_types_withref: x = getattr(pysamstats, "load_" + q)(Samfile(bampath), fafile=fastapath) else: x = getattr(pysamstats, "load_" + q)(Samfile(bampath)) # loop through all fields for key in testset[q].dtype.names: expect = testset[q][key] actual = x[key] try: np.testing.assert_array_equal(expect, actual, err_msg=key) except AssertionError: print(expect[expect != actual]) print(actual[expect != actual]) raise
def test_pileup_pad(): kwargs_nopad = {'chrom': 'Pf3D7_01_v3', 'start': 0, 'end': 20000, 'one_based': False, 'pad': False} kwargs_pad = {'chrom': 'Pf3D7_01_v3', 'start': 0, 'end': 20000, 'one_based': False, 'pad': True} for f, needs_ref in pileup_functions: debug(f.__name__) # test no pad if needs_ref: a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), **kwargs_nopad) else: a = f(Samfile('fixture/test.bam'), **kwargs_nopad) eq_(924, a['pos'][0]) eq_(9935, a['pos'][-1]) # test pad if needs_ref: a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), **kwargs_pad) else: a = f(Samfile('fixture/test.bam'), **kwargs_pad) eq_(0, a['pos'][0]) eq_(19999, a['pos'][-1]) assert np.all(np.diff(a['pos']) == 1)
def test_pileup_truncate(): kwargs_notrunc = {'chrom': 'Pf3D7_01_v3', 'start': 2000, 'end': 2100, 'one_based': False, 'truncate': False} kwargs_trunc = {'chrom': 'Pf3D7_01_v3', 'start': 2000, 'end': 2100, 'one_based': False, 'truncate': True} for f, needs_ref in pileup_functions: debug(f.__name__) # test no truncate if needs_ref: a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), **kwargs_notrunc) else: a = f(Samfile('fixture/test.bam'), **kwargs_notrunc) debug(a[:5]) eq_(1952, a['pos'][0]) eq_(2154, a['pos'][-1]) # test truncate if needs_ref: a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), **kwargs_trunc) else: a = f(Samfile('fixture/test.bam'), **kwargs_trunc) eq_(2000, a['pos'][0]) eq_(2099, a['pos'][-1])
def parse_bam_differential(afn, bfn, regs, step): """(internal) Parses bam file in absolute mode. Proceeds by counting reads mapping onto a segment (chr, start, end). No normalization is done at this step. """ abam = Samfile(str(afn), "rb") bbam = Samfile(str(bfn), "rb") acount = [] bcount = [] oldchr = "chr1" for reg in regs: chr, start, end = reg[:3] if chr != oldchr: log("files: %s - %s : %s counted" % (afn, bfn, oldchr)) oldchr = chr # this could be improved for s in xrange(start, end, step): e = s + step an = abam.count(chr, s, e) bn = bbam.count(chr, s, e) acount.append(an) bcount.append(bn) acount.append(-1) bcount.append(-1) log("files: %s - %s : %s counted (finished)" % (afn, bfn, oldchr)) return acount, bcount
def test_binned_pad_wg(): expected = stat_coverage_binned_refimpl( Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa')) actual = pysamstats.stat_coverage_binned(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa')) compare_iterators(expected, actual) kwargs = {'window_size': 200, 'window_offset': 100} for f, needs_ref in binned_functions: debug(f.__name__) if needs_ref: a = f(Samfile('fixture/test.bam'), Fastafile('fixture/ref.fa'), **kwargs) else: a = f(Samfile('fixture/test.bam'), **kwargs) assert sorted(set(a['chrom'])) == [b'Pf3D7_01_v3', b'Pf3D7_02_v3', b'Pf3D7_03_v3'] eq_(100, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][0]) eq_(50100, a[a['chrom'] == b'Pf3D7_01_v3']['pos'][-1]) eq_(100, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][0]) eq_(60100, a[a['chrom'] == b'Pf3D7_02_v3']['pos'][-1]) eq_(100, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][0]) eq_(70100, a[a['chrom'] == b'Pf3D7_03_v3']['pos'][-1])
def bam_fill_seq(args):
    """ Fill empty sequence with known seqs """
    if not args.source_bam:
        source_bam = args.bam
    else:
        source_bam = args.source_bam
    logging.info('Loading samfile: %s', source_bam)
    src_seqs = {1: {}, 2: {}}
    src = pysam.Samfile(source_bam)
    with src:
        for rec in src:
            if rec.is_supplementary:  # skip supplementary alignment
                continue
            if rec.is_secondary:  # skip secondary alignment
                continue
            if rec.query_sequence is None:  # empty
                continue
            if rec.is_read2:
                src_seqs[2][rec.qname] = (rec.query_sequence, rec.query_qualities, rec.is_reverse)
            else:
                src_seqs[1][rec.qname] = (rec.query_sequence, rec.query_qualities, rec.is_reverse)
    logging.info('Loaded read1 : %s', len(src_seqs[1]))
    logging.info('Loaded read2 : %s', len(src_seqs[2]))

    sam = Samfile(args.bam)
    if args.output.endswith('.bam'):
        mode = 'wb'
    else:
        mode = 'wh'
    out = pysam.Samfile(args.output, mode=mode, template=sam)
    if args.region:
        it = sam.fetch(region=args.region)
    else:
        it = sam
    for rec in it:
        qname = rec.qname
        if rec.query_sequence is None:  # only fill when empty
            ret = src_seqs[2 if rec.is_read2 else 1].get(rec.qname)
            if ret is not None:
                seq, qual, is_rev = ret
                if is_rev != rec.is_reverse:
                    seq = dna_revcomp(seq)
                    if qual is not None:
                        qual = list(reversed(qual))
                cigar = Cigar(rec.cigartuples)
                seq = cigar.hard_clip_seq(seq)
                if qual is not None:
                    qual = cigar.hard_clip_seq(qual)
                rec.query_sequence = seq  # refill
                rec.query_qualities = qual
        out.write(rec)
def bam_read_id(url):
    '''Read first read id out of a remote bam file.

    Note: requires a patched version of pysam
    '''
    stream = Samfile(url, 'rb')
    read = stream.next()
    return read.qname
def process_bam(abam, bbam, mismatches=0): """Removes duplicate reads characterized by their UMI at any given start location. Args: abam (str): Input bam with potential duplicate UMIs bbam (str): Output bam after removing duplicate UMIs mismatches (Optional[int]): Allowable edit distance between UMIs """ is_indexed(abam) with Samfile(abam, 'rb') as in_bam, Samfile(bbam, 'wb', template=in_bam) as out_bam: for chrom in in_bam.references: print("processing chromosome", chrom, file=sys.stderr) umi_idx = defaultdict(set) read_counts = Counter() for read in in_bam.fetch(chrom): if read.is_unmapped: continue # get the iupac umi sequence try: umi = umi_from_name(read.qname) except UMINotFound: print("You may be processing alignments that haven't been annotated with UMIs!", file=sys.stderr) raise # get actual read start # read.pos accounts for 5' soft clipping if read.is_reverse: # read.alen alignment length accounting for 3' soft clipping # UMIs are then compared to reads with the same start read_start = read.pos + read.alen else: read_start = read.pos # add count for this start; counts all reads read_counts[read_start] += 1 # check if UMI seen if umi in umi_idx[read_start]: continue # check if UMI is similar enough to another that has been seen if mismatches > 0 and is_similar(umi, umi_idx[read_start], mismatches): # do not count; group similar UMIs into one continue # keep track of unique UMIs - set eliminates duplicates umi_idx[read_start].add(umi) out_bam.write(read) # process before and after counts over chrom for start, before_count in sorted(read_counts.items()): print(chrom, start, start + 1, before_count, len(umi_idx[start]), sep="\t")
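# A minimal usage sketch for process_bam above; file names are hypothetical.
# The input BAM is assumed to be coordinate-sorted and indexed (is_indexed is
# called first) and to carry the UMI in each read name, e.g. "read1:UMI_AAAAAGGG".
process_bam("sample.umi_annotated.bam", "sample.deduplicated.bam", mismatches=1)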
def bam_variant_aln(args):
    samfile = Samfile(args.bam)
    reader = pyvcf.Reader(open(args.vcf))
    positions = []
    for rec in reader:
        # fetch alignments overlapping the variant (VCF positions are 1-based)
        for aln in samfile.fetch(rec.CHROM, rec.POS - 1, rec.POS):
            positions.append((samfile.getrname(aln.tid), aln.pos, aln.qname))
    return positions
def _open_alignment_file(in_path):
    """Returns alignment file handle for BAM, SAM, or CRAM"""
    input_extension = in_path.split(".")[-1].lower()
    if input_extension == "bam":
        alignment_file = Samfile(in_path, "rb")
    elif input_extension == "sam":
        alignment_file = Samfile(in_path, "r")
    elif input_extension == "cram":
        alignment_file = Samfile(in_path, "rc")
    else:
        raise ValueError("Unsupported alignment file extension: %s" % input_extension)
    return alignment_file
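# A small usage sketch for _open_alignment_file; the path is hypothetical.
# The mode string passed to Samfile selects the on-disk format ("rb" BAM,
# "r" SAM, "rc" CRAM), so callers can iterate reads the same way regardless
# of the input format.
aln_file = _open_alignment_file("example.bam")
for read in aln_file:
    pass  # process each aligned read here
aln_file.close()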
def __init__(self, file_name):
    """
    Initializes BamFile.

    Variables:
    bam -- Pysam's bam representation.
    sg_coefs -- Savitzky-Golay coefficients (list). Should be loaded after class initialization.
    """
    self.file_name = file_name
    self.bam = Samfile(file_name, "rb")
    self.sg_coefs = None
def parse_bam_absolute(fn, regs):
    """(internal) Parses bam file in absolute mode.

    Proceeds by counting reads mapping onto a segment (chr, start, end)
    and normalizes the count by the segment's length.
    """
    bam = Samfile(str(fn), "rb")
    count = []
    for reg in regs:
        chr, start, end = reg[:3]
        n = bam.count(chr, start, end)
        count.append(float(n) / (end - start))
    return count
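# A minimal usage sketch for parse_bam_absolute; the BAM path and regions are
# hypothetical. Each returned value is the read count divided by the segment
# length, i.e. reads per base pair for that segment.
regions = [("chr1", 0, 1000), ("chr1", 1000, 2000)]
densities = parse_bam_absolute("example.bam", regions)
for reg, dens in zip(regions, densities):
    print("%s:%d-%d\t%.4f" % (reg[0], reg[1], reg[2], dens))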
def test_write_hdf5_chrom_dtype(): contig_label = "AS2_scf7180000696055" bampath = "fixture/longcontignames.bam" dtypes = [None, {"chrom": "a20"}, {"chrom": "a20"}] alignments = [Samfile(bampath), Samfile(bampath), bampath] results = [len(contig_label), 20, 20] labels = [contig_label, contig_label, contig_label] for arg in zip(dtypes, alignments, results, labels): assert check_write_hdf5_chrom_dtype(arg)
def create_table(half_ext, feature_summit_file_name, bam_names, bam_counts, bam_list, output_file_name):
    # Initialization
    outLoc = "/".join(output_file_name.split("/")[:-1]) + "/"
    command = "mkdir -p " + outLoc
    os.system(command)

    # Allowed chromosomes
    chrList = ["chr" + str(e) for e in range(1, 23) + ["X"]]

    # Fetching regions
    featureSummitFile = open(feature_summit_file_name, "r")
    regionList = []
    for line in featureSummitFile:
        ll = line.strip().split("\t")
        if(ll[0] not in chrList):
            continue
        region = [ll[0], int(ll[1]) - half_ext, int(ll[2]) + half_ext]
        if(int(region[1]) < 0):
            continue
        regionList.append(region)
    featureSummitFile.close()

    # Creating table
    matrix = []
    for i in range(0, len(bam_list)):
        inputBamFileName = bam_list[i]
        correctFactor = int(bam_counts[i]) / 1000000
        extension = inputBamFileName.split(".")[-1]
        if(extension == "bam"):
            bamFile = Samfile(inputBamFileName, "rb")
            vec = []
            for region in regionList:
                try:
                    bamSignal = fetchSignal(bamFile, region) / correctFactor
                except Exception:
                    bamSignal = 0
                vec.append(bamSignal)
        elif(extension == "bw" or extension == "bigwig"):
            bamFile = pyBigWig.open(inputBamFileName)
            vec = []
            for region in regionList:
                try:
                    bamSignal = fetchSignalBw(bamFile, region) / correctFactor
                except Exception:
                    bamSignal = 0
                vec.append(bamSignal)
        else:
            print("The tool supports only BAM or BIGWIG files.")
        matrix.append(vec)
        bamFile.close()

    outputFile = open(output_file_name, "w")
    outputFile.write("\t".join(bam_names) + "\n")
    for j in range(0, len(matrix[0])):
        vec = []
        for i in range(0, len(matrix)):
            try:
                vec.append(str(matrix[i][j]))
            except Exception:
                vec.append("NA")
        outputFile.write("\t".join(vec) + "\n")
    outputFile.close()
def get_raw_tracks(args): # Initializing Error Handler err = ErrorHandler() if len(args.input_files) != 2: err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.") output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix)) bam = Samfile(args.input_files[0], "rb") regions = GenomicRegionSet("Interested regions") regions.read(args.input_files[1]) regions.merge() reads_file = GenomicSignal() with open(output_fname, "a") as output_f: for region in regions: # Raw counts signal = [0.0] * (region.final - region.initial) for read in bam.fetch(region.chrom, region.initial, region.final): if not read.is_reverse: cut_site = read.pos + args.forward_shift if region.initial <= cut_site < region.final: signal[cut_site - region.initial] += 1.0 else: cut_site = read.aend + args.reverse_shift - 1 if region.initial <= cut_site < region.final: signal[cut_site - region.initial] += 1.0 if args.norm: signal = reads_file.boyle_norm(signal) perc = scoreatpercentile(signal, 98) std = np.std(signal) signal = reads_file.hon_norm_atac(signal, perc, std) output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n") output_f.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix)) os.system(" ".join([ "wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0" ])) os.remove(output_fname)
def removeEdgeMismatches(self, bamFile, minDistance, minBaseQual):
    startTime = Helper.getTime()
    minDistance = int(minDistance)
    counter = 0
    j = 0
    num_lines = len(self.variantDict)
    Helper.info(
        " [%s] remove mismatches from the first %s bp from read edges" %
        (startTime.strftime("%c"), str(minDistance)),
        self.logFile, self.textField)

    bamFile = Samfile(bamFile, "rb")

    for varKey in self.variantDict.keys():
        variant = self.variantDict[varKey]
        counter += 1
        if counter % 10000 == 0:
            Helper.status('%s mm parsed ' % counter, self.logFile, self.textField, "grey")

        keepSNP = False
        varPos = variant.position - 1
        iter = bamFile.pileup(variant.chromosome, variant.position - 1, variant.position)
        # walks over the region which overlaps this position
        for x in iter:
            if x.pos == varPos:
                for pileupread in x.pileups:  # walk through the single reads
                    if not pileupread.is_del and not pileupread.is_refskip:
                        distance = abs(
                            pileupread.alignment.alen - pileupread.query_position
                        ) if pileupread.alignment.is_reverse else pileupread.query_position
                        if distance >= minDistance:
                            # check read base and base quality
                            if pileupread.alignment.query_sequence[pileupread.query_position] == variant.alt \
                                    and pileupread.alignment.query_qualities[pileupread.query_position] >= minBaseQual:
                                # if pileupread.alignment.query_sequence[pileupread.query_position] == variant.alt:
                                keepSNP = True
        if keepSNP == False:
            j += 1
            del self.variantDict[varKey]

    Helper.status('%s of %s variants were deleted' % (j, num_lines), self.logFile, self.textField, "black")
    Helper.printTimeDiff(startTime, self.logFile, self.textField)
    bamFile.close()
def main(): bam = Samfile("bedtools/tests/data/NA18152.bam", "rb") rmsk = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed") for al in bam: chrom = bam.getrname(al.rname) start = al.pos end = al.aend name = al.qname for hit in rmsk.search(chrom, start, end): print chrom, start, end, name, print hit.chrom, hit.start, hit.end, hit.name
def main(args):
    option = "r" if args.samformat else "rb"
    samfile = Samfile(args.bamfile, option)
    # Iterates over each read instead of each contig
    outputs = defaultdict(list)
    # import ipdb; ipdb.set_trace()
    for aln in samfile.fetch(until_eof=True):
        ref = samfile.getrname(aln.tid)
        outputs[ref].append(aln)
    for ref, alns in outputs.iteritems():
        print_reads(alns, ref, samfile.header)
def main(args=None):
    if args is None:
        args = sys.argv[1:]
    f = Samfile(args[0])
    header = f.header
    f.close()
    reflen = header['SQ'][0]['LN']
    BamIO.write(clip(BamIO.parse(args[0]), reflen), args[1], header=header)
    return 0
def paired_end_sam_parsing(sam_list, cov, identity_threshold):
    match = {}
    to_process = []
    if sam_list[0] is None:
        print "The end-to-end mapping of PE data produced an error."
    else:
        to_process.append(sam_list[0])
    if sam_list[1] is None:
        print "The local mapping mode of PE data produced an error."
    else:
        to_process.append(sam_list[1])
    for paired_sam in to_process:
        r1_match = {}
        r2_match = {}
        sam = Samfile(paired_sam)
        for align in sam:
            if align.tid != -1:
                query_name, query_len, ref_name = align.qname, float(align.rlen), sam.getrname(align.tid)
                if align.cigar is not None:
                    align_len, query_aligned_len = cigar_parsing(align.cigar)
                    # print query_name, align_len, query_aligned_len
                    nm = -1
                    if (query_aligned_len / query_len) * 100 >= cov:
                        for coppia in align.tags:
                            if coppia[0] == "NM":
                                nm = float(coppia[1])
                        if align_len != 0 and nm >= 0:
                            paired_perc_id = ((align_len - nm) / align_len) * 100
                            if paired_perc_id >= 90:
                                if align.is_read1:
                                    r1_match.setdefault(query_name, {})
                                    r1_match[query_name].setdefault(ref_name, [])
                                    r1_match[query_name][ref_name].append(paired_perc_id)
                                if align.is_read2:
                                    r2_match.setdefault(query_name, {})
                                    r2_match[query_name].setdefault(ref_name, [])
                                    r2_match[query_name][ref_name].append(paired_perc_id)
        sam.close()
        for query in set(r1_match.keys()).intersection(set(r2_match.keys())):
            for ref in set(r1_match[query].keys()).intersection(r2_match[query].keys()):
                average_perc_id = calcola_media([max(r1_match[query][ref]), max(r2_match[query][ref])])
                if average_perc_id >= identity_threshold:
                    match.setdefault(query, set())
                    match[query].add(ref)
    return match
def __init__(self, fname, referenceFastaFname=None): self.filename = fname = abspath(expanduser(fname)) self.peer = Samfile(fname, "rb", check_sq=False) self._checkFileCompatibility() self._loadReferenceInfo() self._loadReadGroupInfo() self._loadProgramInfo() self.referenceFasta = None if referenceFastaFname is not None: if self.isUnmapped: raise ValueError, "Unmapped BAM file--reference FASTA should not be given as argument to BamReader" self._loadReferenceFasta(referenceFastaFname)
def main(): bam = Samfile("bedtools/tests/data/NA18152.bam", "rb") rmsk = IntervalFile("bedtools/tests/data/rmsk.hg18.chr21.bed") # Example 1: # Method: IntervalFile.all_hits() # Report _all_ of the rmsk features that overlap with the BAM alignment for al in bam: strand = "+" if al.is_reverse: strand = "-" i = Interval(bam.getrname(al.rname), al.pos, al.aend, strand) for hit in rmsk.all_hits(i, same_strand=True, ovlp_pct=0.75): print "\t".join(str(x) for x in [i, hit])
def compare_stats(impl, refimpl): # no read filters kwargs = {'chrom': 'Pf3D7_01_v3', 'start': 0, 'end': 2000, 'one_based': False} expected = refimpl(Samfile('fixture/test.bam'), **kwargs) actual = impl(Samfile('fixture/test.bam'), **kwargs) compare_iterators(expected, actual) # read filters kwargs['min_mapq'] = 1 kwargs['no_dup'] = True expected = refimpl(Samfile('fixture/test.bam'), **kwargs) actual = impl(Samfile('fixture/test.bam'), **kwargs) compare_iterators(expected, actual)
def expression_dict_from_bam(alias_dict, gene_dict, exp_file_name):
    # Fetching expression
    exp_dict = dict()
    exp_file = Samfile(exp_file_name, "rb")
    for k in gene_dict.keys():
        geneVec = gene_dict[k]
        gene = geneVec[3]
        region = [geneVec[0], int(geneVec[1]), int(geneVec[2])]
        exp = fetch_counts(exp_file, region)
        exp_dict[gene] = float(exp) / (region[2] - region[1])
    exp_file.close()

    # Returning objects
    return exp_dict
def annotate(context, bam_path, in_stream, sample, group, cutoff, extendby, prefix, threshold): """Annotate intervals in a BED-file/stream. \b BAM_PATH: Path to BAM-file IN_STREAM: Chanjo-style BED-file with interval definitions """ # connect to the BAM file with Samfile(bam_path) as bam: # user defined sample id or randomly generated sample = (sample or get_sample_id(bam.header) or id_generator()) # step 1: metadata header metadata = dict(sample_id=sample, group_id=group, cutoff=cutoff, coverage_source=path(bam_path).abspath(), extension=extendby) click.echo("#%s" % json.dumps(metadata)) # step 2: annotate list of intervals with coverage and completeness bed_lines = pipe( annotate_bed_stream(bed_stream=in_stream, bam_path=bam_path, cutoff=cutoff, extension=extendby, contig_prefix=prefix, bp_threshold=threshold), map(serialize_interval(bed=True)) # stringify/bedify ) # reduce/write the BED lines for bed_line in bed_lines: click.echo(bed_line)
def bam_uniq(args):
    """
    * BAM file should be sorted in (tid, pos)
    * (qname, pos, is_unmapped, is_read_2, cigar) is checked
    * if multiple records exist, primary alignment is selected
    * scores are not changed
    """
    sam = Samfile(args.bam)

    # setup output
    if args.output.endswith('.bam'):
        mode = 'wb'
    else:
        mode = 'wh'
    out = pysam.Samfile(args.output, mode=mode, template=sam)

    it = sam  # TODO region

    def get_key(rec):
        return (rec.qname, rec.pos, rec.is_unmapped, rec.is_read2, rec.cigar)

    def get_best_rec(recs):
        for rec in recs:
            if not rec.is_secondary and not rec.is_supplementary:
                return rec
        return rec  # No primary alignments were found

    for (tid, pos), recs in groupby(it, lambda rec: (rec.tid, rec.pos)):  # assume position sorted
        recs1 = sorted(recs, key=get_key)  # manual sort by key is needed
        for key, recs2 in groupby(recs1, get_key):
            rec = get_best_rec(recs2)
            out.write(rec)
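# A minimal usage sketch for bam_uniq above; the argument namespace and file
# names are hypothetical. The input is assumed to be coordinate-sorted so that
# records sharing (tid, pos) are adjacent, which is what the groupby relies on.
from argparse import Namespace

bam_uniq(Namespace(bam="sorted_input.bam", output="deduplicated.bam"))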
def compare_stats_withref(impl, refimpl, bam_fn='fixture/test.bam', fasta_fn='fixture/ref.fa'): # no read filters kwargs = {'chrom': 'Pf3D7_01_v3', 'start': 0, 'end': 2000, 'one_based': False} expected = refimpl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) actual = impl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) compare_iterators(expected, actual) # read filters kwargs['min_mapq'] = 1 kwargs['no_dup'] = True expected = refimpl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) actual = impl(Samfile(bam_fn), Fastafile(fasta_fn), **kwargs) compare_iterators(expected, actual)
def filter(self, infile, countfile): inbam = Samfile(infile, 'rb') count_labels = [ 'u', 'u-pf', 'u-pf-n', 'u-pf-n-mm%d' % self.max_mismatches, 'u-pf-n-mm%d-mito' % self.max_mismatches, 'mm', 'nm', 'qc-flagged', 'umi-duplicate', 'umi-duplicate-nuclear', 'nuclear-align', 'autosomal-align', 'paired-aligned', 'paired-nuclear-align', 'paired-autosomal-align', 'all-aligned', 'all-mapq-filter' ] logging.debug(count_labels) self.counts = dict([(label, 0) for label in count_labels]) self.chrcounts = defaultdict(int) self.mapqcounts = defaultdict(int) self.samflagcounts = defaultdict(int) self.readlengthcounts = defaultdict(int) for read in inbam: self.process_read(read, inbam) countout = open(countfile, 'a') self.write_dict(countout, self.counts) self.write_dict(countout, self.chrcounts) self.write_dict(countout, self.mapqcounts) self.write_dict(countout, self.samflagcounts) self.write_dict(countout, self.readlengthcounts) countout.close()
def test_read_evidence_variant_matching_gatk_mini_bundle_extract(): handle = Samfile(data_path("gatk_mini_bundle_extract.bam")) loci = [ Locus.from_inclusive_coordinates("20", 10008951), # 0 Locus.from_inclusive_coordinates("20", 10009053), # 1 Locus.from_inclusive_coordinates("20", 10009053, 10009054), # 2 Locus.from_inclusive_coordinates("20", 10006822), # 3 Locus.from_inclusive_coordinates("20", 10006822, 10006823), # 4 ] evidence = PileupCollection.from_bam(handle, loci) eq_(evidence.match_summary(Variant(loci[0], "A", "C")), [('A', 1), ('C', 4)]) eq_( evidence.filter(drop_duplicates=True).match_summary( Variant(loci[0], "A", "C")), [('A', 0), ('C', 3)]) eq_(evidence.match_summary(Variant(loci[1], "A", "C")), [('A', 3), ('C', 0)]) eq_(evidence.match_summary(Variant(loci[1], "A", "CC")), [('A', 3), ('CC', 0)]) eq_(evidence.match_summary(Variant(loci[1], "A", "")), [('A', 3), ('', 0)]) eq_(evidence.match_summary(Variant(loci[1], "A", "")), [('A', 3), ('', 0)]) eq_(evidence.match_summary(Variant(loci[2], "AT", "")), [('AT', 3), ('', 0)]) eq_(evidence.match_summary(Variant(loci[3], "A", "")), [('A', 2), ('', 6)]) eq_(evidence.match_summary(Variant(loci[4], "AC", "")), [('AC', 2), ('', 6)]) eq_( evidence.match_summary( Variant(loci[4], "AC", ""), lambda e: e.read_attributes().mapping_quality.mean()), [('AC', 60.0), ('', 65.0)])
def get_bc_signal(arguments): (mpbs_region, reads_file, organism, window_size, forward_shift, reverse_shift, bias_table) = arguments bam = Samfile(reads_file, "rb") genome_data = GenomeData(organism) signal = np.zeros(window_size) # Fetch bias corrected signal for region in mpbs_region: mid = (region.final + region.initial) // 2 p1 = mid - window_size // 2 p2 = mid + window_size // 2 if p1 <= 0: continue # Fetch raw signal _signal = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam, bias_table=bias_table, genome_file_name=genome_data.get_genome(), forward_shift=forward_shift, reverse_shift=reverse_shift) if len(_signal) != window_size: continue # smooth the signal signal = np.add(signal, np.array(_signal)) return signal
def mc_path_call6(args): with open(args.gref) as fp: fasta = Fasta(fp) contigs = {contig.name: contig.seq.upper() for contig in fasta.contigs} with Samfile(args.bam) as sam: smb = SamModelBuilder2(sam, regions=args.regions, min_second_bases=args.min_second_bases, contigs=contigs) if not args.table: hap_depths = {ref.name: args.hap_depth for ref in smb.model.refs} ploidies = {ref.name: args.copy_number for ref in smb.model.refs} else: tab = pd.read_table(args.table) hap_depths = dict(zip(tab.contig, tab.hap_depth)) ploidies = dict(zip(tab.contig, tab.copy_number)) show_model(smb.model, verbosity=args.verbose) from .infer6 import InferModel im = InferModel(smb, hap_depths=hap_depths, ploidies=ploidies) #im.init_best_het() #im.init() #im.run_through_variants() if args.no_phase: im.run_genotyping() else: im.run_haplotyping() start_var = None #start_var = smb.model.refs[0].get_variant(1203) #im.run_phase_variants(start_var=start_var) im.show_variant_info(show_all_variant=True)
def split_reads(reads, ref_reads, alt_reads): read_results = Counter() ref_bam = Samfile(ref_reads, 'wb', template=reads) alt_bam = Samfile(alt_reads, 'wb', template=reads) prev_phase = None prev_read = None prev_qname = None test = 0 for read in reads.fetch(until_eof=True): test += 1 chrom = read.reference_name snps_on_chrom = snp_dict[chrom] phase = get_phase(read, snps_on_chrom) read_qname = read.qname if read_qname == prev_qname: read_results["tot_read"]+=1 phase_set = set([phase, prev_phase]) phase_set.discard(None) if len(phase_set) == 1: read_phase = phase_set.pop() if read_phase == -1: ref_bam.write(read) ref_bam.write(prev_read) read_results["ref_read"]+=1 elif read_phase == 1: alt_bam.write(read) alt_bam.write(prev_read) read_results["alt_read"]+=1 elif read_phase == 0: read_results['misphased_read']+=1 elif len(phase_set)==0: read_results["no_snps_read"]+=1 else: read_results['misphased_read']+=1 prev_read = read prev_phase = phase prev_qname = read_qname return(read_results)
def subsample(fn, ns=None): if ns is None: fn, ns = fn sample = [] count = 0 outdir_base = path.join(path.dirname(fn), "subset") sf = Samfile(fn) try: i_weight = float(sf.mapped) / max(ns) print "Read out ", i_weight except ValueError: i_weight = 0.0 for read in sf: i_weight += 1 print "Counted ", i_weight i_weight /= float(max(ns)) sf = Samfile(fn) print fn, count, i_weight for i, read in enumerate(sf): key = random() ** i_weight if len(sample) < max(ns): heappush(sample, (key, read, i + count)) else: heappushpop(sample, (key, read, i + count)) count += i for n in ns: if n == min(ns): outdir = outdir_base + "_min" else: outdir = outdir_base + "{:04.1f}M".format(n / 1e6) try: makedirs(outdir) except OSError: pass sampN = sorted(sample, reverse=True)[: int(n)] print "Kept {: >12,} of {: >12,} reads".format(len(sampN), count) print fn, "->", outdir stdout.flush() of = Samfile(path.join(outdir, "accepted_hits.bam"), mode="wb", template=sf) sample.sort(key=lambda (key, read, pos): (read.tid, read.pos)) for key, read, pos in sampN: of.write(read) of.close() sf.close() return [count for key, read, count in sample]
def _bowtie2_filter(fnam, fastq_path, unmap_out, map_out): """ Divides reads in a map file in two categories: uniquely mapped, and not. Writes them in two files """ try: fhandler = Samfile(fnam) except IOError: raise Exception('ERROR: file "%s" not found' % fnam) # getrname chromosome names i = 0 crm_dict = {} while True: try: crm_dict[i] = fhandler.getrname(i) i += 1 except ValueError: break # iteration over reads unmap_out = open(unmap_out, 'w') map_out = open(map_out, 'w') fastq_in = open(fastq_path , 'r') for line in fhandler: line_in = fastq_in.readline() if line.is_unmapped or line.mapq < 4: read = '%s\t%s\t%s\t%s\t%s\n' % ( line_in.split('\t', 1)[0].rstrip('\n')[1:], line.seq, line.qual, '-', '-' ) unmap_out.write(read) else: read = '%s\t%s\t%s\t%s\t%s:%s:%d:%d\n' % ( line.qname, line.seq, line.qual, '1', crm_dict[line.tid], '-' if line.is_reverse else '+', line.pos + 1, len(line.seq)) map_out.write(read) for _ in range(3): fastq_in.readline() unmap_out.close() map_out.close() fastq_in.close()
def main(args):
    option = "r" if args.samformat else "rb"
    samfile = Samfile(args.bamfile, option)
    ref_ids = [samfile.gettid(r) for r in samfile.references]
    # Iterates over each read instead of each contig
    reads_to_print = []
    for aln in samfile.fetch(until_eof=True):
        if pair_is_aligned(aln, ref_ids):
            if args.read_pair == 1 and aln.is_read1:
                reads_to_print.append(aln)
            elif args.read_pair == 2 and aln.is_read2:
                reads_to_print.append(aln)
            elif args.read_pair == 0:
                reads_to_print.append(aln)
        if len(reads_to_print) >= 10000:
            # Flush the reads collected
            print_reads(reads_to_print)
            reads_to_print = []
    print_reads(reads_to_print)
def main(args): option = "r" if args.samformat else "rb" samfile = Samfile(args.bamfile, option) ref_ids = [samfile.gettid(r) for r in samfile.references] #Iterates over each read instead of each contig reads_to_print_1 = [] reads_to_print_2 = [] reads_to_print_u = [] for aln in samfile.fetch(until_eof = True): if aln.tid in ref_ids: # This read is aligned if aln.rnext in ref_ids: # The mate is also aligned if aln.is_read1: reads_to_print_1.append(aln) reads_to_print_1 = flush_reads(reads_to_print_1, args.R1) elif aln.is_read2: reads_to_print_2.append(aln) reads_to_print_2 = flush_reads(reads_to_print_2, args.R2) else: reads_to_print_u.append(aln) reads_to_print_u = flush_reads(reads_to_print_u, args.u) print_reads(reads_to_print_1, args.R1) print_reads(reads_to_print_2, args.R2) print_reads(reads_to_print_u, args.u)
def __init__(self, file_name): """ Initializes GenomicSignal. """ self.file_name = file_name self.bam = None self.bw = None self.sg_coefs = None self.is_bam = False self.is_bw = False if(self.file_name.split(".")[-1].upper() == "BAM"): self.is_bam = True self.bam = Samfile(file_name,"rb") elif(self.file_name.split(".")[-1].upper() == "BW" or self.file_name.split(".")[-1].upper() == "BIGWIG"): self.is_bw = True self.bw = BigWigFile(file_name) else: pass # TODO ERROR
def __init__(self, fname, referenceFastaFname=None): self.filename = fname = abspath(expanduser(fname)) self.peer = Samfile(fname, "rb") # Check for sortedness, index. # There doesn't seem to be a "public" way to do this right # now, but that's fine because we're going to have to rewrite # it all anyway once the pysam rewrite lands. if not self.peer._hasIndex: raise ValueError, "Specified bam file lacks a bam index---required for this API" self._loadReferenceInfo() self._loadReadGroupInfo() self._loadProgramInfo() self.referenceFasta = None if referenceFastaFname is not None: self._loadReferenceFasta(referenceFastaFname)
biasTableFFile.close() biasTableR = dict() biasTableRFile = open(biasTableRFileName,"r") for line in biasTableRFile: ll = line.strip().split("\t") biasTableR[ll[0]] = float(ll[1]) biasTableRFile.close() ################################################################################################# # EVALUATING PROTECTION ################################################################################################# # Initialization protectDict = dict() bam = Samfile(dnaseFileName,"rb") # Iterating on MPBSs for mpbs in mpbsList: # Fetching MPBSs grepFileName = outputFileName+"_grepmpbs.bed" to_remove.append(grepFileName) os.system("grep \"\t\""+mpbs+"\"\t\" "+mpbsFileNameBed+" | cut -f 1,2,3 | sort -k1,1 -k2,2n > "+grepFileName) # Intersect with footprints intFileName = outputFileName+"_fpint.bed" to_remove.append(intFileName) os.system("intersectBed -a "+grepFileName+" -b "+fpFileNameBed+" -wa -u > "+intFileName) # Iterating on MPBSs
def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None, genome_seq=None, re_name=None, verbose=False, clean=True, mapper=None, **kwargs): """ Parse sam/bam file using pysam tools. Keep a summary of the results into 2 tab-separated files that will contain 6 columns: read ID, Chromosome, position, strand (either 0 or 1), mapped sequence lebgth, position of the closest upstream RE site, position of the closest downstream RE site :param f_names1: a list of path to sam/bam files corresponding to the mapping of read1, can also be just one file :param f_names1: a list of path to sam/bam files corresponding to the mapping of read2, can also be just one file :param out_file1: path to outfile tab separated format containing mapped read1 information :param out_file1: path to outfile tab separated format containing mapped read2 information :param genome_seq: a dictionary generated by :func:`pyatdbit.parser.genome_parser.parse_fasta`. containing the genomic sequence :param re_name: name of the restriction enzyme used :param None mapper: software used to map (supported are GEM and BOWTIE2). Guessed from file by default. """ # not nice, dirty fix in order to allow this function to only parse # one SAM file if not out_file1: raise Exception('ERROR: out_file1 should be given\n') if not re_name: raise Exception('ERROR: re_name should be given\n') if not genome_seq: raise Exception('ERROR: genome_seq should be given\n') if (f_names2 and not out_file2) or (not f_names2 and out_file2): raise Exception('ERROR: out_file2 AND f_names2 needed\n') frag_chunk = kwargs.get('frag_chunk', 100000) if verbose: print 'Searching and mapping RE sites to the reference genome' frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk, verbose=verbose) if isinstance(f_names1, str): f_names1 = [f_names1] if isinstance(f_names2, str): f_names2 = [f_names2] if f_names2: fnames = f_names1, f_names2 outfiles = out_file1, out_file2 else: fnames = (f_names1,) outfiles = (out_file1, ) # max number of reads per intermediate files for sorting max_size = 1000000 windows = {} multis = {} procs = [] for read in range(len(fnames)): if verbose: print 'Loading read' + str(read + 1) windows[read] = {} num = 0 # iteration over reads nfile = 0 tmp_files = [] reads = [] for fnam in fnames[read]: try: fhandler = Samfile(fnam) except IOError: print 'WARNING: file "%s" not found' % fnam continue except ValueError: raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam) # get the iteration number of the iterative mapping try: num = int(fnam.split('.')[-1].split(':')[0]) except: num += 1 # set read counter windows[read].setdefault(num, 0) # guess mapper used if not mapper: mapper = fhandler.header['PG'][0]['ID'] if mapper.lower()=='gem': condition = lambda x: x[1][0][0] != 'N' elif mapper.lower() in ['bowtie', 'bowtie2']: condition = lambda x: 'XS' in dict(x) else: warn('WARNING: unrecognized mapper used to generate file\n') condition = lambda x: x[1][1] != 1 if verbose: print 'loading SAM file from %s: %s' % (mapper, fnam) # getrname chromosome names i = 0 crm_dict = {} while True: try: crm_dict[i] = fhandler.getrname(i) i += 1 except ValueError: break # iteration over reads sub_count = 0 # to empty read buffer for r in fhandler: if r.is_unmapped: continue if condition(r.tags): continue positive = not r.is_reverse crm = crm_dict[r.tid] len_seq = len(r.seq) if positive: pos = r.pos + 1 else: pos = r.pos + len_seq try: frag_piece = frags[crm][pos / frag_chunk] except KeyError: # Chromosome not in hash continue idx = 
bisect(frag_piece, pos) try: next_re = frag_piece[idx] except IndexError: # case where part of the read is mapped outside chromosome count = 0 while idx >= len(frag_piece) and count < len_seq: pos -= 1 count += 1 frag_piece = frags[crm][pos / frag_chunk] idx = bisect(frag_piece, pos) if count >= len_seq: raise Exception('Read mapped mostly outside ' + 'chromosome\n') next_re = frag_piece[idx] prev_re = frag_piece[idx - 1 if idx else 0] name = r.qname reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % ( name, crm, pos, positive, len_seq, prev_re, next_re)) windows[read][num] += 1 sub_count += 1 if sub_count >= max_size: sub_count = 0 nfile += 1 write_reads_to_file(reads, outfiles[read], tmp_files, nfile) nfile += 1 write_reads_to_file(reads, outfiles[read], tmp_files, nfile) # we have now sorted temporary files # we do merge sort for eah pair if verbose: stdout.write('Merge sort') stdout.flush() while len(tmp_files) > 1: file1 = tmp_files.pop(0) try: file2 = tmp_files.pop(0) except IndexError: break if verbose: stdout.write('.') stdout.flush() nfile += 1 tmp_files.append(merge_sort(file1, file2, outfiles[read], nfile)) if verbose: stdout.write('\n') tmp_name = tmp_files[0] if verbose: print 'Getting Multiple contacts' reads_fh = open(outfiles[read], 'w') ## Also pipe file header # chromosome sizes (in order) reads_fh.write('# Chromosome lengths (order matters):\n') for crm in genome_seq: reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm]))) reads_fh.write('# Mapped\treads count by iteration\n') for size in windows[read]: reads_fh.write('# MAPPED %d %d\n' % (size, windows[read][size])) ## Multicontacts tmp_reads_fh = open(tmp_name) try: read_line = tmp_reads_fh.next() except StopIteration: raise StopIteration('ERROR!\n Nothing parsed, check input files and' ' chromosome names (in genome.fasta and SAM/MAP' ' files).') prev_head = read_line.split('\t', 1)[0] prev_head = prev_head.split('~' , 1)[0] prev_read = read_line multis[read] = 0 for read_line in tmp_reads_fh: head = read_line.split('\t', 1)[0] head = head.split('~' , 1)[0] if head == prev_head: multis[read] += 1 prev_read = prev_read.strip() + '|||' + read_line else: reads_fh.write(prev_read) prev_read = read_line prev_head = head reads_fh.write(prev_read) reads_fh.close() if clean: os.system('rm -rf ' + tmp_name) # wait for compression to finish for p in procs: p.communicate() return windows, multis
class _BamReaderBase(ReaderBase): """ The BamReader class provides a high-level interface to PacBio BAM files. If a PacBio BAM index (bam.pbi file) is present and the user instantiates the BamReader using the reference FASTA as the second argument, the BamReader will provide an interface compatible with CmpH5Reader. """ def _loadReferenceInfo(self): refRecords = self.peer.header["SQ"] refNames = [r["SN"] for r in refRecords] refLengths = [r["LN"] for r in refRecords] refMD5s = [r["M5"] for r in refRecords] refIds = map(self.peer.gettid, refNames) nRefs = len(refRecords) if nRefs > 0: self._referenceInfoTable = np.rec.fromrecords(zip( refIds, refIds, refNames, refNames, refLengths, refMD5s, np.zeros(nRefs, dtype=np.uint32), np.zeros(nRefs, dtype=np.uint32)), dtype=[('ID', '<i8'), ('RefInfoID', '<i8'), ('Name', 'O'), ('FullName', 'O'), ('Length', '<i8'), ('MD5', 'O'), ('StartRow', '<u4'), ('EndRow', '<u4')]) self._referenceDict = {} self._referenceDict.update(zip(refIds, self._referenceInfoTable)) self._referenceDict.update(zip(refNames, self._referenceInfoTable)) else: self._referenceInfoTable = None self._referenceDict = None def _loadReadGroupInfo(self): rgs = self.peer.header["RG"] readGroupTable_ = [] pulseFeaturesInAll_ = frozenset(PULSE_FEATURE_TAGS.keys()) for rg in rgs: rgID = rgAsInt(rg["ID"]) rgName = rg["PU"] ds = dict([pair.split("=") for pair in rg["DS"].split(";") if pair != ""]) # spec: we only consider first two components of basecaller version # in "chem" lookup basecallerVersion = ".".join(ds["BASECALLERVERSION"].split(".")[0:2]) triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], basecallerVersion rgChem = decodeTriple(*triple) rgReadType = ds["READTYPE"] # TODO(dalexander): need FRAMERATEHZ in RG::DS! #rgFrameRate = ds["FRAMERATEHZ"] rgFrameRate = 75.0 readGroupTable_.append((rgID, rgName, rgReadType, rgChem, rgFrameRate)) pulseFeaturesInAll_ = pulseFeaturesInAll_.intersection(ds.keys()) self._readGroupTable = np.rec.fromrecords( readGroupTable_, dtype=[("ID" , np.int32), ("MovieName" , "O"), ("ReadType" , "O"), ("SequencingChemistry", "O"), ("FrameRate", float)]) assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \ "First 8 chars of read group IDs must be unique!" self._readGroupDict = { rg.ID : rg for rg in self._readGroupTable } self._pulseFeaturesAvailable = pulseFeaturesInAll_ def _loadProgramInfo(self): pgRecords = [ (pg["ID"], pg.get("VN", None), pg.get("CL", None)) for pg in self.peer.header.get("PG", []) ] if len(pgRecords) > 0: self._programTable = np.rec.fromrecords( pgRecords, dtype=[("ID" , "O"), ("Version", "O"), ("CommandLine", "O")]) else: self._programTable = None def _loadReferenceFasta(self, referenceFastaFname): ft = FastaTable(referenceFastaFname) # Verify that this FASTA is in agreement with the BAM's # reference table---BAM should be a subset. 
fastaIdsAndLens = set((c.id, len(c)) for c in ft) bamIdsAndLens = set((c.Name, c.Length) for c in self.referenceInfoTable) if not bamIdsAndLens.issubset(fastaIdsAndLens): raise ReferenceMismatch, "FASTA file must contain superset of reference contigs in BAM" self.referenceFasta = ft def _checkFileCompatibility(self): # Verify that this is a "pacbio" BAM file of version at least # 3.0b3 try: checkedVersion = self.version except: raise IncompatibleFile( "This BAM file is incompatible with this API " + "(only PacBio BAM files version >= 3.0b3 are supported)") def __init__(self, fname, referenceFastaFname=None): self.filename = fname = abspath(expanduser(fname)) self.peer = Samfile(fname, "rb", check_sq=False) self._checkFileCompatibility() self._loadReferenceInfo() self._loadReadGroupInfo() self._loadProgramInfo() self.referenceFasta = None if referenceFastaFname is not None: if self.isUnmapped: raise ValueError, "Unmapped BAM file--reference FASTA should not be given as argument to BamReader" self._loadReferenceFasta(referenceFastaFname) @property def isIndexLoaded(self): return self.index is not None @property def isReferenceLoaded(self): return self.referenceFasta is not None @property def isUnmapped(self): return not(self.isMapped) @property def isMapped(self): return len(self.peer.header["SQ"]) > 0 @property def alignmentIndex(self): raise UnavailableFeature("BAM has no alignment index") @property def movieNames(self): return set([mi.MovieName for mi in self.readGroupTable]) @property def readGroupTable(self): return self._readGroupTable def readGroupInfo(self, readGroupId): return self._readGroupDict[readGroupId] @property def sequencingChemistry(self): """ List of the sequencing chemistries by movie. Order is unspecified. """ return list(self.readGroupTable.SequencingChemistry) @property def referenceInfoTable(self): return self._referenceInfoTable #TODO: standard? how about subread instead? why capitalize ccs? # can we standardize this? is cDNA an additional possibility @property def readType(self): """ Either "standard", "CCS", "mixed", or "unknown", to represent the type of PacBio reads aligned in this BAM file. """ readTypes = self.readGroupTable.ReadType if all(readTypes == "SUBREAD"): return "standard" elif all(readTypes == "CCS"): return "CCS" elif all((readTypes == "CCS") | (readTypes == "SUBREAD")): return "mixed" else: return "unknown" @property def version(self): return self.peer.header["HD"]["pb"] def versionAtLeast(self, minimalVersion): raise Unimplemented() def softwareVersion(self, programName): raise Unimplemented() @property def isSorted(self): return self.peer.header["HD"]["SO"] == "coordinate" @property def isBarcoded(self): raise Unimplemented() @property def isEmpty(self): return (len(self) == 0) def referenceInfo(self, key): return self._referenceDict[key] def atOffset(self, offset): self.peer.seek(offset) return BamAlignment(self, next(self.peer)) def hasPulseFeature(self, featureName): return featureName in self._pulseFeaturesAvailable def pulseFeaturesAvailable(self): return self._pulseFeaturesAvailable @property def barcode(self): raise Unimplemented() @property def barcodeName(self): raise Unimplemented() @property def barcodes(self): raise Unimplemented() @requiresBai def __len__(self): return self.peer.mapped + self.peer.unmapped def close(self): if hasattr(self, "file") and self.file is not None: self.file.close() self.file = None def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self.close()
def main(argv=None): """Main script.""" ########################## # COMMAND-LINE ARGUMENTS # ########################## # Get myself program_name = sys.argv[0] if not argv: argv = sys.argv[1:] # Get the cluster type used to control arguments cluster_type = cluster.get_cluster_environment() parser = argparse.ArgumentParser( description=__doc__, add_help=False, epilog=EPILOG, formatter_class=run.CustomFormatter) req = parser.add_argument_group('Required arguments') req.add_argument('-m', '--mode', help='Operation mode', choices=['single', 'multi'], required=True, metavar='mode') req.add_argument('-s', '--snps', help='SNP BED file', required=True, metavar='<BED>') req.add_argument('-r', '--reads', help='Mapped reads file [sam or bam]', required=True, metavar='<[S/B]AM>') uni = parser.add_argument_group('Universal optional arguments') uni.add_argument('-p', '--prefix', help='Prefix for temp files and output', default='TEST', metavar='') uni.add_argument('-b', '--bam', action='store_true', dest='bam', help='Mapped read file type is bam (auto-detected if *.bam)') uni.add_argument('-n', '--noclean', action='store_true', help='Do not delete intermediate files (for debuging)') uni.add_argument('-R', '--random-seed', default=None, type=int, help='Set the state of the randomizer (for testing)') uni.add_argument('-h', '--help', action='help', help='show this help message and exit') mult = parser.add_argument_group('Multi(plex) mode arguments') mult.add_argument('-j', '--jobs', type=int, help='Divide into # of jobs', default=100, metavar='') if cluster_type == 'slurm' or cluster_type == 'torque': mult.add_argument('-w', '--walltime', help='Walltime for each job', default='3:00:00', metavar='') mult.add_argument('-k', '--mem', dest='memory', metavar='', help='Memory for each job', default='5000MB') mult.add_argument('--queue', help='Queue to submit jobs to', default='batch', metavar='') mult.add_argument('--cluster', choices=['torque', 'slurm', 'normal'], help='Which cluster to use, normal uses threads ' + 'on this machine', default=cluster_type) mult.add_argument('--threads', type=int, metavar='', default=cpu_count(), help='Max number of threads to run at a time ' + '(normal mode only).') single = parser.add_argument_group('Single mode arguments') single.add_argument('-f', '--suffix', default='', metavar='', help='Suffix for multiplexing [set automatically]') logging = parser.add_argument_group('Logging options') logging.add_argument('-q', '--quiet', action='store_true', help="Quiet mode, only prints warnings.") logging.add_argument('-v', '--verbose', action='store_true', help="Verbose mode, prints debug info too.") logging.add_argument('--logfile', help='Logfile to write messages too, default is ' + 'STDERR') args = parser.parse_args() if args.random_seed is not None: random.seed(args.random_seed) print("Seed: ", args.random_seed, random.getstate()[1][:10]) ########################################################################### # File Preparations # ########################################################################### # Take care of logging if args.logfile: logme.LOGFILE = args.logfile if args.quiet: logme.MIN_LEVEL = 'warn' elif args.verbose: logme.MIN_LEVEL = 'debug' # Initialize variables prefix = args.prefix + '_' # Make sure we can run ourselves if not run.is_exe(program_name): program_name = run.which(parser.prog) # Set the cluster type if we are in multi mode if args.mode == 'multi' and (cluster_type == 'slurm' or cluster_type == 'torque'): cluster.QUEUE = args.cluster # Check if the 
read file is sam or bam file_check = args.reads.split('.') file_check[-1] = file_check[-1].lower() sam_path, sam_file = os.path.split(args.reads) if args.reads.endswith('bam') or args.bam: mode = 'rb' else: mode = 'r' ################## # MULTIPLEX MODE # ################## # If we're running in multiplex mode if args.mode == 'multi': logme.log('Splitting sam file {} into {} files.'.format(sam_file, args.jobs)) reads_files = split_samfile(os.path.join(sam_path, sam_file), args.jobs, prefix) logme.log('Splitting complete.') # Create PBS scripts and submit jobs to the cluster subnoclean = ' --noclean' if args.noclean else '' logme.log('Submitting split files to cluster') jobs = [] # Hold job info for later checking for reads_file in reads_files: suffix = reads_file[-4:] command = ("python2 " + program_name + " --mode single --snps " + args.snps + " --reads " + reads_file + " --suffix " + suffix + " --prefix " + args.prefix + subnoclean + ' --bam') if cluster_type == 'normal': jobs.append(cluster.submit(command, name=prefix + suffix, threads=args.threads)) else: jobs.append(cluster.submit(command, name=prefix + suffix, time=args.walltime, cores=1, mem=args.memory, partition=args.queue)) sleep(2) # Pause for two seconds to make sure job is submitted # Now wait and check for all jobs to complete every so long logme.log('Submission done, waiting for jobs to complete.') # First wait for jobs in queue to complete cluster.wait(jobs) sleep(1) # Next, check if any jobs failed failed = [] for i in range(1, args.jobs+1): suffix = str(i).zfill(4) if not os.path.isfile(prefix + suffix + '_done'): failed.append(prefix + suffix) # If any jobs failed, terminate if failed: logme.log('Some jobs failed!', 'critical') return -1 logme.log('Jobs completed.') # Remove 'done' files in case we want to run again. os.system('rm {prefix}*_done'.format(prefix=prefix)) # Once the jobs are done, concatenate all of the counts into one file. 
# Initialize dictionaries tot_pos_counts = {} tot_neg_counts = {} tot_tot_counts = {} tot_sum_pos = {} tot_sum_neg = {} for i in range(1, args.jobs+1): suffix = str(i).zfill(4) in_counts = prefix + 'SNP_COUNTS_' + suffix # Parse the line to add it to the total file with run.open_zipped(in_counts, 'r') as in_counts: for line in in_counts: line = line.rstrip('\n') line_t = line.split('\t') if 'CHR' in line: continue pos = line_t[0] + '|' + line_t[1] pos_split = line_t[2].split('|') neg_split = line_t[3].split('|') if pos in tot_pos_counts or pos in tot_neg_counts or pos in tot_tot_counts: for j in range(len(pos_split)): tot_pos_counts[pos][j] += int(pos_split[j]) tot_neg_counts[pos][j] += int(neg_split[j]) tot_sum_pos[pos] += int(line_t[4]) tot_sum_neg[pos] += int(line_t[5]) tot_tot_counts[pos] += int(line_t[6]) else: tot_pos_counts[pos] = [0, 0, 0, 0] tot_neg_counts[pos] = [0, 0, 0, 0] tot_tot_counts[pos] = 0 tot_sum_pos[pos] = 0 tot_sum_neg[pos] = 0 for j in range(len(pos_split)): tot_pos_counts[pos][j] += int(pos_split[j]) tot_neg_counts[pos][j] += int(neg_split[j]) tot_sum_pos[pos] += int(line_t[4]) tot_sum_neg[pos] += int(line_t[5]) tot_tot_counts[pos] += int(line_t[6]) # Write out the final concatenated file with run.open_zipped(prefix + 'SNP_COUNTS.txt', 'w') as final_counts: final_counts.write('CHR\tPOSITION\tPOS_A|C|G|T\tNEG_A|C|G|T\t' + 'SUM_POS_READS\tSUM_NEG_READS\tSUM_READS\n') keys = sorted(tot_pos_counts.keys()) for key in keys: pos = key.split('|') pos_fix = [str(x) for x in tot_pos_counts[key]] neg_fix = [str(x) for x in tot_neg_counts[key]] pos_out = '|'.join(pos_fix) neg_out = '|'.join(neg_fix) final_counts.write(str(pos[0]) + '\t' + str(pos[1]) + '\t' + pos_out + '\t' + neg_out + '\t' + str(tot_sum_pos[key]) + '\t' + str(tot_sum_neg[key]) + '\t' + str(tot_tot_counts[key]) + '\n') # Sort the file numerically os.system('sort -k1,2 -n ' + prefix + 'SNP_COUNTS.txt ' + ' -o ' + prefix + 'SNP_COUNTS.txt') # Clean up intermediate files. if args.noclean is False: cluster.clean() os.system('rm {prefix}*COUNTS_* {prefix}*split_sam_*'.format( prefix=prefix)) ############### # SINGLE MODE # ############### # If we're running in single mode (each job submitted by multiplex mode # will be running in single mode) elif args.mode == 'single': # First read in the information on the SNPs that we're interested in. snps = {} # Initialize a dictionary of SNP positions with run.open_zipped(args.snps) as snp_file: for line in snp_file: line = line.rstrip('\n') line_t = line.split('\t') pos = chrom_to_num(line_t[0]) + '|' + str(line_t[2]) snps[pos] = line_t[3] # This is the dictionary of potential SNPs for each read. potsnp_dict = {} # Now parse the SAM file to extract only reads overlapping SNPs. in_sam = Samfile(args.reads, mode) references = in_sam.references # Faster to make a copy of references. # Trackers to count how many reads are lost at each step indel_skip = 0 nosnp_skip = 0 count = 0 snp_count = 0 ryo_filter = 0 for line in in_sam: count += 1 # Skip lines that overlap indels OR don't match Ns cigarstring = line.cigarstring if 'D' in cigarstring or 'I' in cigarstring: indel_skip += 1 continue # Split the tags to find the MD tag: tags = line.tags for tagname, tagval in tags: if tagname == 'MD' and 'N' in tagval: # Remember that, for now, we're not allowing reads that # overlap insertions/deletions. 
chrom = references[line.rname] pos = line.pos read = line.seq # We're assuming # correct mapping such that FIRST MATES on the NEGATIVE # STRAND are NEGATIVE, while SECOND MATES on the NEGATIVE # STRAND are POSITIVE. if line.is_reverse: orientation = '-' else: orientation = '+' # Parse the CIGAR string cigar_types, cigar_vals = split_CIGAR(cigarstring) if cigar_types[0] == 'S': MD_start = int(cigar_vals[0]) else: MD_start = 0 # Get the genomic positions corresponding to each base-pair # of the read read_genomic_positions = CIGAR_to_Genomic_Positions( cigar_types, cigar_vals, line.pos+1) # Get the tag data MD_split = re.findall('\d+|\D+', tagval) genome_start = 0 # The snp_pos dictionary will store the 1-based position # => allele snp_pos = {} for i in MD_split: if re.match('\^', i): pass elif i.isalpha(): if i == 'N': snp_pos[read_genomic_positions[genome_start]] = read[MD_start] MD_start += 1 genome_start += 1 else: MD_start += 1 genome_start += 1 else: MD_start += int(i) genome_start += int(i) for i in snp_pos: snp_count += 1 # RYO: START EDIT - Implemented Filter posVal = line.reference_name + '|' + str(i) if posVal not in snps: nosnp_skip += 1 continue # RYO: END EDIT - Implemented Filter snp = '{chr}|{i}\t{snp_pos}\t{orientation}'.format( chr=chrom, i=i, snp_pos=snp_pos[i], orientation=orientation) if line.qname in potsnp_dict: if snp not in potsnp_dict[line.qname]: # RYO EDIT HERE - added conditional so that # pairs of reads are not considered twice if # they both overlap the same snp. potsnp_dict[line.qname].append(snp) else: ryo_filter += 1 else: potsnp_dict[line.qname] = [] potsnp_dict[line.qname].append(snp) in_sam.close() # Log all of the skipped reads logme.log('Total reads: {}'.format(count), 'debug') logme.log('Reads skipped for indels: {}'.format(indel_skip), 'debug') logme.log('Total SNPs checked: {}'.format(snp_count), 'debug') logme.log('SNPs not in SNP list: {}'.format(nosnp_skip), 'debug') logme.log('Ryo filter: {}'.format(ryo_filter), 'debug') # Initialize the counting dictionaries pos_counts = {} neg_counts = {} # Go through the potential SNP dictionary and choose one SNP at random # for those overlapping multiple SNPs if args.random_seed is not None: # Dictionaries are unordered, so must sort for consistent random seed output. keys = sorted(list(potsnp_dict.keys())) else: # Because sorting is slow, only do it when a random seed is set; the slowdown is about 0.1 s per 1 million reads.
keys = list(potsnp_dict.keys()) for key in keys: snp = random.choice(potsnp_dict[key]).split('\t') if snp[0] in snps: if snp[0] in pos_counts or snp[0] in neg_counts: if snp[2] == '+': if snp[1] == 'A': pos_counts[snp[0]][0] += 1 if snp[1] == 'C': pos_counts[snp[0]][1] += 1 if snp[1] == 'G': pos_counts[snp[0]][2] += 1 if snp[1] == 'T': pos_counts[snp[0]][3] += 1 elif snp[2] == '-': if snp[1] == 'A': neg_counts[snp[0]][0] += 1 if snp[1] == 'C': neg_counts[snp[0]][1] += 1 if snp[1] == 'G': neg_counts[snp[0]][2] += 1 if snp[1] == 'T': neg_counts[snp[0]][3] += 1 else: pos_counts[snp[0]] = [0, 0, 0, 0] neg_counts[snp[0]] = [0, 0, 0, 0] if snp[2] == '+': if snp[1] == 'A': pos_counts[snp[0]][0] += 1 if snp[1] == 'C': pos_counts[snp[0]][1] += 1 if snp[1] == 'G': pos_counts[snp[0]][2] += 1 if snp[1] == 'T': pos_counts[snp[0]][3] += 1 elif snp[2] == '-': if snp[1] == 'A': neg_counts[snp[0]][0] += 1 if snp[1] == 'C': neg_counts[snp[0]][1] += 1 if snp[1] == 'G': neg_counts[snp[0]][2] += 1 if snp[1] == 'T': neg_counts[snp[0]][3] += 1 # Open the output file and write the SNP counts to it out_counts = prefix + 'SNP_COUNTS_' + args.suffix if args.suffix \ else prefix + 'SNP_COUNTS.txt' with open(out_counts, 'w') as out_counts: # Write header out_counts.write('CHR\tPOSITION\tPOS_A|C|G|T\tNEG_A|C|G|T\t' + 'SUM_POS_READS\tSUM_NEG_READS\tSUM_READS\n') # Sort SNP positions and write them keys = sorted(pos_counts.keys()) for key in keys: pos = key.split('|') sum_pos = sum(pos_counts[key]) sum_neg = sum(neg_counts[key]) tot_sum = sum(pos_counts[key]) + sum(neg_counts[key]) pos_fix = [str(x) for x in pos_counts[key]] neg_fix = [str(x) for x in neg_counts[key]] positive = '|'.join(pos_fix) negative = '|'.join(neg_fix) out_counts.write(pos[0] + '\t' + pos[1] + '\t' + positive + '\t' + negative + '\t' + str(sum_pos) + '\t' + str(sum_neg) + '\t' + str(tot_sum) + '\n') if args.suffix: os.system('touch ' + prefix + args.suffix + '_done')
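The strand-aware tallies above repeat the same four-way if-chain for every base and strand; the bookkeeping can be illustrated more compactly with a base-to-index map. A minimal sketch, assuming the same (position, base, orientation) triples the script stores in `snp`; the helper name `tally_snp` is hypothetical and not part of the original code:

# Hypothetical helper mirroring the per-strand A|C|G|T counting above.
BASE_INDEX = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

def tally_snp(snp, pos_counts, neg_counts):
    position, base, orientation = snp
    # Both dictionaries are initialised for a new position, as in the original.
    pos_counts.setdefault(position, [0, 0, 0, 0])
    neg_counts.setdefault(position, [0, 0, 0, 0])
    counts = pos_counts if orientation == '+' else neg_counts
    if base in BASE_INDEX:  # anything else (e.g. 'N') is ignored, as in the if-chains above
        counts[position][BASE_INDEX[base]] += 1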
def get_raw_signal(arguments): (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism, window_size, forward_shift, reverse_shift) = arguments mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1") mpbs1.read(mpbs_file1) mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2") mpbs2.read(mpbs_file2) mpbs = mpbs1.combine(mpbs2, output=True) mpbs.sort() bam1 = Samfile(reads_file1, "rb") bam2 = Samfile(reads_file2, "rb") genome_data = GenomeData(organism) fasta = Fastafile(genome_data.get_genome()) signal_1 = np.zeros(window_size) signal_2 = np.zeros(window_size) motif_len = None pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size), ("G", [0.0] * window_size), ("T", [0.0] * window_size), ("N", [0.0] * window_size)]) mpbs_regions = mpbs.by_names([mpbs_name]) num_motif = len(mpbs_regions) for region in mpbs_regions: if motif_len is None: motif_len = region.final - region.initial mid = (region.final + region.initial) // 2 p1 = mid - window_size // 2 p2 = mid + window_size // 2 if p1 <= 0: continue # Fetch raw signal for read in bam1.fetch(region.chrom, p1, p2): # check if the read is unmapped, according to issue #112 if read.is_unmapped: continue if not read.is_reverse: cut_site = read.pos + forward_shift if p1 <= cut_site < p2: signal_1[cut_site - p1] += 1.0 else: cut_site = read.aend + reverse_shift - 1 if p1 <= cut_site < p2: signal_1[cut_site - p1] += 1.0 for read in bam2.fetch(region.chrom, p1, p2): # check if the read is unmapped, according to issue #112 if read.is_unmapped: continue if not read.is_reverse: cut_site = read.pos + forward_shift if p1 <= cut_site < p2: signal_2[cut_site - p1] += 1.0 else: cut_site = read.aend + reverse_shift - 1 if p1 <= cut_site < p2: signal_2[cut_site - p1] += 1.0 update_pwm(pwm, fasta, region, p1, p2) return signal_1, signal_2, motif_len, pwm, num_motif
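A minimal invocation sketch for the two-condition extractor above, parallelised over motif names with multiprocessing; every file name, the organism tag, the window size and the shift values are placeholders, not part of the original code:

from multiprocessing import Pool

motif_names = ["CTCF", "GATA1"]  # hypothetical MPBS names present in both BED files
args_list = [(name, "mpbs_cond1.bed", "mpbs_cond2.bed",
              "cond1.bam", "cond2.bam", "hg38",
              200, 4, -5)  # window_size, forward_shift, reverse_shift
             for name in motif_names]

with Pool(processes=4) as pool:
    results = pool.map(get_raw_signal, args_list)

for name, (signal_1, signal_2, motif_len, pwm, num_motif) in zip(motif_names, results):
    print(name, num_motif, signal_1.sum(), signal_2.sum())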
def split_samfile(sam_file, splits, prefix='', path=''): """Take a sam/bam file and split it `splits` number of times. :path: Where to put the split files. :prefix: A prefix for the outfile names. :returns: A tuple of job files. """ # Determine how many reads will be in each split sam file. num_lines = count_reads(sam_file) num_reads = int(int(num_lines)/splits) + 1 # Get rid of starting path sam_name = os.path.basename(sam_file) # Subset the SAM file into X number of jobs cnt = 0 currjob = 1 suffix = '.split_sam_' + str(currjob).zfill(4) run_file = os.path.join(path, prefix + sam_name + suffix) rmode = 'rb' if sam_name.split('.')[-1].lower() == 'bam' else 'r'  # check the extension, not the first name component wmode = 'wb' # Actually split the file outfiles = [run_file] with Samfile(sam_file, rmode) as in_sam: sam_split = Samfile(run_file, wmode, template=in_sam) for line in in_sam: cnt += 1 if cnt < num_reads: sam_split.write(line) elif cnt == num_reads: # Check if next line is mate-pair. If so, don't split. line2 = next(in_sam) currjob += 1 suffix = '.split_sam_' + str(currjob).zfill(4) run_file = os.path.join(path, prefix + sam_name + suffix) new_sam = Samfile(run_file, wmode, template=in_sam) outfiles.append(run_file) if line.qname == line2.qname: sam_split.write(line) sam_split.write(line2) sam_split.close() cnt = 0 else: sam_split.write(line) sam_split.close() new_sam.write(line2) cnt = 0 sam_split = new_sam sam_split.close() return tuple(outfiles)
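A short usage sketch for split_samfile; the input BAM name and output directory are placeholders:

# Split one BAM into 4 roughly equal chunks, keeping mate pairs together,
# and collect the chunk paths for downstream job submission.
chunks = split_samfile('reads.bam', splits=4, prefix='job_', path='/tmp/splits')
for chunk in chunks:
    print('chunk written:', chunk)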
def parse_sam(f_names1, f_names2, frags, out_file1, out_file2, genome_seq, re_name, verbose=False, **kwargs): """ Parse sam/bam file using pysam tools. Keep a summary of the results in 2 tab-separated files that will contain 7 columns: read ID, Chromosome, position, strand (either 0 or 1), mapped sequence length, position of the closest upstream RE site, position of the closest downstream RE site :param f_names1: a list of paths to sam/bam files corresponding to the mapping of read1, can also be just one file :param f_names2: a list of paths to sam/bam files corresponding to the mapping of read2, can also be just one file :param frags: a dictionary generated by :func:`pytadbit.mapping.restriction_enzymes.map_re_sites`. """ frags = map_re_sites(re_name, genome_seq, verbose=True) frag_chunk = kwargs.get('frag_chunk', 100000) fnames = f_names1, f_names2 outfiles = out_file1, out_file2 for read in range(2): if verbose: print 'Loading read' + str(read + 1) reads = [] for fnam in fnames[read]: if verbose: print 'loading file:', fnam try: fhandler = Samfile(fnam) except IOError: continue i = 0 crm_dict = {} while True: try: crm_dict[i] = fhandler.getrname(i).replace('chr', '') i += 1 except ValueError: break for r in fhandler: if r.is_unmapped: continue if r.tags[1][1] != 1: continue positive = not r.is_reverse crm = crm_dict[r.tid] len_seq = len(r.seq) pos = r.pos + (0 if positive else len_seq) try: frag_piece = frags[crm][pos / frag_chunk] except KeyError: # Chromosome not in hash continue idx = bisect(frag_piece, pos) prev_re = frag_piece[idx - 1] next_re = frag_piece[idx] name = r.qname reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % ( name, crm, pos, positive, len_seq, prev_re, next_re)) reads_fh = open(outfiles[read], 'w') reads_fh.write(''.join(sorted(reads))) reads_fh.close() del reads
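The two summary files written above are plain tab-separated text with seven columns; a sketch of loading one back with pandas, using column names taken from the docstring (the file name is a placeholder):

import pandas as pd

cols = ['read_id', 'chrom', 'pos', 'strand', 'mapped_len', 'upstream_re', 'downstream_re']
reads1 = pd.read_csv('reads1_parsed.tsv', sep='\t', names=cols, header=None)
# strand is written as 1 for forward-mapped reads and 0 for reverse-mapped reads
print(reads1.groupby('chrom').size())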
from progressbar import ProgressBar import pandas as pd try: import matplotlib.pyplot as mpl mpl.figure() mpl.plot([1,2]) mpl.close() has_mpl = True except Exception: # matplotlib is missing or has no usable backend has_mpl = False from pysam import Samfile if __name__ == "__main__": window_size = int(10e3) mel = Samfile('analysis/on_mel/mel_gdna_bowtie2_dedup.bam') sim = Samfile('analysis/on_mel/sim_gdna_bowtie2_dedup.bam') posns = [] mel_covs = [] sim_covs = [] prog = 0 pbar = ProgressBar(max_value=sum(mel.lengths) + 1) for r, l in zip(mel.references, mel.lengths): for start in range(0, l, window_size): posns.append('{}_{}'.format(r, start)) mel_covs.append(sum(sum(i) for i in mel.count_coverage(r, start, start+window_size))) sim_covs.append(sum(sum(i) for i in sim.count_coverage(r, start, start+window_size))) prog += l
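The loop above only fills the three parallel lists; a possible continuation (not part of the original snippet) that assembles them into a table with the pandas import already present and, if matplotlib survived the backend check, plots one sample against the other. The output paths are placeholders:

cov = pd.DataFrame({'mel': mel_covs, 'sim': sim_covs}, index=posns)
cov.to_csv('analysis/on_mel/window_coverage.tsv', sep='\t')
if has_mpl:
    mpl.figure()
    mpl.loglog(cov.mel + 1, cov.sim + 1, '.', alpha=0.3)  # +1 keeps empty windows on the log axes
    mpl.xlabel('mel gDNA coverage per 10 kb window')
    mpl.ylabel('sim gDNA coverage per 10 kb window')
    mpl.savefig('analysis/on_mel/window_coverage.png')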
def ace(args): """ %prog ace bamfile fastafile convert bam format to ace format. This often allows the remapping to be assessed as a denovo assembly format. bam file needs to be indexed. also creates a .mates file to be used in amos/bambus, and .astat file to mark whether the contig is unique or repetitive based on A-statistics in Celera assembler. """ p = OptionParser(ace.__doc__) p.add_option("--splitdir", dest="splitdir", default="outRoot", help="split the ace per contig to dir [default: %default]") p.add_option("--unpaired", dest="unpaired", default=False, help="remove read pairs on the same contig [default: %default]") p.add_option("--minreadno", dest="minreadno", default=3, type="int", help="minimum read numbers per contig [default: %default]") p.add_option("--minctgsize", dest="minctgsize", default=100, type="int", help="minimum contig size per contig [default: %default]") p.add_option("--astat", default=False, action="store_true", help="create .astat to list repetitiveness [default: %default]") p.add_option("--readids", default=False, action="store_true", help="create file of mapped and unmapped ids [default: %default]") from pysam import Samfile opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) bamfile, fastafile = args astat = opts.astat readids = opts.readids f = Fasta(fastafile) prefix = bamfile.split(".")[0] acefile = prefix + ".ace" readsfile = prefix + ".reads" astatfile = prefix + ".astat" logging.debug("Load {0}".format(bamfile)) s = Samfile(bamfile, "rb") ncontigs = s.nreferences genomesize = sum(x for a, x in f.itersizes()) logging.debug("Total {0} contigs with size {1} base".format(ncontigs, genomesize)) qual = "20" # default qual totalreads = sum(s.count(x) for x in s.references) logging.debug("Total {0} reads mapped".format(totalreads)) fw = open(acefile, "w") if astat: astatfw = open(astatfile, "w") if readids: readsfw = open(readsfile, "w") print >> fw, "AS {0} {1}".format(ncontigs, totalreads) print >> fw for i, contig in enumerate(s.references): cseq = f[contig] nbases = len(cseq) mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped] nreads = len(mapped_reads) nsegments = 0 print >> fw, "CO {0} {1} {2} {3} U".format(contig, nbases, nreads, nsegments) print >> fw, fill(str(cseq.seq)) print >> fw if astat: astat = Astat(nbases, nreads, genomesize, totalreads) print >> astatfw, "{0}\t{1:.1f}".format(contig, astat) text = fill([qual] * nbases, delimiter=" ", width=30) print >> fw, "BQ\n{0}".format(text) print >> fw rnames = [] for a in mapped_reads: readname = a.qname rname = readname if readids: print >> readsfw, readname rnames.append(rname) strand = "C" if a.is_reverse else "U" paddedstart = a.pos + 1 # 0-based to 1-based af = "AF {0} {1} {2}".format(rname, strand, paddedstart) print >> fw, af print >> fw for a, rname in zip(mapped_reads, rnames): aseq, npadded = cigar_to_seq(a) if aseq is None: continue ninfos = 0 ntags = 0 alen = len(aseq) rd = "RD {0} {1} {2} {3}\n{4}".format(rname, alen, ninfos, ntags, fill(aseq)) qs = "QA 1 {0} 1 {0}".format(alen) print >> fw, rd print >> fw print >> fw, qs print >> fw
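The ace() function above follows the jcvi convention of a subcommand that takes its own argument list; a minimal invocation sketch with placeholder file names (the BAM must be indexed, as the docstring notes). With this input the outputs land next to the BAM prefix, i.e. remapped.ace and remapped.astat:

# Convert an indexed BAM plus its reference FASTA to ACE format,
# also writing the .astat repetitiveness report.
ace(['remapped.bam', 'contigs.fasta', '--astat'])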
class _BamReaderBase(object): """ The BamReader class provides a high-level interface to PacBio BAM files. If a PacBio BAM index (bam.pbi file) is present and the user instantiates the BamReader using the reference FASTA as the second argument, the BamReader will provide an interface compatible with CmpH5Reader. """ def _loadReferenceInfo(self): refRecords = self.peer.header["SQ"] refNames = [r["SN"] for r in refRecords] refLengths = [r["LN"] for r in refRecords] refMD5s = [r["M5"] for r in refRecords] refIds = map(self.peer.gettid, refNames) nRefs = len(refRecords) self._referenceInfoTable = np.rec.fromrecords(zip( refIds, refIds, refNames, refNames, refLengths, refMD5s, np.zeros(nRefs, dtype=np.uint32), np.zeros(nRefs, dtype=np.uint32)), dtype=[('ID', '<i8'), ('RefInfoID', '<i8'), ('Name', 'O'), ('FullName', 'O'), ('Length', '<i8'), ('MD5', 'O'), ('StartRow', '<u4'), ('EndRow', '<u4')]) self._referenceDict = {} self._referenceDict.update(zip(refIds, self._referenceInfoTable)) self._referenceDict.update(zip(refNames, self._referenceInfoTable)) def _loadReadGroupInfo(self): rgs = self.peer.header["RG"] readGroupTable_ = [] pulseFeaturesInAll_ = frozenset(PULSE_FEATURE_TAGS.keys()) for rg in rgs: # Regarding RG ID: BLASR currently outputs a hex digest of # 10 nibbles, instead of the 8 which would fit into a # 32-bit word. So we truncate here for the purposes of # cross-referencing within this API and the PacBioBamIndex # API. We do check for a collision below. rgID = int(rg["ID"][:8], 16) rgName = rg["PU"] ds = dict([pair.split("=") for pair in rg["DS"].split(";") if pair != ""]) triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], ds["SOFTWAREVERSION"] rgChem = decodeTriple(*triple) rgReadType = ds["READTYPE"] readGroupTable_.append((rgID, rgName, rgReadType, rgChem)) pulseFeaturesInAll_ = pulseFeaturesInAll_.intersection(ds.keys()) self._readGroupTable = np.rec.fromrecords( readGroupTable_, dtype=[("ID" , np.uint32), ("MovieName" , "O"), ("ReadType" , "O"), ("SequencingChemistry", "O")]) assert len(set(self._readGroupTable.ID)) == len(self._readGroupTable), \ "First 8 chars of read group IDs must be unique!" self._readGroupDict = { rg.ID : rg for rg in self._readGroupTable } self._pulseFeaturesAvailable = pulseFeaturesInAll_ def _loadProgramInfo(self): # TODO: guarantee that these fields are nonoptional in our bams --- check with Marcus # TODO: are we interesting in the PP info? self._programTable = np.rec.fromrecords( [ (pg["ID"], pg.get("VN", None), pg.get("CL", None)) for pg in self.peer.header["PG"] ], dtype=[("ID" , "O"), ("Version", "O"), ("CommandLine", "O")]) def _loadReferenceFasta(self, referenceFastaFname): ft = FastaTable(referenceFastaFname) # Verify that this FASTA is in agreement with the BAM's # reference table---BAM should be a subset. fastaIdsAndLens = set((c.id, c.length) for c in ft) bamIdsAndLens = set((c.Name, c.Length) for c in self.referenceInfoTable) if not bamIdsAndLens.issubset(fastaIdsAndLens): raise ReferenceMismatch, "FASTA file must contain superset of reference contigs in BAM" self.referenceFasta = ft def __init__(self, fname, referenceFastaFname=None): self.filename = fname = abspath(expanduser(fname)) self.peer = Samfile(fname, "rb") # Check for sortedness, index. # There doesn't seem to be a "public" way to do this right # now, but that's fine because we're going to have to rewrite # it all anyway once the pysam rewrite lands. 
if not self.peer._hasIndex: raise ValueError, "Specified bam file lacks a bam index---required for this API" self._loadReferenceInfo() self._loadReadGroupInfo() self._loadProgramInfo() self.referenceFasta = None if referenceFastaFname is not None: self._loadReferenceFasta(referenceFastaFname) @property def isIndexLoaded(self): return self.index is not None @property def isReferenceLoaded(self): return self.referenceFasta is not None def attach(self, fofnFilename): self.basH5Collection = BasH5Collection(fofnFilename) @property def moviesAttached(self): return (self.basH5Collection is not None) @property def alignmentIndex(self): raise UnavailableFeature("BAM has no alignment index") #TODO: change concept to readGroupTable in cmp.h5 @property def movieInfoTable(self): raise Unimplemented() # TODO: change to read group accessor, this is semantically wrong now def movieInfo(self, movieId): raise Unimplemented() @property def movieNames(self): return set([mi.MovieName for mi in self.readGroupTable]) @property def readGroupTable(self): return self._readGroupTable def readGroup(self, readGroupId): return self._readGroupDict[readGroupId] @property def sequencingChemistry(self): """ List of the sequencing chemistries by movie. Order is unspecified. """ return list(self.readGroupTable.SequencingChemistry) #TODO: elide "Info" innames? @property def referenceInfoTable(self): return self._referenceInfoTable #TODO: standard? how about subread instead? why capitalize ccs? # can we standardize this? is cDNA an additional possibility @property def readType(self): """ Either "standard", "CCS", "mixed", or "unknown", to represent the type of PacBio reads aligned in this BAM file. """ readTypes = self.readGroupTable.ReadType if all(readTypes == "SUBREAD"): return "standard" elif all(readTypes == "CCS"): return "CCS" elif all((readTypes == "CCS") | (readTypes == "SUBREAD")): return "mixed" else: return "unknown" #TODO: Marcus needs to put something in the spec for this @property def version(self): raise Unimplemented() #TODO: Marcus needs to put something in the spec for this def versionAtLeast(self, minimalVersion): raise Unimplemented() def softwareVersion(self, programName): raise Unimplemented() @property def isSorted(self): return True @property def isBarcoded(self): raise Unimplemented() @property def isEmpty(self): return (len(self) == 0) # TODO: make this private in cmp.h5 reader def alignmentGroup(self, alnGroupId): raise UnavailableFeature("BAM has no HDF5 groups") def referenceInfo(self, key): return self._referenceDict[key] def atOffset(self, offset): self.peer.seek(offset) return BamAlignment(self, next(self.peer)) def hasPulseFeature(self, featureName): return featureName in self._pulseFeaturesAvailable def pulseFeaturesAvailable(self): return self._pulseFeaturesAvailable @property def barcode(self): raise Unimplemented() @property def barcodeName(self): raise Unimplemented() @property def barcodes(self): raise Unimplemented() def __repr__(self): return "<%s for %s>" % (type(self).__name__, self.filename) def __len__(self): return self.peer.mapped def close(self): if hasattr(self, "file") and self.file is not None: self.file.close() self.file = None def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self.close()
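A usage sketch for a concrete reader built on _BamReaderBase (pbcore exposes such subclasses, e.g. BamReader); the BAM and FASTA names are placeholders, and the BAM must carry an index since the constructor refuses files without one:

with BamReader('aligned_subreads.bam', 'reference.fasta') as reader:
    print(reader.readType)                # 'standard', 'CCS', 'mixed' or 'unknown'
    print(len(reader), 'mapped records')  # __len__ reports the BAM's mapped-read count
    for row in reader.referenceInfoTable:
        print(row.Name, row.Length)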