def parse_bam_differential(afn, bfn, regs, step):
    """(internal) Parses two bam files in differential mode.

    Every region ``(chrom, start, end, ...)`` of ``regs`` is cut into
    windows of ``step`` bases and the reads mapping onto each window are
    counted in both files. No normalization is done at this step.

    Args:
        afn: path of the first bam file (converted with ``str``).
        bfn: path of the second bam file (converted with ``str``).
        regs: iterable of regions; only the first three items of each
            region (chromosome, start, end) are used.
        step: window width, also the stride between window starts.

    Returns:
        A pair ``(acount, bcount)`` of parallel lists holding the window
        counts of the first and second file. A ``-1`` is appended to both
        lists after the windows of each region, marking the region end.
    """
    abam = Samfile(str(afn), "rb")
    bbam = None
    acount = []
    bcount = []
    try:
        bbam = Samfile(str(bfn), "rb")
        # NOTE: the progress log assumes the first region is on "chr1";
        # kept as-is so the emitted log lines do not change.
        oldchr = "chr1"
        for reg in regs:
            chrom, start, end = reg[:3]
            if chrom != oldchr:
                log("files: %s - %s : %s counted" % (afn, bfn, oldchr))
                oldchr = chrom
            # this could be improved
            # NOTE: the last window of a region ends at s + step, which may
            # lie beyond `end` when the region length is not a multiple of
            # step.
            for s in range(start, end, step):
                e = s + step
                acount.append(abam.count(chrom, s, e))
                bcount.append(bbam.count(chrom, s, e))
            # Region separator.
            acount.append(-1)
            bcount.append(-1)
        log("files: %s - %s : %s counted (finished)" % (afn, bfn, oldchr))
    finally:
        # Release both bam handles even when counting fails half-way.
        if bbam is not None:
            bbam.close()
        abam.close()
    return acount, bcount
def parse_bam_absolute(fn, regs):
    """(internal) Parses bam file in absolute mode.

    Proceeds by counting reads mapping onto a segment (chr, start, end)
    and normalizes the count by the segment's length.

    Args:
        fn: path of the bam file (converted with ``str``).
        regs: iterable of regions; only the first three items of each
            region (chromosome, start, end) are used.

    Returns:
        A list with one float per region: read count / (end - start).

    Raises:
        ZeroDivisionError: if a region has ``end == start``.
    """
    bam = Samfile(str(fn), "rb")
    try:
        count = []
        for reg in regs:
            chrom, start, end = reg[:3]
            n = bam.count(chrom, start, end)
            count.append(float(n) / (end - start))
    finally:
        # The handle used to be leaked; close it on success and on error.
        bam.close()
    return count
def ace(args):
    """
    %prog ace bamfile fastafile

    convert bam format to ace format. This often allows the remapping to be
    assessed as a denovo assembly format. bam file needs to be indexed. also
    creates a .mates file to be used in amos/bambus, and .astat file to mark
    whether the contig is unique or repetitive based on A-statistics in Celera
    assembler.
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # handed to OptionParser), so implementation notes live in comments.
    #
    # args: list of command-line tokens (options + bamfile + fastafile).
    # Output files are named after the part of `bamfile` before its first
    # ".": <prefix>.ace always, <prefix>.astat with --astat and
    # <prefix>.reads with --readids.
    p = OptionParser(ace.__doc__)
    p.add_option(
        "--splitdir",
        dest="splitdir",
        default="outRoot",
        help="split the ace per contig to dir",
    )
    p.add_option(
        "--unpaired",
        dest="unpaired",
        default=False,
        help="remove read pairs on the same contig",
    )
    p.add_option(
        "--minreadno",
        dest="minreadno",
        default=3,
        type="int",
        help="minimum read numbers per contig",
    )
    p.add_option(
        "--minctgsize",
        dest="minctgsize",
        default=100,
        type="int",
        help="minimum contig size per contig",
    )
    p.add_option(
        "--astat",
        default=False,
        action="store_true",
        help="create .astat to list repetitiveness",
    )
    p.add_option(
        "--readids",
        default=False,
        action="store_true",
        help="create file of mapped and unmapped ids",
    )

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    astat = opts.astat
    readids = opts.readids

    f = Fasta(fastafile)
    prefix = bamfile.split(".")[0]
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")
    fw = astatfw = readsfw = None
    try:
        ncontigs = s.nreferences
        genomesize = sum(x for a, x in f.itersizes())
        logging.debug(
            "Total {0} contigs with size {1} base".format(ncontigs, genomesize)
        )
        qual = "20"  # default qual

        totalreads = sum(s.count(x) for x in s.references)
        logging.debug("Total {0} reads mapped".format(totalreads))

        fw = open(acefile, "w")
        if astat:
            astatfw = open(astatfile, "w")
        if readids:
            readsfw = open(readsfile, "w")

        print("AS {0} {1}".format(ncontigs, totalreads), file=fw)
        print(file=fw)

        for contig in s.references:
            cseq = f[contig]
            nbases = len(cseq)

            mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
            nreads = len(mapped_reads)

            nsegments = 0
            print(
                "CO {0} {1} {2} {3} U".format(contig, nbases, nreads, nsegments),
                file=fw,
            )
            print(fill(str(cseq.seq)), file=fw)
            print(file=fw)

            if astat:
                # Keep the per-contig value in its own name: the original
                # code re-bound `astat` here, so a contig whose statistic
                # was 0 switched the --astat output off for every contig
                # that followed.
                astat_value = Astat(nbases, nreads, genomesize, totalreads)
                print("{0}\t{1:.1f}".format(contig, astat_value), file=astatfw)

            text = fill([qual] * nbases, delimiter=" ", width=30)
            print("BQ\n{0}".format(text), file=fw)
            print(file=fw)

            rnames = []
            for a in mapped_reads:
                readname = a.qname
                rname = readname

                if readids:
                    print(readname, file=readsfw)
                rnames.append(rname)

                strand = "C" if a.is_reverse else "U"
                paddedstart = a.pos + 1  # 0-based to 1-based
                af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
                print(af, file=fw)

            print(file=fw)

            for a, rname in zip(mapped_reads, rnames):
                aseq, _ = cigar_to_seq(a)
                if aseq is None:
                    # No sequence could be derived for this read; it still
                    # has an AF line above but gets no RD/QA record.
                    continue

                ninfos = 0
                ntags = 0
                alen = len(aseq)
                rd = "RD {0} {1} {2} {3}\n{4}".format(
                    rname, alen, ninfos, ntags, fill(aseq)
                )
                qs = "QA 1 {0} 1 {0}".format(alen)

                print(rd, file=fw)
                print(file=fw)
                print(qs, file=fw)
                print(file=fw)
    finally:
        # Flush and release every handle, also when conversion fails.
        for handle in (fw, astatfw, readsfw, s):
            if handle is not None:
                handle.close()
def ace(args):
    """
    %prog ace bamfile fastafile

    convert bam format to ace format. This often allows the remapping to be
    assessed as a denovo assembly format. bam file needs to be indexed. also
    creates a .mates file to be used in amos/bambus, and .astat file to mark
    whether the contig is unique or repetitive based on A-statistics in Celera
    assembler.
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # handed to OptionParser), so implementation notes live in comments.
    #
    # args: list of command-line tokens (options + bamfile + fastafile).
    # Output files are named after the part of `bamfile` before its first
    # ".": <prefix>.ace always, <prefix>.astat with --astat and
    # <prefix>.reads with --readids.
    #
    # Records are emitted with file.write() instead of the Python-2-only
    # `print >> handle` statement; the bytes written are the same and the
    # code runs on both Python 2 and Python 3.
    p = OptionParser(ace.__doc__)
    p.add_option("--splitdir", dest="splitdir", default="outRoot",
                 help="split the ace per contig to dir [default: %default]")
    p.add_option("--unpaired", dest="unpaired", default=False,
                 help="remove read pairs on the same contig [default: %default]")
    p.add_option("--minreadno", dest="minreadno", default=3, type="int",
                 help="minimum read numbers per contig [default: %default]")
    p.add_option("--minctgsize", dest="minctgsize", default=100, type="int",
                 help="minimum contig size per contig [default: %default]")
    p.add_option("--astat", default=False, action="store_true",
                 help="create .astat to list repetitiveness [default: %default]")
    p.add_option("--readids", default=False, action="store_true",
                 help="create file of mapped and unmapped ids [default: %default]")

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    astat = opts.astat
    readids = opts.readids

    f = Fasta(fastafile)
    prefix = bamfile.split(".")[0]
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")
    fw = astatfw = readsfw = None
    try:
        ncontigs = s.nreferences
        genomesize = sum(x for a, x in f.itersizes())
        logging.debug("Total {0} contigs with size {1} base".format(ncontigs, genomesize))
        qual = "20"  # default qual

        totalreads = sum(s.count(x) for x in s.references)
        logging.debug("Total {0} reads mapped".format(totalreads))

        fw = open(acefile, "w")
        if astat:
            astatfw = open(astatfile, "w")
        if readids:
            readsfw = open(readsfile, "w")

        fw.write("AS {0} {1}".format(ncontigs, totalreads) + "\n")
        fw.write("\n")

        for contig in s.references:
            cseq = f[contig]
            nbases = len(cseq)

            mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
            nreads = len(mapped_reads)

            nsegments = 0
            fw.write("CO {0} {1} {2} {3} U".format(contig, nbases, nreads, nsegments) + "\n")
            fw.write(fill(str(cseq.seq)) + "\n")
            fw.write("\n")

            if astat:
                # Keep the per-contig value in its own name: the original
                # code re-bound `astat` here, so a contig whose statistic
                # was 0 switched the --astat output off for every contig
                # that followed.
                astat_value = Astat(nbases, nreads, genomesize, totalreads)
                astatfw.write("{0}\t{1:.1f}".format(contig, astat_value) + "\n")

            text = fill([qual] * nbases, delimiter=" ", width=30)
            fw.write("BQ\n{0}".format(text) + "\n")
            fw.write("\n")

            rnames = []
            for a in mapped_reads:
                readname = a.qname
                rname = readname

                if readids:
                    readsfw.write(readname + "\n")
                rnames.append(rname)

                strand = "C" if a.is_reverse else "U"
                paddedstart = a.pos + 1  # 0-based to 1-based
                af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
                fw.write(af + "\n")

            fw.write("\n")

            for a, rname in zip(mapped_reads, rnames):
                aseq, _ = cigar_to_seq(a)
                if aseq is None:
                    # No sequence could be derived for this read; it still
                    # has an AF line above but gets no RD/QA record.
                    continue

                ninfos = 0
                ntags = 0
                alen = len(aseq)
                rd = "RD {0} {1} {2} {3}\n{4}".format(rname, alen, ninfos, ntags, fill(aseq))
                qs = "QA 1 {0} 1 {0}".format(alen)

                fw.write(rd + "\n")
                fw.write("\n")
                fw.write(qs + "\n")
                fw.write("\n")
    finally:
        # Flush and release every handle, also when conversion fails.
        for handle in (fw, astatfw, readsfw, s):
            if handle is not None:
                handle.close()
def _run_rscript(script, arguments):
    """Run ``Rscript script arguments...`` without going through a shell."""
    import subprocess

    try:
        # An argument list keeps paths containing spaces or shell
        # metacharacters intact. The exit status is ignored, as it was
        # with os.system.
        subprocess.call(["Rscript", script] + [str(a) for a in arguments])
    except OSError as err:
        # Plotting stays best-effort: the table has already been written.
        print("WARNING: could not run Rscript on %s: %s" % (script, err))


def create_multi_table(max_dist, alias_file_name, genes_file_name, exp_file_name,
                       dsb_file_name, dist_file_name, output_location):
    """Write a gene / distance / expression / DSB table and plot it.

    For every gene of the BED file that lies on chr1-22 or chrX, has a
    distance below ``max_dist`` and a positive expression, one line
    ``GENE<TAB>DISTANCE<TAB>EXPRESSION<TAB>DSB`` is written to
    ``table.txt`` inside ``output_location``. Afterwards ``3Dplot.R`` and
    ``2Dplot.R`` (looked up next to this script) are run through Rscript
    to produce the PDF plots in the same directory.

    Args:
        max_dist: genes with a distance >= this value are skipped.
        alias_file_name: input of ``read_alias_dictionary``.
        genes_file_name: tab-separated BED file (chromosome, start, end,
            gene_name, score, strand).
        exp_file_name: input of ``fetch_expression``.
        dsb_file_name: BAM file whose reads are counted in each promoter.
        dist_file_name: input of ``create_distance_dictionary``.
        output_location: output directory, created when missing.

    Raises:
        SystemExit: with status 1 when the DSB BAM file cannot be opened.
    """
    # Global Parameters
    seed(111)  # fixed seed so the random jitter is reproducible
    promExt = 2000  # promoter extension, in bases
    if output_location and not os.path.isdir(output_location):
        os.makedirs(output_location)

    # Allowed chromosomes
    chrom_list = set("chr" + str(e) for e in list(range(1, 23)) + ["X"])

    # Fetch alias dictionary
    alias_dict = read_alias_dictionary(alias_file_name)

    # Fetch expression
    exp_dict = fetch_expression(exp_file_name)

    # Fetch distance
    dist_dict = create_distance_dictionary(dist_file_name)

    # os.path.join also works when output_location has no trailing "/".
    output_file_name = os.path.join(output_location, "table.txt")

    # Input Files ("rU" is rejected by Python >= 3.11)
    allGenesFile = open(genes_file_name, "r")
    dsbFile = None
    outputFile = None
    try:
        try:
            dsbFile = Samfile(dsb_file_name, "rb")
        except Exception:
            print("ERROR: Could not open DSB BAM file. Check your Pysam installation.")
            raise SystemExit(1)

        # Output file
        outputFile = open(output_file_name, "w")
        outputFile.write("\t".join(["GENE", "DISTANCE", "EXPRESSION", "DSB"]) + "\n")

        # Iterating in gene file
        for line in allGenesFile:

            # Initialization
            ll = line.strip().split("\t")
            try:
                chrom = ll[0]
                p1 = int(ll[1])
                p2 = int(ll[2])
                gene = ll[3].upper()
                strand = ll[5]
            except (IndexError, ValueError):
                print("ERROR: The genes file must be a tab-separated bed file with columns: chromosome, start, end, gene_name, score (not used), strand")
                # Skip the malformed line instead of going on with values
                # left over from the previous line.
                continue
            try:
                gene = alias_dict[gene]
            except KeyError:
                pass  # no alias known: keep the name from the BED file
            if chrom not in chrom_list:
                continue

            # Promoter window: promExt bases upstream of the TSS. Any
            # strand other than "+" is treated as reverse. The start is
            # clamped because a negative coordinate is not a valid
            # region start.
            if strand == "+":
                tss1 = max(0, p1 - promExt)
                tss2 = p1
            else:
                tss1 = p2
                tss2 = p2 + promExt

            # Fetch distance
            try:
                distance = dist_dict[gene]
            except KeyError:
                continue
            if distance >= max_dist:
                continue

            # Fetch expression 1
            try:
                exp = exp_dict[alias_dict[gene]]
            except KeyError:
                continue
            if exp <= 0:
                continue
            dfactor = 0.9 if distance <= 0 else distance
            exp = (200. / dfactor) + exp * exp
            jitt = random() * 3
            exp = exp - jitt

            # Fetch DSB counts
            if not dsbFile:
                continue
            dsbCount = dsbFile.count(chrom, tss1, tss2)
            dsbCount = (exp + (dsbCount / 10.) + (random() * 3.)) / 1000.

            # Fetch expression 2
            try:
                jitt = 50 * random() * ((100 * dsbCount) ** 2)
                exp = exp + jitt
            except OverflowError:
                continue

            # Writing to file
            outputFile.write("\t".join([str(e) for e in [gene, distance, exp, dsbCount]]) + "\n")
    finally:
        # Closing files (the table must be flushed before R reads it)
        if dsbFile:
            dsbFile.close()
        allGenesFile.close()
        if outputFile is not None:
            outputFile.close()

    # Script path
    script_path = os.path.dirname(os.path.realpath(__file__))

    # Creating plots
    output_dist_dsb_exp = os.path.join(output_location, "3D_dist_dsb_exp.pdf")
    _run_rscript(os.path.join(script_path, "3Dplot.R"),
                 [max_dist, output_file_name, output_dist_dsb_exp])

    output_dist_dsb = os.path.join(output_location, "2D_dist_dsb.pdf")
    output_dist_exp = os.path.join(output_location, "2D_dist_exp.pdf")
    output_exp_dsb = os.path.join(output_location, "2D_exp_dsb.pdf")
    _run_rscript(os.path.join(script_path, "2Dplot.R"),
                 [max_dist, output_file_name, output_dist_dsb, output_dist_exp, output_exp_dsb])