def Main(): ''' IO TEMPLATE ''' global args,out args=ParseArg() fin=IO.fopen(args.input,"r") out=IO.fopen(args.output,"w") ''' END OF IO TEMPLATE ''' print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION, print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )" print >>out,"# Date: ",time.asctime() print >>out,"# The command line is :" print >>out,"#\t"," ".join(sys.argv) dbi=DBI.init(args.db,Tools.guess_format(args.db)) references=dbi.bamfiles[0].references for i in TableIO.parse(fin,args.format): print i n=0 c_count=0 reads=dbi.query(i,args.method) for read in reads: compatible=Tools.compatible_with_transcript(read,i,references=references,strand=args.strand) print "HT:" for i0,r in enumerate(TableIO.parse(read.reads,"bam2bed12",references=references)): print "READ"+str(i0)+"\t",r print "COMPATIBLE:",compatible,"\n\n" if compatible: c_count+=1 n+=1 print "COMPATIBLE / ALL OVERLAP READS = ",c_count,"/",n print "RATIO\t%.4f"%float(c_count)/n
def BamToBed12Iterator(handle, **kwargs):
    '''
    Turn BAM reads into Bed12 records, one record per usable read.

    handle is either a BAM file name, which is opened here with
    pysam.Samfile and closed again once iteration ends, or any iterable of
    reads (in which case pass ``references`` unless the iterable itself has a
    ``references`` attribute).

    Keyword arguments:
      references -- a chromosome name (str) used for every read, or a
                    sequence indexed by ``read.tid``.  Without it,
                    ``handle.references`` is tried and "chr" is the fallback.
      strand     -- "read1" or "firstMate" (also the default when omitted):
                    the strand of a read flagged as read2 is reversed.  Any
                    other value: the strand of a read flagged as read1 is
                    reversed instead.

    Reads with ``tid < 0`` or without a CIGAR are skipped.  The thick
    start/end of every record are both set to the read start.
    '''
    opened_here = isinstance(handle, str)
    if opened_here:
        handle = pysam.Samfile(handle, "rb")
    has_references = "references" in kwargs
    references = kwargs.get("references")
    # Loop-invariant: which mate defines the reported strand.
    read1 = kwargs.get("strand", "read1") in ("read1", "firstMate")
    try:
        for read in handle:
            if read.tid < 0:
                continue
            cigar = read.cigar
            if cigar is None:
                continue  # nothing to build blocks from
            if has_references:
                if isinstance(references, str):
                    chrom = references
                else:
                    chrom = references[read.tid]
            else:
                try:
                    chrom = handle.references[read.tid]
                except Exception:
                    # Best effort: a bare read iterator carries no name table.
                    chrom = "chr"
            strand = "-" if read.is_reverse else "+"
            if (read.is_read1 and not read1) or (read.is_read2 and read1):
                strand = Tools.reverse_strand(strand)
            start = read.pos
            (block_starts, block_sizes) = Tools.cigar_to_coordinates(cigar)
            yield Bed12([chrom, start, read.aend, read.qname, read.mapq,
                         strand, start, start, "0,0,0", len(block_sizes),
                         block_sizes, block_starts])
    finally:
        # Only close what was opened in this function; a caller-supplied
        # handle stays under the caller's control.
        if opened_here:
            handle.close()
def BamToBed12Iterator(handle, **kwargs):
    '''
    Turn BAM reads into Bed12 records, one record per usable read.

    handle is either a BAM file name, which is opened here with
    pysam.Samfile and closed again once iteration ends, or any iterable of
    reads (in which case pass ``references`` unless the iterable itself has a
    ``references`` attribute).

    Keyword arguments:
      references -- a sequence indexed by ``read.tid``; a plain chromosome
                    name (str) is also accepted and used for every read
                    rather than being indexed character by character.
                    Without it, ``handle.references`` is tried and "chr" is
                    the fallback.
      strand     -- "read1" or "firstMate" (also the default when omitted):
                    the strand of a read flagged as read2 is reversed.  Any
                    other value: the strand of a read flagged as read1 is
                    reversed instead.

    Reads with ``tid < 0`` or without a CIGAR are skipped.  The thick
    start/end of every record are both set to the read start.
    '''
    opened_here = isinstance(handle, str)
    if opened_here:
        handle = pysam.Samfile(handle, "rb")
    has_references = "references" in kwargs
    references = kwargs.get("references")
    # Loop-invariant: which mate defines the reported strand.
    read1 = kwargs.get("strand", "read1") in ("read1", "firstMate")
    try:
        for read in handle:
            if read.tid < 0:
                continue
            cigar = read.cigar
            if cigar is None:
                continue  # nothing to build blocks from
            if has_references:
                if isinstance(references, str):
                    chrom = references
                else:
                    chrom = references[read.tid]
            else:
                try:
                    chrom = handle.references[read.tid]
                except Exception:
                    # Best effort: a bare read iterator carries no name table.
                    chrom = "chr"
            strand = "-" if read.is_reverse else "+"
            if (read.is_read1 and not read1) or (read.is_read2 and read1):
                strand = Tools.reverse_strand(strand)
            start = read.pos
            (block_starts, block_sizes) = Tools.cigar_to_coordinates(cigar)
            yield Bed12([
                chrom, start, read.aend, read.qname, read.mapq, strand,
                start, start, "0,0,0", len(block_sizes), block_sizes,
                block_starts
            ])
    finally:
        # Only close what was opened in this function; a caller-supplied
        # handle stays under the caller's control.
        if opened_here:
            handle.close()
def fopen(file,mode="r",**kwargs):
    '''
    Open file and return the resulting handle.

    The format is guessed with Tools.guess_format(file).  In mode "r" a file
    guessed as "bam" is returned as pysam.Samfile(file,"rb"); anything else
    goes through open_input(file).  Mode "w" goes through open_output(file).
    Every other mode returns None.  Extra keyword arguments are accepted and
    ignored.
    '''
    looks_like_bam = Tools.guess_format(file)=="bam"
    if mode=="r":
        if looks_like_bam:
            return pysam.Samfile(file,"rb")
        return open_input(file)
    if mode=="w":
        return open_output(file)
    return None
def fopen(file, mode="r", **kwargs):
    '''
    Open file and return the resulting handle.

    The format is guessed with Tools.guess_format(file).  In mode "r" a file
    guessed as "bam" is returned as pysam.Samfile(file, "rb"); anything else
    goes through open_input(file).  Mode "w" goes through open_output(file).
    Every other mode returns None.  Extra keyword arguments are accepted and
    ignored.
    '''
    looks_like_bam = Tools.guess_format(file) == "bam"
    if mode == "r":
        if looks_like_bam:
            return pysam.Samfile(file, "rb")
        return open_input(file)
    if mode == "w":
        return open_output(file)
    return None
def Main(): ''' IO TEMPLATE ''' global args, out args = ParseArg() fin = IO.fopen(args.input, "r") out = IO.fopen(args.output, "w") ''' END OF IO TEMPLATE ''' print >> out, "# This data was generated by program ", sys.argv[ 0], " (version: %s)" % VERSION, print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )" print >> out, "# Date: ", time.asctime() print >> out, "# The command line is :" print >> out, "#\t", " ".join(sys.argv) dbi = DBI.init(args.db, Tools.guess_format(args.db)) references = dbi.bamfiles[0].references for i in TableIO.parse(fin, args.format): print i n = 0 c_count = 0 reads = dbi.query(i, args.method) for read in reads: compatible = Tools.compatible_with_transcript( read, i, references=references, strand=args.strand) print "HT:" for i0, r in enumerate( TableIO.parse(read.reads, "bam2bed12", references=references)): print "READ" + str(i0) + "\t", r print "COMPATIBLE:", compatible, "\n\n" if compatible: c_count += 1 n += 1 print "COMPATIBLE / ALL OVERLAP READS = ", c_count, "/", n print "RATIO\t%.4f" % float(c_count) / n
def Main():
    '''
    Test driver for TableIO.parse(file.bam,"bam2bed"): parse the input and
    print every parsed record to the output, one per line.

    Sets the module globals ``args`` and ``out``.  When the requested format
    is "guess" it is replaced by Tools.guess_format(args.input).
    '''
    global args,out
    args=ParseArg()
    # The handle is opened as in the shared IO template, but the parser
    # below is given the input path rather than this handle.
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    wants_guess = (args.format=="guess")
    if wants_guess:
        args.format=Tools.guess_format(args.input)
    for record in TableIO.parse(args.input,args.format):
        print >>out,record
def Main():
    '''
    Query a database with every entry of the input and write the hits.

    Each query is echoed as a "QR" line.  Array, list and string results are
    written as a single "HT" line; any other result is iterated and written
    one "HT" line per item, followed by an "HN" line with the hit count and,
    when at least one Bed12 hit is compatible with a Bed12 query, a "CP"
    line.  With args.silence set the "HT" detail is suppressed.  For a
    "tabix" database the query is repeated with "chr" removed from the
    chromosome name.  Summary counters are written at the end.

    Sets the module globals ``args`` and ``out``.
    '''
    global args,out
    args=ParseArg()
    query_options={}
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    # Keep only the script's base name; this rewrites sys.argv[0] in place.
    argv=sys.argv
    argv[0]=argv[0].rsplit("/",1)[-1]
    print >>out,"# This data was generated by program ",argv[0],"(version %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :\n#\t"," ".join(argv)
    if args.query_method:
        query_options["method"]=args.query_method
    dbi=DBI.init(args.db,args.dbformat)
    if args.input=="stdin":
        source=sys.stdin
    else:
        source=args.input
    hits=0            # queries with at least one hit
    query=0           # queries processed
    query_length=0    # summed len() of all queries
    hits_number=0     # hits over all queries
    for (index,entry) in enumerate(TableIO.parse(source,args.input_format)):
        if index%10==0:
            print >>sys.stderr,"query ",index," entries\r",
        print >>out,"QR\t",entry
        hit=0
        query+=1
        query_length+=len(entry)
        results=dbi.query(entry,**query_options)
        compatible=0
        if isinstance(results,(numpy.ndarray,list)):
            # Array-like answer: one comma-separated HT line, counted once.
            if not args.silence:
                print >>out,"HT\t",
                for value in results:
                    print >>out,str(value)+",",
                print >>out,""
            hit=1
            hits_number+=1
        elif isinstance(results,str):
            if not args.silence:
                print >>out,"HT\t",
                print >>out,results
            hit=1
            hits_number+=1
        else:
            this_query_hits=0
            for found in results:
                if not args.silence:
                    print >>out,"HT\t",found,
                hit=1
                hits_number+=1
                this_query_hits+=1
                both_bed12=(isinstance(found,xplib.Annotation.Bed12)
                            and isinstance(entry,xplib.Annotation.Bed12))
                if both_bed12:
                    compatible_binary=Tools.compatible_with_transcript(found,entry)
                    if not args.silence:
                        print >>out,"\tCompatible:",compatible_binary
                    if compatible_binary:
                        compatible+=1
                elif not args.silence:
                    # Terminate the HT line started above.
                    print >>out,""
            print >>out,"HN\t",this_query_hits
            if compatible>0:
                print >>out,"CP\t",compatible
        if args.dbformat=="tabix":
            # Second pass with the "chr" text stripped from the name.
            entry.chr=entry.chr.replace("chr","")
            for found in dbi.query(entry,**query_options):
                print >>out,"HT\t",found
                hit=1
                hits_number+=1
        hits+=hit
    print >>out,"# Query Number:",query,"\n# Query Have Hits:",hits
    print >>out,"# Query Length:",query_length
    print >>out,"# Hits Number:",hits_number
def Main():
    '''
    Query a database with every entry of the input and write the hits.

    With args.dbformat == "guess" the format is derived from the database
    name: a "gz" suffix selects "tabix" (and the tabix format is guessed
    with Tools.guess_format), anything else uses Tools.guess_format
    directly.  args.input_format == "guess" is resolved the same way from
    args.input.

    Each query is echoed as a "QR" line.  Array, list and string results are
    written as a single "HT" line; any other result is iterated and written
    one "HT" line per item.  For a "tabix" database the query is repeated
    with "chr" removed from the chromosome name.  Summary counters are
    written at the end.

    Sets the module globals ``args`` and ``out``.
    '''
    global args, out
    args = ParseArg()
    query_options = {}
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout
    # Keep only the script's base name; this rewrites sys.argv[0] in place.
    argv = sys.argv
    argv[0] = argv[0].rsplit("/", 1)[-1]
    print >> out, "# This data was generated by program ", argv[0], "(version %s)" % VERSION,
    print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >> out, "# Date: ", time.asctime()
    print >> out, "# The command line is :\n#\t", " ".join(argv)
    init_options = {}
    if args.dbformat == "guess":
        if Tools.suffix(args.db) == "gz":
            args.dbformat = "tabix"
            args.tabix_format = Tools.guess_format(args.db)
        else:
            args.dbformat = Tools.guess_format(args.db)
    if args.query_method:
        query_options["method"] = args.query_method
    if args.tabix_format:
        init_options["tabix"] = args.tabix_format
    dbi = DBI.init(args.db, args.dbformat, **init_options)
    if args.input == "stdin":
        source = sys.stdin
    else:
        source = args.input
    hits = 0            # queries with at least one hit
    query = 0           # queries processed
    query_length = 0    # summed len() of all queries
    hits_number = 0     # hits over all queries
    if args.input_format == "guess":
        args.input_format = Tools.guess_format(args.input)
    for (index, entry) in enumerate(TableIO.parse(source, args.input_format)):
        if index % 100 == 0:
            print >> sys.stderr, "query ", index, " entries\r",
        print >> out, "QR\t", entry
        hit = 0
        query += 1
        query_length += len(entry)
        results = dbi.query(entry, **query_options)
        if isinstance(results, (numpy.ndarray, list)):
            # Array-like answer: one comma-separated HT line, counted once.
            print >> out, "HT\t",
            for value in results:
                print >> out, str(value) + ",",
            print >> out, ""
            hit = 1
            hits_number += 1
        elif isinstance(results, str):
            print >> out, "HT\t",
            print >> out, results
            hit = 1
            hits_number += 1
        else:
            for found in results:
                print >> out, "HT\t", found
                hit = 1
                hits_number += 1
        if args.dbformat == "tabix":
            # Second pass with the "chr" text stripped from the name.
            entry.chr = entry.chr.replace("chr", "")
            for found in dbi.query(entry, **query_options):
                print >> out, "HT\t", found
                hit = 1
                hits_number += 1
        hits += hit
    print >> out, "# Query Number:", query, "\n# Query Have Hits:", hits
    print >> out, "# Query Length:", query_length
    print >> out, "# Hits Number:", hits_number
def compare_reads(isoforms):
    '''
    Estimate the proportion of reads explained by each isoform of one
    overlapping cluster and write the result to the global ``out``.

    All reads in the region spanned by the isoforms are fetched from the
    global ``dbi`` (method "fetch12").  Every read is reduced to a bit code
    with one bit per isoform (set when Tools.compatible_with_transcript is
    true), and the codes are counted.  The proportions are then fitted with
    an EM iteration starting from a uniform distribution; when args.BYY is
    set the EM phase is cut short after more than 10 iterations and followed
    by a hard-cut phase that gives each code entirely to its currently most
    abundant compatible isoform.

    Isoforms above args.threshold are written as "HT" lines, the others as
    "NT" lines unless args.hits_only is set.  An isoform whose score is 0.0
    gets its score replaced by the estimated proportion when it is written.
    The block is terminated by a "//" line.

    isoforms must be a non-empty sequence of objects with chr/start/stop.
    '''
    # Reads the module globals dbi, out and args.
    max_iterations = 1000   # safety net: neither phase may spin forever
    isoforms_set = list(isoforms)
    chrom = isoforms[0].chr
    min_start = min(iso.start for iso in isoforms_set)
    max_stop = max(iso.stop for iso in isoforms_set)
    transcript_region = Bed([chrom, min_start, max_stop])
    print >>out, "REGION\t", chrom, "\t", min_start, "\t", max_stop
    print >>out, "ISOFORM_INPUT_NUMBER\t", len(isoforms_set)

    # Read every read in this transcript region.
    reads_set = list(dbi.query(transcript_region, method="fetch12"))
    total = len(reads_set)
    if total == 0:
        total = 0.001   # avoids dividing by zero when the region is empty

    # Compare the two sets: one compatibility bit per isoform, in order.
    l = len(isoforms_set)
    bincodes = {}
    for read in reads_set:
        bincode = 0
        for iso in isoforms_set:
            bincode <<= 1
            if Tools.compatible_with_transcript(read, iso):
                bincode += 1
        bincodes[bincode] = bincodes.get(bincode, 0) + 1

    # EM, starting from a uniform distribution.
    proportion = [1.0 / l for _ in range(l)]
    iterate_time = 0
    while iterate_time < max_iterations:
        # E step: split each code's reads among its compatible isoforms.
        totals = [0.0] * l
        for code, count in bincodes.items():
            members = [j for j in range(l) if get_bit_n(j, l, code)]
            row_total = sum([proportion[j] for j in members], 0.0)
            if not row_total:
                # No compatible isoform (or all at zero): nothing to share.
                continue
            for j in members:
                totals[j] += count * proportion[j] / row_total
        # M step.  A NEW list is built on every pass: reusing one list and
        # then binding ``proportion`` to it makes both names refer to the
        # same object, the distance below collapses to 0 and the loop stops
        # after its second pass regardless of convergence.
        new_proportion = [t / total for t in totals]
        dis = distance(proportion, new_proportion)
        proportion = new_proportion
        iterate_time += 1
        if dis < 1e-05:
            break
        if args.BYY and iterate_time > 10:
            break

    # BYY hard-cut algorithm.
    byy_rounds = 0
    while args.BYY and byy_rounds < max_iterations:
        totals = [0.0] * l
        for code, count in bincodes.items():
            maxj = -1
            for j in range(l):
                if get_bit_n(j, l, code):
                    # Ties keep the first (lowest-index) isoform.
                    if maxj == -1 or proportion[j] > proportion[maxj]:
                        maxj = j
            if maxj != -1:
                totals[maxj] += count
        new_proportion = [t / total for t in totals]
        dis = distance(proportion, new_proportion)
        proportion = new_proportion
        byy_rounds += 1
        if dis < 1e-05:
            break

    # Print isoforms.
    for idx, iso in enumerate(isoforms_set):
        is_hit = proportion[idx] > args.threshold
        if not is_hit and args.hits_only:
            continue
        if iso.score == 0.0:
            iso.score = proportion[idx]
        if is_hit:
            print >>out, "HT\t", iso, "\t", proportion[idx]
        else:
            print >>out, "NT\t", iso, "\t", proportion[idx]
    print >>out, "//"
def Main():
    '''
    Annotate every input entry with the gene features it hits.

    The gene table is loaded with DBI.init and, for each gene, its upstream
    and downstream regions, exons, introns and (when present) 3' and 5' UTRs
    are collected into separate databases.  Every input entry is then
    written followed by the IDs (via toIDs) returned by querying the gene,
    upstream, downstream, exon, intron, utr3 and utr5 databases, in that
    order.  A column header matching the type of the first entry is written
    first.

    Sets the module globals ``args`` and ``out``.  An unwritable output
    falls back to stdout, an unreadable input to stdin.
    '''
    global args,out
    # IO TEMPLATE
    args=ParseArg()
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    if args.input=="stdin":
        fin=sys.stdin
    else:
        try:
            # A final ".gz" component selects the gzip reader.
            if args.input.split(".")[-1]=="gz":
                opener=gzip.open
            else:
                opener=open
            fin=opener(args.input,"r")
        except IOError:
            print >>sys.stderr,"can't read file",args.input
            fin=sys.stdin
    # END OF IO TEMPLATE
    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)

    gene=DBI.init(args.genetab,args.gene_format)
    upstream_list,downstream_list=[],[]
    exons_list,introns_list=[],[]
    utr3_list,utr5_list=[],[]
    for g in gene:
        upstream_list.append(g.upstream(args.upstream))
        downstream_list.append(g.downstream(args.downstream))
        for exon in g.Exons():
            exons_list.append(exon)
        for intron in g.Introns():
            introns_list.append(intron)
        if g.utr3() is not None:
            utr3_list.append(g.utr3())
        if g.utr5() is not None:
            utr5_list.append(g.utr5())
    upstream=DBI.init(upstream_list,"bed")
    downstream=DBI.init(downstream_list,"bed")
    exons=DBI.init(exons_list,"bed")
    introns=DBI.init(introns_list,"bed")
    utr3=DBI.init(utr3_list,"genebed")
    utr5=DBI.init(utr5_list,"genebed")

    if args.format=="guess":
        args.format=Tools.guess_format(args.input)
    # Queried in column order; the last one ends the output line.
    leading_tracks=(gene,upstream,downstream,exons,introns,utr3)
    for (row,entry) in enumerate(TableIO.parse(fin,args.format)):
        if row==0:
            # Header columns depend on the type of the first record.
            if isinstance(entry,Bed12):
                header="#chr\tstart\tend\tname\tscore\tstrand\tthick_start\tthick_end\titem_rgb\tblock_count\tblock_sizes\tblock_starts\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"
            elif isinstance(entry,GeneBed):
                header="#name\tchr\tstrand\tstart\tend\tcds_start\texon_count\texon_starts\texont_ends\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"
            else:
                header="#chr\tstart\tend\tname\tscore\tstrand\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"
            print >>out,header
        print >>out,entry,
        for track in leading_tracks:
            print >>out,"\t",toIDs(track.query(entry)),
        print >>out,"\t",toIDs(utr5.query(entry))
def Main():
    '''
    Query a database with every entry of the input and write the hits.

    Each query is echoed as a "QR" line.  Array, list and string results are
    written as a single "HT" line; any other result is iterated and written
    one "HT" line per item, followed by an "HN" line with the hit count and,
    when at least one Bed12 hit is compatible with a Bed12 query, a "CP"
    line.  With args.silence set the "HT" detail is suppressed.  For a
    "tabix" database the query is repeated with "chr" removed from the
    chromosome name.  Summary counters are written at the end.

    Sets the module globals ``args`` and ``out``.
    '''
    global args, out
    args = ParseArg()
    query_options = {}
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout
    # Keep only the script's base name; this rewrites sys.argv[0] in place.
    argv = sys.argv
    argv[0] = argv[0].rsplit("/", 1)[-1]
    print >> out, "# This data was generated by program ", argv[0], "(version %s)" % VERSION,
    print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >> out, "# Date: ", time.asctime()
    print >> out, "# The command line is :\n#\t", " ".join(argv)
    if args.query_method:
        query_options["method"] = args.query_method
    dbi = DBI.init(args.db, args.dbformat)
    if args.input == "stdin":
        source = sys.stdin
    else:
        source = args.input
    hits = 0            # queries with at least one hit
    query = 0           # queries processed
    query_length = 0    # summed len() of all queries
    hits_number = 0     # hits over all queries
    for (index, entry) in enumerate(TableIO.parse(source, args.input_format)):
        if index % 10 == 0:
            print >> sys.stderr, "query ", index, " entries\r",
        print >> out, "QR\t", entry
        hit = 0
        query += 1
        query_length += len(entry)
        results = dbi.query(entry, **query_options)
        compatible = 0
        if isinstance(results, (numpy.ndarray, list)):
            # Array-like answer: one comma-separated HT line, counted once.
            if not args.silence:
                print >> out, "HT\t",
                for value in results:
                    print >> out, str(value) + ",",
                print >> out, ""
            hit = 1
            hits_number += 1
        elif isinstance(results, str):
            if not args.silence:
                print >> out, "HT\t",
                print >> out, results
            hit = 1
            hits_number += 1
        else:
            this_query_hits = 0
            for found in results:
                if not args.silence:
                    print >> out, "HT\t", found,
                hit = 1
                hits_number += 1
                this_query_hits += 1
                both_bed12 = (isinstance(found, xplib.Annotation.Bed12)
                              and isinstance(entry, xplib.Annotation.Bed12))
                if both_bed12:
                    compatible_binary = Tools.compatible_with_transcript(found, entry)
                    if not args.silence:
                        print >> out, "\tCompatible:", compatible_binary
                    if compatible_binary:
                        compatible += 1
                elif not args.silence:
                    # Terminate the HT line started above.
                    print >> out, ""
            print >> out, "HN\t", this_query_hits
            if compatible > 0:
                print >> out, "CP\t", compatible
        if args.dbformat == "tabix":
            # Second pass with the "chr" text stripped from the name.
            entry.chr = entry.chr.replace("chr", "")
            for found in dbi.query(entry, **query_options):
                print >> out, "HT\t", found
                hit = 1
                hits_number += 1
        hits += hit
    print >> out, "# Query Number:", query, "\n# Query Have Hits:", hits
    print >> out, "# Query Length:", query_length
    print >> out, "# Hits Number:", hits_number
def Main():
    """
    Annotate every input entry with the gene features it hits.

    The gene table is loaded with DBI.init and, for each gene, its upstream
    and downstream regions, exons, introns and (when present) 3' and 5' UTRs
    are collected into separate databases.  Every input entry is then
    written followed by the IDs (via toIDs) returned by querying the gene,
    upstream, downstream, exon, intron, utr3 and utr5 databases, in that
    order.  A column header matching the type of the first entry is written
    first.

    Sets the module globals ``args`` and ``out``.  An unwritable output
    falls back to stdout, an unreadable input to stdin.
    """
    global args, out
    # IO TEMPLATE
    args = ParseArg()
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >>sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout
    if args.input == "stdin":
        fin = sys.stdin
    else:
        try:
            # A final ".gz" component selects the gzip reader.
            if args.input.split(".")[-1] == "gz":
                opener = gzip.open
            else:
                opener = open
            fin = opener(args.input, "r")
        except IOError:
            print >>sys.stderr, "can't read file", args.input
            fin = sys.stdin
    # END OF IO TEMPLATE
    print >> out, "# This data was generated by program ", sys.argv[0], " (version: %s)" % VERSION,
    print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >> out, "# Date: ", time.asctime()
    print >> out, "# The command line is :"
    print >> out, "#\t", " ".join(sys.argv)

    gene = DBI.init(args.genetab, args.gene_format)
    upstream_list, downstream_list = [], []
    exons_list, introns_list = [], []
    utr3_list, utr5_list = [], []
    for g in gene:
        upstream_list.append(g.upstream(args.upstream))
        downstream_list.append(g.downstream(args.downstream))
        for exon in g.Exons():
            exons_list.append(exon)
        for intron in g.Introns():
            introns_list.append(intron)
        if g.utr3() is not None:
            utr3_list.append(g.utr3())
        if g.utr5() is not None:
            utr5_list.append(g.utr5())
    upstream = DBI.init(upstream_list, "bed")
    downstream = DBI.init(downstream_list, "bed")
    exons = DBI.init(exons_list, "bed")
    introns = DBI.init(introns_list, "bed")
    utr3 = DBI.init(utr3_list, "genebed")
    utr5 = DBI.init(utr5_list, "genebed")

    if args.format == "guess":
        args.format = Tools.guess_format(args.input)
    # Queried in column order; the last one ends the output line.
    leading_tracks = (gene, upstream, downstream, exons, introns, utr3)
    for (row, entry) in enumerate(TableIO.parse(fin, args.format)):
        if row == 0:
            # Header columns depend on the type of the first record.
            if isinstance(entry, Bed12):
                header = "#chr\tstart\tend\tname\tscore\tstrand\tthick_start\tthick_end\titem_rgb\tblock_count\tblock_sizes\tblock_starts\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"
            elif isinstance(entry, GeneBed):
                header = "#name\tchr\tstrand\tstart\tend\tcds_start\texon_count\texon_starts\texont_ends\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"
            else:
                header = "#chr\tstart\tend\tname\tscore\tstrand\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5"
            print >> out, header
        print >> out, entry,
        for track in leading_tracks:
            print >> out, "\t", toIDs(track.query(entry)),
        print >> out, "\t", toIDs(utr5.query(entry))
def query(self, x, method='pileup'):
    '''
    Generator over what the BAM files in ``self.bamfiles`` hold for the
    region ``x`` (an object with ``chr``, ``start`` and ``stop``).

    method selects what is yielded:
      'fetch'      -- one Bed per read overlapping the region.
      'fetch12'    -- one Bed12 per read, with blocks taken from
                      Tools.cigar_to_coordinates(read.cigar) (test version).
      'paired_end' -- the fragments produced by the "bam2fragment" parser.
      'pileup'     -- one [n0, n1, n2, n3] count list per position of the
                      region, summed over all BAM files; the slot of a
                      nucleotide is given by BamI.hNtToNum.

    For 'fetch' and 'fetch12', reads with ``tid < 0`` or mapping quality 0
    are skipped.  If a pileup cannot be started, a message is written to
    stderr and the generator ends without yielding anything.  Any other
    method yields nothing.
    '''
    if method == 'fetch':
        for bamfile in self.bamfiles:
            for read in bamfile.fetch(x.chr, x.start, x.stop):
                if read.tid < 0 or read.mapq == 0:
                    continue
                strand = '-' if read.is_reverse else '+'
                yield Bed([bamfile.references[read.tid], read.pos, read.aend,
                           read.qname, read.mapq, strand])
    elif method == 'fetch12':
        # test version: still testing Tools.cigar_to_coordinates
        for bamfile in self.bamfiles:
            for read in bamfile.fetch(x.chr, x.start, x.stop):
                if read.tid < 0 or read.mapq == 0:
                    continue
                strand = '-' if read.is_reverse else '+'
                read_start = read.pos
                (block_starts, block_sizes) = Tools.cigar_to_coordinates(read.cigar)
                # thick start/end are both set to the read start
                yield Bed12([bamfile.references[read.tid], read_start,
                             read.aend, read.qname, read.mapq, strand,
                             read_start, read_start, "0,0,0",
                             len(block_sizes), block_sizes, block_starts])
    elif method == "paired_end":
        for bamfile in self.bamfiles:
            for fragment in TableIO.parse(bamfile.fetch(x.chr, x.start, x.stop),
                                          "bam2fragment", bam=bamfile):
                yield fragment
    elif method == 'pileup':
        length = x.stop - x.start
        s = [[0, 0, 0, 0] for _ in range(length)]
        for bamfile in self.bamfiles:
            try:
                columns = bamfile.pileup(x.chr, x.start, x.stop)
            except Exception:
                sys.stderr.write("Can't pile up %s %s %s\n" % (x.chr, x.start, x.stop))
                # A plain return ends the generator; raising StopIteration
                # inside a generator body is turned into RuntimeError by
                # PEP 479.
                return
            for pileupcolumn in columns:
                j = pileupcolumn.pos - x.start
                if j < 0:
                    continue
                if j >= length:
                    # j == length is already one past the last row of s.
                    break
                for pileupread in pileupcolumn.pileups:
                    if pileupread.is_del or pileupread.indel != 0:
                        continue
                    try:
                        nt = pileupread.alignment.seq[pileupread.qpos]
                    except (AttributeError, IndexError, TypeError):
                        # Best effort: a read without a usable base here is
                        # simply not counted.
                        continue
                    if nt in BamI.hNtToNum:
                        s[j][BamI.hNtToNum[nt]] += 1
        for row in s:
            yield row
def Main():
    '''
    Read isoforms in BED format, group them into clusters of overlapping
    entries and hand each cluster to compare().

    The entries are sorted first.  A cluster grows while the next entry
    overlaps the extent covered so far (per Tools.overlap); the first entry
    that does not overlap closes the cluster and starts a new one.

    Sets the module globals ``args``, ``out`` and ``dbi``.  An unwritable
    output falls back to stdout, an unreadable input to stdin.  Exits with
    status 1 when the input holds no entry.
    '''
    global args,out,isoforms_set,selected_isoforms_set,reads_set,selected_reads_set,dbi
    import copy
    # IO TEMPLATE
    args=ParseArg()
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    if args.input=="stdin":
        fin=sys.stdin
    else:
        try:
            x=args.input.split(".")
            if x[-1]=="gz":
                fin=gzip.open(args.input,"r")
            else:
                fin=open(args.input,"r")
        except IOError:
            print >>sys.stderr,"can't read file",args.input
            fin=sys.stdin
    # END OF IO TEMPLATE
    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)
    dbi=DBI.init(args.bam,args.format)

    # Read all the isoforms.
    beds=sorted(TableIO.parse(fin,"bed"))
    if len(beds)==0:
        print >>sys.stderr,"error in reading file",args.input
        sys.exit(1)

    isoforms=[]
    # The running extent of the current cluster is tracked on a COPY.
    # Extending .stop on the entry itself would silently change the end
    # coordinate of the first isoform of every cluster, because that same
    # object is also stored in ``isoforms`` and passed on to compare().
    extent=copy.copy(beds[0])
    j=0
    for i in beds:
        j+=1
        if (j%10==0):
            print >>sys.stderr,"processed %d entries\r"%j,
        if Tools.overlap(extent,i):
            if extent.stop < i.stop:
                extent.stop=i.stop
            isoforms.append(i)
        else:
            compare(isoforms)
            isoforms=[i]
            extent=copy.copy(i)
    if len(isoforms)>0:
        compare(isoforms)
def Main():
    '''
    Query a database with every entry of the input and write the hits.

    With args.dbformat == "guess" the format is derived from the database
    name: a "gz" suffix selects "tabix" (and the tabix format is guessed
    with Tools.guess_format), anything else uses Tools.guess_format
    directly.  args.input_format == "guess" is resolved the same way from
    args.input.

    Each query is echoed as a "QR" line.  Array, list and string results are
    written as a single "HT" line; any other result is iterated and written
    one "HT" line per item.  For a "tabix" database the query is repeated
    with "chr" removed from the chromosome name.  Summary counters are
    written at the end.

    Sets the module globals ``args`` and ``out``.
    '''
    global args,out
    args=ParseArg()
    query_options={}
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    # Keep only the script's base name; this rewrites sys.argv[0] in place.
    argv=sys.argv
    argv[0]=argv[0].rsplit("/",1)[-1]
    print >>out,"# This data was generated by program ",argv[0],"(version %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :\n#\t"," ".join(argv)
    init_options={}
    if args.dbformat=="guess":
        if Tools.suffix(args.db)=="gz":
            args.dbformat="tabix"
            args.tabix_format=Tools.guess_format(args.db)
        else:
            args.dbformat=Tools.guess_format(args.db)
    if args.query_method:
        query_options["method"]=args.query_method
    if args.tabix_format:
        init_options["tabix"]=args.tabix_format
    dbi=DBI.init(args.db,args.dbformat,**init_options)
    if args.input=="stdin":
        source=sys.stdin
    else:
        source=args.input
    hits=0            # queries with at least one hit
    query=0           # queries processed
    query_length=0    # summed len() of all queries
    hits_number=0     # hits over all queries
    if args.input_format=="guess":
        args.input_format=Tools.guess_format(args.input)
    for (index,entry) in enumerate(TableIO.parse(source,args.input_format)):
        if index%100==0:
            print >>sys.stderr,"query ",index," entries\r",
        print >>out,"QR\t",entry
        hit=0
        query+=1
        query_length+=len(entry)
        results=dbi.query(entry,**query_options)
        if isinstance(results,(numpy.ndarray,list)):
            # Array-like answer: one comma-separated HT line, counted once.
            print >>out,"HT\t",
            for value in results:
                print >>out,str(value)+",",
            print >>out,""
            hit=1
            hits_number+=1
        elif isinstance(results,str):
            print >>out,"HT\t",
            print >>out,results
            hit=1
            hits_number+=1
        else:
            for found in results:
                print >>out,"HT\t",found
                hit=1
                hits_number+=1
        if args.dbformat=="tabix":
            # Second pass with the "chr" text stripped from the name.
            entry.chr=entry.chr.replace("chr","")
            for found in dbi.query(entry,**query_options):
                print >>out,"HT\t",found
                hit=1
                hits_number+=1
        hits+=hit
    print >>out,"# Query Number:",query,"\n# Query Have Hits:",hits
    print >>out,"# Query Length:",query_length
    print >>out,"# Hits Number:",hits_number
def query(self, x=None, method="pileup", **dict):
    """
    Generator over what the BAM files in ``self.bamfiles`` hold for a region.

    x selects the region and may be:
      * a string "chrom" or "chrom:first-last"; ``first`` is lowered by one
        to give the start, ``last`` is used as the end;
      * a tuple, read through the CHROM / CHROMSTART / CHROMEND indices;
      * any other object, read through its chr / start / stop attributes;
      * None, leaving chromosome, start and end as None.

    method selects what is yielded:
      "fetch"          -- one Bed per read.
      "fetch12"        -- one Bed12 per read, blocks from
                          Tools.cigar_to_coordinates (test version).
      "bam1"           -- Bed12 records from the "bam2bed12" parser.
      "paired_end"     -- fragments from the "bam2fragment" parser.
      "bam2"           -- fragment.toBed12(...) for every fragment.
      "bam1tuple"      -- records from the "bam2bed12tuple" parser.
      "bam2tuple"      -- fragment.toBed12Tuple(...) for every fragment.
      "bam2tuple_fast" -- as "bam2tuple", without passing the BAM file to
                          the fragment parser.
      "pileup"         -- one [n0, n1, n2, n3] count list per position,
                          summed over all BAM files (slots per
                          BamI.hNtToNum).
      "count"          -- a single number: the summed bamfile.count().
      "count_fragment" -- a single number: the number of fragments.
      "references"     -- the reference names of the first BAM file.
      "lengths"        -- the reference lengths of the first BAM file.

    The only extra keyword read here is ``strand``; it is forwarded by the
    bam1/bam2 family ("bam1" and "bam1tuple" default it to "read1").

    For "fetch" and "fetch12", reads with ``tid < 0`` or mapping quality 0
    are skipped.  If a pileup cannot be started, a message is written to
    stderr and the generator ends without yielding anything.
    """
    # NOTE: the ** parameter is named ``dict`` and therefore hides the
    # builtin inside this function; it is aliased for readability.
    options = dict
    # Defined up front so that region-free methods (and x=None) never hit an
    # unbound name.
    chrom = start = end = None
    if isinstance(x, str):
        parts = x.split(":")
        chrom = parts[0]
        if len(parts) > 1:
            bounds = parts[1].split("-")
            if len(bounds) == 2:
                start = int(bounds[0]) - 1
                end = int(bounds[1])
    elif isinstance(x, tuple):
        chrom = x[CHROM]
        start = x[CHROMSTART]
        end = x[CHROMEND]
    elif x is not None:
        chrom = x.chr
        start = x.start
        end = x.stop

    # Forwarded to toBed12 / toBed12Tuple only when the caller supplied it.
    strand_option = {}
    if "strand" in options:
        strand_option["strand"] = options["strand"]

    if method == "fetch":
        for bamfile in self.bamfiles:
            for read in bamfile.fetch(chrom, start, end):
                if read.tid < 0 or read.mapq == 0:
                    continue
                strand = "-" if read.is_reverse else "+"
                yield Bed([bamfile.references[read.tid], read.pos, read.aend,
                           read.qname, read.mapq, strand])
    elif method == "fetch12":
        # test version: still testing Tools.cigar_to_coordinates
        for bamfile in self.bamfiles:
            for read in bamfile.fetch(chrom, start, end):
                if read.tid < 0 or read.mapq == 0:
                    continue
                strand = "-" if read.is_reverse else "+"
                # Read coordinates get their own names: storing them in
                # start/end would change the range fetched from every
                # following BAM file.
                read_start = read.pos
                (block_starts, block_sizes) = Tools.cigar_to_coordinates(read.cigar)
                # thick start/end are both set to the read start
                yield Bed12([bamfile.references[read.tid], read_start,
                             read.aend, read.qname, read.mapq, strand,
                             read_start, read_start, "0,0,0",
                             len(block_sizes), block_sizes, block_starts])
    elif method == "bam1":
        # fetch read from paired end with strand information
        # TODO: if bamfiles have different read1 or read2 ?
        strand = options.get("strand", "read1")
        for bamfile in self.bamfiles:
            for bed in TableIO.parse(bamfile.fetch(chrom, start, end),
                                     "bam2bed12", references=chrom, strand=strand):
                yield bed
    elif method == "paired_end":
        for bamfile in self.bamfiles:
            for fragment in TableIO.parse(bamfile.fetch(chrom, start, end),
                                          "bam2fragment", bam=bamfile):
                yield fragment
    elif method == "bam2":
        # yield bed12
        for bamfile in self.bamfiles:
            for fragment in TableIO.parse(bamfile.fetch(chrom, start, end),
                                          "bam2fragment", bam=bamfile):
                yield fragment.toBed12(chr=chrom, **strand_option)
    elif method == "bam1tuple":
        # TODO: if bamfiles have different read1 or read2 ?
        strand = options.get("strand", "read1")
        for bamfile in self.bamfiles:
            for bed in TableIO.parse(bamfile.fetch(chrom, start, end),
                                     "bam2bed12tuple", references=chrom, strand=strand):
                yield bed
    elif method == "bam2tuple":
        for bamfile in self.bamfiles:
            for fragment in TableIO.parse(bamfile.fetch(chrom, start, end),
                                          "bam2fragment", bam=bamfile):
                yield fragment.toBed12Tuple(chr=chrom, **strand_option)
    elif method == "bam2tuple_fast":
        for bamfile in self.bamfiles:
            for fragment in TableIO.parse(bamfile.fetch(chrom, start, end),
                                          "bam2fragment"):
                yield fragment.toBed12Tuple(chr=chrom, **strand_option)
    elif method == "pileup":
        length = end - start
        s = [[0, 0, 0, 0] for _ in range(length)]
        for bamfile in self.bamfiles:
            try:
                columns = bamfile.pileup(chrom, start, end)
            except Exception:
                sys.stderr.write("Can't pile up %s %s %s\n" % (chrom, start, end))
                # A plain return ends the generator; raising StopIteration
                # inside a generator body is turned into RuntimeError by
                # PEP 479.
                return
            for pileupcolumn in columns:
                j = pileupcolumn.pos - start
                if j < 0:
                    continue
                if j >= length:
                    # j == length is already one past the last row of s.
                    break
                for pileupread in pileupcolumn.pileups:
                    if pileupread.is_del or pileupread.indel != 0:
                        continue
                    try:
                        nt = pileupread.alignment.seq[pileupread.qpos]
                    except (AttributeError, IndexError, TypeError):
                        # Best effort: a read without a usable base here is
                        # simply not counted.
                        continue
                    if nt in BamI.hNtToNum:
                        s[j][BamI.hNtToNum[nt]] += 1
        for row in s:
            yield row
    elif method == "count":
        s = 0
        for bamfile in self.bamfiles:
            s += bamfile.count(chrom, start, end)
        yield s
    elif method == "count_fragment":
        s = 0
        for bamfile in self.bamfiles:
            for fragment in TableIO.parse(bamfile.fetch(chrom, start, end),
                                          "bam2fragment", bam=bamfile):
                s += 1
        yield s
    elif method == "references":
        for name in self.bamfiles[0].references:
            yield name
    elif method == "lengths":
        for size in self.bamfiles[0].lengths:
            yield size