def main(): ''' main scripts ''' args = parse_argument() PromoterList = [] feature = [] print "# loading and reading the promoter of Gene ... " if args.genefile: for g in TableIO.parse(args.genefile, "genebed"): a = g.promoter(args.bp) PromoterList.append(a) if args.rna: for g in TableIO.parse(args.rna, "genebed"): a = g.promoter(args.bp) PromoterList.append(a) if args.feature: feature = TableIO.parse(args.feature, 'bed') print "# loading and reading Done !" PromoterData = readIntoBinIndex(PromoterList) FeatureData = readIntoBinIndex(feature) for i in TableIO.parse(args.bed, 'bed'): overlapGene, overlapFeature, overlap_string = [], [], '' if i.strand not in ['+', '-']: continue else: OverlapGene = getOverlapFeatures(i, PromoterData) if FeatureData: overlapFeature = getOverlapFeatures(i, FeatureData) for g in overlapGene + overlapFeature: overlap_string += g.id + ';' print i, "\t", overlap_string
def __init__(self, tabix_file_name,**dict):
    '''
    wrapped in DBI.init(filename,"tabix")

    Open tabix_file_name with pysam.Tabixfile and remember the options.

    Options read from the keyword arguments:
      header -- True: use the first record of tabix_file_name (fields
                stripped) as the header; a list: use it as is; a str:
                use the first record of that file.
      tabix  -- value stored in self.tabix_format (default "simple").
    '''
    self.tabix_file_name = tabix_file_name
    self.dict = dict
    try:
        self.data = pysam.Tabixfile(tabix_file_name)
    except Exception:
        # Still best-effort (warn and carry on), but keep self.data
        # defined and no longer swallow KeyboardInterrupt/SystemExit.
        print >>sys.stderr,"WARNING: Can't init the tabix file",tabix_file_name
        self.data = None
    self.header = None
    header = dict.get("header")
    if header == True:
        f = TableIO.parse(tabix_file_name)
        h = f.next()
        # Strip in place so self.header keeps the parser's own row type.
        for idx, field in enumerate(h):
            h[idx] = field.strip()
        self.header = h
        f.close()
    elif isinstance(header, list):
        self.header = header
    elif isinstance(header, str):
        fh = TableIO.parse(header)
        self.header = fh.next()
    self.tabix_format = self.dict.get("tabix", "simple")
def Main():
    '''
    Count annotation records per fixed-width bin and write one row per bin.

    Bins are 1 << SHIFTSIZE wide; each row is
    chrom, start, stop, count separated by tabs, following the order of
    args.chr_length_file.
    '''
    global args,out,SHIFTSIZE,data
    args=ParseArg()
    out=sys.stdout
    if args.output!="stdout":
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    SHIFTSIZE=args.shiftsize
    data={}
    # one zeroed counter per bin of every listed chromosome
    for record in TableIO.parse(args.chr_length_file,"simple"):
        n_bins=long(record[1]>>SHIFTSIZE)+1
        data[record[0].strip()]=[0]*n_bins
    parseAnnotationFile(args.input)
    for record in TableIO.parse(args.chr_length_file,"simple"):
        chrom=record[0].strip()
        length=long(record[1])
        for index,count in enumerate(data[chrom]):
            start=index<<SHIFTSIZE
            # the last bin is cut at the chromosome length
            stop=min((index+1)<<SHIFTSIZE,length)
            print >>out,"\t".join([chrom,str(start),str(stop),str(count)])
def Main():
    """
    Tally annotation records into bins of 1 << SHIFTSIZE bases and print
    chrom / start / stop / count (tab separated) for every bin.
    """
    global args, out, SHIFTSIZE, data
    args = ParseArg()
    out = sys.stdout
    if args.output != "stdout":
        try:
            out = open(args.output, "w")
        except IOError:
            print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout
    SHIFTSIZE = args.shiftsize
    data = {}
    # Allocate the counters first; parseAnnotationFile fills them in.
    for entry in TableIO.parse(args.chr_length_file, "simple"):
        bin_total = long(entry[1] >> SHIFTSIZE) + 1
        data[entry[0].strip()] = [0] * bin_total
    parseAnnotationFile(args.input)
    for entry in TableIO.parse(args.chr_length_file, "simple"):
        chrom = entry[0].strip()
        length = long(entry[1])
        for idx, hits in enumerate(data[chrom]):
            start = idx << SHIFTSIZE
            stop = min((idx + 1) << SHIFTSIZE, length)
            fields = [chrom, str(start), str(stop), str(hits)]
            print >> out, "\t".join(fields)
def main(): ''' main scripts ''' args = parse_argument() PromoterList =[] feature = [] print "# loading and reading the promoter of Gene ... " if args.genefile: for g in TableIO.parse(args.genefile, "genebed"): a = g.promoter(args.bp) PromoterList.append(a) if args.rna: for g in TableIO.parse(args.rna, "genebed"): a = g.promoter(args.bp) PromoterList.append(a) if args.feature: feature = TableIO.parse(args.feature, 'bed') print "# loading and reading Done !" PromoterData = readIntoBinIndex(PromoterList) FeatureData = readIntoBinIndex(feature) for i in TableIO.parse(args.bed, 'bed'): overlapGene, overlapFeature, overlap_string = [], [], '' if i.strand not in ['+','-']: continue else: OverlapGene = getOverlapFeatures(i, PromoterData) if FeatureData: overlapFeature = getOverlapFeatures(i, FeatureData) for g in overlapGene + overlapFeature: overlap_string += g.id+';' print i, "\t", overlap_string
def __init__(self, tabix_file_name, **dict):
    """
    wrapped in DBI.init(filename,"tabix")

    Open tabix_file_name through pysam.Tabixfile and store the options.

    Keyword options used here:
      header -- True: first record of tabix_file_name, fields stripped;
                list: taken as the header itself;
                str: first record of the file with that name.
      tabix  -- kept in self.tabix_format, "simple" when absent.
    """
    self.tabix_file_name = tabix_file_name
    self.dict = dict
    try:
        self.data = pysam.Tabixfile(tabix_file_name)
    except Exception:
        # Warn and continue as before, but define self.data so the
        # instance is not left half-built; Ctrl-C now propagates.
        print >>sys.stderr, "WARNING: Can't init the tabix file", tabix_file_name
        self.data = None
    self.header = None
    header = dict.get("header")
    if header == True:
        f = TableIO.parse(tabix_file_name)
        h = f.next()
        for i in range(len(h)):
            h[i] = h[i].strip()
        self.header = h
        f.close()
    elif isinstance(header, list):
        self.header = header
    elif isinstance(header, str):
        fh = TableIO.parse(header)
        self.header = fh.next()
    self.tabix_format = self.dict.get("tabix", "simple")
def Main(): ''' IO TEMPLATE ''' global args,out args=ParseArg() fin=IO.fopen(args.input,"r") out=IO.fopen(args.output,"w") ''' END OF IO TEMPLATE ''' print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION, print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )" print >>out,"# Date: ",time.asctime() print >>out,"# The command line is :" print >>out,"#\t"," ".join(sys.argv) dbi=DBI.init(args.db,Tools.guess_format(args.db)) references=dbi.bamfiles[0].references for i in TableIO.parse(fin,args.format): print i n=0 c_count=0 reads=dbi.query(i,args.method) for read in reads: compatible=Tools.compatible_with_transcript(read,i,references=references,strand=args.strand) print "HT:" for i0,r in enumerate(TableIO.parse(read.reads,"bam2bed12",references=references)): print "READ"+str(i0)+"\t",r print "COMPATIBLE:",compatible,"\n\n" if compatible: c_count+=1 n+=1 print "COMPATIBLE / ALL OVERLAP READS = ",c_count,"/",n print "RATIO\t%.4f"%float(c_count)/n
def Main():
    '''
    Read intervals into a bin index and print the coverage of every bin.

    Output columns: chr, start, stop, binindex, coverage (count divided by
    bin width), coverage_nt (raw count), binlevel, binsize.  When
    args.genome is given, bins starting at or beyond the chromosome end are
    skipped and the last bin is cut at the chromosome size.
    '''
    # IO TEMPLATE
    global args,out
    args=ParseArg()
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    if args.input=="stdin":
        fin=sys.stdin
    else:
        try:
            x=args.input.split(".")
            if x[-1]=="gz":
                fin=gzip.open(args.input,"r")
            else:
                fin=open(args.input,"r")
        except IOError:
            print >>sys.stderr,"can't read file",args.input
            fin=sys.stdin
    # END OF IO TEMPLATE
    if args.genome is not None:
        chr_sizes={}
        for x in TableIO.parse(args.genome):
            chr_sizes[x[0]]=int(x[1])
    bins=b()
    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#","\t".join(sys.argv)
    print >>out,"chr\tstart\tstop\tbinindex\tcoverage\tcoverage_nt\tbinlevel\tbinsize "
    header=(args.format=="metabed")
    for i,x in enumerate(TableIO.parse(fin,args.format,header=header)):
        if i%1000==0:
            print >>sys.stderr,"reading %d entries\r"%i,
        bins.append(x)
    c=b.bin2cov(bins)
    for chrom in sorted(c.keys()):
        for i,x in enumerate(c[chrom]):
            (start,end)=b.bin2range(i)
            if args.genome is not None:
                size=chr_sizes[chrom]
                # Fix: '>=' instead of '>'.  A bin starting exactly at the
                # chromosome end was clamped to zero width and the
                # coverage division below raised ZeroDivisionError.
                if start >= size:
                    continue
                if end > size:
                    end=size
            print >>out,chrom,"\t",start,"\t",end,"\t",i,"\t",float(x)/(end-start),"\t",x,"\t",b.bin2level(i),"\t",end-start
def Main():
    '''
    Tag every "oddsratiosnp" record as High/Low coverage and in/out of a
    centromere, then print a summary of the counts.

    h and l are 2x2 tables for high- and low-coverage records, indexed as
    [in_centromere][odds_ratio > args.chi2].  A record is high coverage
    when both sum(A_nt_dis) and sum(B_nt_dis) exceed args.t.
    '''
    global args,out
    args=ParseArg()
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    h=[[0,0],[0,0]]
    l=[[0,0],[0,0]]
    d=Utils.readIntoBinIndex(TableIO.parse(args.centromere,"bed"))
    print >>out,"# Coverage Threshold: ",args.t
    print >>out,"# Chi2 Threshold:",args.chi2
    for i in TableIO.parse(args.input,"oddsratiosnp"):
        # mark is 1 as soon as any centromere record overlaps, else 0
        mark=0
        for j in Utils.iterOverlapFeature(i,d):
            mark=1
            break
        if sum(i.A_nt_dis) > args.t and sum(i.B_nt_dis) > args.t:
            if i.odds_ratio > args.chi2:
                h[mark][1]+=1
            else:
                h[mark][0]+=1
            print >>out,i,"\tHigh\t",mark
        else:
            print >>out,i,"\tLow\t",mark
            if i.odds_ratio > args.chi2:
                l[mark][1]+=1
            else:
                l[mark][0]+=1
    print >>out,"# HighOddsRatio:",h[0][1]+l[0][1]+h[1][1]+l[1][1]
    print >>out,"# LowOddsRatio:",h[0][0]+l[0][0]+h[1][0]+l[1][0]
    print >>out,"#"
    print >>out,"# HighCoverage:",sum(h[1])+sum(h[0])
    print >>out,"# LowCoverage :",sum(l[1])+sum(l[0])
    print >>out,"#"
    print >>out,"# HighCoverage, HighOddsRatio",h[1][1]+h[0][1]
    print >>out,"# HighCoverage, LowOddsRatio",h[1][0]+h[0][0]
    print >>out,"#"
    print >>out,"# HighCoverage, InCentromere",sum(h[1])
    # Fix: this line printed sum(l[1]) (low coverage, in centromere);
    # high coverage outside the centromere is row 0 of h.
    print >>out,"# HighCoverage, NotInCentromere",sum(h[0])
    print >>out,"#"
    print >>out,"# HighCoverage, HighOddsRatio, InCentromere",h[1][1]
    print >>out,"# HighCoverage, HighOddsRatio, NotInCentromere",h[0][1]
    print >>out,"# HighCoverage, LowOddsRatio, InCentromere",h[1][0]
    print >>out,"# HighCoverage, LowOddsRatio, NotInCentromere",h[0][0]
    print >>out,"# LowCoverage, HighOddsRatio, InCentromere",l[1][1]
    print >>out,"# LowCoverage, HighOddsRatio, NotInCentromere",l[0][1]
    print >>out,"# LowCoverage, LowOddsRatio, InCentromere",l[1][0]
    print >>out,"# LowCoverage, LowOddsRatio, NotInCentromere",l[0][0]
def Main():
    """
    Label each "oddsratiosnp" record with its coverage class (High/Low)
    and centromere flag, and print summary counts at the end.

    h (high coverage) and l (low coverage) are 2x2 count tables indexed
    [in_centromere][odds_ratio > args.chi2].  High coverage means both
    sum(A_nt_dis) and sum(B_nt_dis) are above args.t.
    """
    global args, out
    args = ParseArg()
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout
    h = [[0, 0], [0, 0]]
    l = [[0, 0], [0, 0]]
    d = Utils.readIntoBinIndex(TableIO.parse(args.centromere, "bed"))
    print >> out, "# Coverage Threshold: ", args.t
    print >> out, "# Chi2 Threshold:", args.chi2
    for i in TableIO.parse(args.input, "oddsratiosnp"):
        # 1 when at least one centromere record overlaps, otherwise 0
        mark = 0
        for j in Utils.iterOverlapFeature(i, d):
            mark = 1
            break
        high_odds = 1 if i.odds_ratio > args.chi2 else 0
        if sum(i.A_nt_dis) > args.t and sum(i.B_nt_dis) > args.t:
            h[mark][high_odds] += 1
            print >> out, i, "\tHigh\t", mark
        else:
            print >> out, i, "\tLow\t", mark
            l[mark][high_odds] += 1
    print >> out, "# HighOddsRatio:", h[0][1] + l[0][1] + h[1][1] + l[1][1]
    print >> out, "# LowOddsRatio:", h[0][0] + l[0][0] + h[1][0] + l[1][0]
    print >> out, "#"
    print >> out, "# HighCoverage:", sum(h[1]) + sum(h[0])
    print >> out, "# LowCoverage :", sum(l[1]) + sum(l[0])
    print >> out, "#"
    print >> out, "# HighCoverage, HighOddsRatio", h[1][1] + h[0][1]
    print >> out, "# HighCoverage, LowOddsRatio", h[1][0] + h[0][0]
    print >> out, "#"
    print >> out, "# HighCoverage, InCentromere", sum(h[1])
    # Fix: was sum(l[1]), i.e. the low-coverage in-centromere total.
    print >> out, "# HighCoverage, NotInCentromere", sum(h[0])
    print >> out, "#"
    print >> out, "# HighCoverage, HighOddsRatio, InCentromere", h[1][1]
    print >> out, "# HighCoverage, HighOddsRatio, NotInCentromere", h[0][1]
    print >> out, "# HighCoverage, LowOddsRatio, InCentromere", h[1][0]
    print >> out, "# HighCoverage, LowOddsRatio, NotInCentromere", h[0][0]
    print >> out, "# LowCoverage, HighOddsRatio, InCentromere", l[1][1]
    print >> out, "# LowCoverage, HighOddsRatio, NotInCentromere", l[0][1]
    print >> out, "# LowCoverage, LowOddsRatio, InCentromere", l[1][0]
    print >> out, "# LowCoverage, LowOddsRatio, NotInCentromere", l[0][0]
def Main():
    '''
    Query one bamlist database per cell-line/histone-mark pair for every
    input record.

    Output: a "QR" line per query record, then for each mark a line with
    the mark name followed by one "HT" line per query result.
    '''
    global args,out
    CellLine=["H1"]
    HM=("input","H3K27ac","H3K27me3","H3K36me3","H3K4me1","H3K4me3","H3K9me3")
    marks=[]
    dbi={}
    args=ParseArg()
    out=sys.stdout
    if args.output!="stdout":
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    # one database handle per <cell>_<mark> bamlist
    for mark in [cell+"_"+hm for cell in CellLine for hm in HM]:
        marks.append(mark)
        dbi[mark]=DBI.init("/data/zhuxp/bam2x/data/bamlist/"+mark+".bamlist","bamlist")
    for index,query in enumerate(TableIO.parse(args.input,args.input_format)):
        print >>out,"QR\t",query
        if index%100==0:
            print >>sys.stderr,"query %d entries\r"%index,
        for mark in marks:
            print >>out,mark,"\t"
            for hit in DBI.query(query,dbi[mark]):
                print >>out,"HT\t",hit
def parseIterChrom(fn):
    '''
    Yield (chrom, positions, x2s, coverage) for each run of consecutive
    records in file fn that share the same chromosome.

    Each record is unpacked as (chrom, pos, snp, x2, x2_matrix, nt_dist);
    x2_matrix is text of the form "( a b c d )" and its integer sum is the
    record's coverage.  After the last record the pending group is always
    yielded, so an empty file yields (None, [], [], []) once.
    '''
    last_chrom=None
    positions=[]
    x2s=[]
    coverage=[]
    fin=open(fn)
    # Fix: the file handle used to be leaked; close it once parsing ends
    # (or the generator is discarded).
    try:
        for x in TableIO.parse(fin,'simple'):
            (chrom,pos,snp,x2,x2_matrix,nt_dist)=x
            counts=x2_matrix.replace("( ","").replace(" )","").split(" ")
            s=sum([int(y) for y in counts])
            if last_chrom is not None and chrom!=last_chrom:
                # chromosome changed: hand out the finished group
                yield last_chrom,positions,x2s,coverage
                positions=[]
                x2s=[]
                coverage=[]
            coverage.append(s)
            positions.append(pos)
            x2s.append(x2)
            last_chrom=chrom
    finally:
        fin.close()
    yield last_chrom,positions,x2s,coverage
def Main():
    '''
    Annotate every input record via annotation() and print it with its
    name, type and subtype; finish with one "#<type><TAB><count>" line per
    type seen.
    '''
    global args,out
    args=ParseArg()
    out=sys.stdout
    if args.output!="stdout":
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    count={}
    # bed6 database of all kinds of RNA
    dbi1=DBI.init(args.db,"bed")
    # bed12 database of lincRNA and mRNA with intron, exon, UTR
    dbi2=DBI.init(args.db_detail,"bed")
    genome=Genome('mouse', Release=67, account=None)
    for bed in TableIO.parse(args.input,args.format):
        [typ,name,subtype]=annotation(bed,dbi1,dbi2,genome)
        count[typ]=count.get(typ,0)+1
        columns=[bed.chr,bed.start,bed.stop,bed.id,name,bed.strand,typ,subtype]
        print >>out, "\t".join([str(f) for f in columns])
    summary=["#"+key+"\t%d"%(count[key]) for key in count.keys()]
    print >>out, "\n".join(summary)
def Main():
    '''
    Query every BAM in args.bams at each VCF record of the input and
    print the record followed by one tab-led column per BAM.
    '''
    # IO TEMPLATE
    global args,out
    args=ParseArg()
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    # END OF IO TEMPLATE
    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)
    dbi=[]
    for index,bam in enumerate(args.bams):
        print >>out,"# SAMPLE_"+str(index+1)+" BAM File:",bam
        dbi.append(DBI.init(bam,"bam"))
    # column header: the VCF header plus one Sample_<k> column per BAM
    print >>out,"#",VCF.header(),
    for index in range(len(args.bams)):
        print >>out,"\t","Sample_"+str(index+1),
    print >>out,""
    for index,vcf in enumerate(TableIO.parse(fin,"vcf")):
        vcf.chr=args.chr_prefix+vcf.chr
        if index%100==0:
            print >>sys.stderr,"processing",index,"vcf\r",
        print >>out,vcf,
        for database in dbi:
            print >>out,"\t",
            for result in database.query(vcf):
                print >>out,format(result),
        print >>out,""
def Main():
    '''
    Turn the "REGION" rows of a peak file into 5-column rows:
    fields 1-3 of the row, an id built from the input file name, and a
    score of -10*log10(field 4) capped at MAX_SCORE.
    '''
    global args,out
    MAX_SCORE=200
    args=ParseArg()
    out=sys.stdout
    if args.output!="stdout":
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    # id prefix: args.prefix + base name without the known suffixes
    name=args.input.split("/")[-1]
    for suffix in (".OddsRatio.peaks",".LogR.Peaks",".out"):
        name=name.replace(suffix,"")
    name=args.prefix+name
    region_count=0
    for row in TableIO.parse(args.input,"simple"):
        if row[0]!="REGION":
            continue
        region_count+=1
        if row[4]==0:
            # log10(0) is undefined; use the cap directly
            score=MAX_SCORE
        else:
            score=min(-10*math.log(row[4],10),MAX_SCORE)
        ID=name+"_ORP_"+str(region_count)
        print >>out,row[1]+"\t"+str(row[2])+"\t"+str(row[3])+"\t"+ID+"\t",
        print >>out,"%.2f"%score
def Main():
    """
    Join two tables on a key column.

    Rows of the first input are indexed by column args.c1; each row of
    args.input2 whose column args.c2 matches is printed after the matching
    first-table row (columns are 1-based).
    """
    # IO TEMPLATE
    global args, out
    args = ParseArg()
    out = sys.stdout
    if args.output != "stdout":
        try:
            out = open(args.output, "w")
        except IOError:
            print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout
    if args.input == "stdin":
        fin = sys.stdin
    else:
        try:
            # pick the opener from the last dot-separated name component
            if args.input.split(".")[-1] == "gz":
                fin = gzip.open(args.input, "r")
            else:
                fin = open(args.input, "r")
        except IOError:
            print >> sys.stderr, "can't read file", args.input
            fin = sys.stdin
    # END OF IO TEMPLATE
    print >> out, "# This data was generated by program ", sys.argv[0], " (version: %s)" % VERSION,
    print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >> out, "# Date: ", time.asctime()
    print >> out, "# The command line is :"
    print >> out, "#\t", " ".join(sys.argv)
    h1 = {}
    h2 = {}
    for left in TableIO.parse(fin):
        h1[left[args.c1 - 1]] = left
    for right in TableIO.parse(args.input2):
        key = right[args.c2 - 1]
        h2[key] = right
        if key in h1:
            joined = TableIO.format_string(h1[key]) + "\t" + TableIO.format_string(right)
            print >> out, joined
def main(): ''' main scripts ''' args = parse_argument() bed = args.bed gene = readIntoBinIndex(TableIO.parse( args.genefile, "genebed") ) for i in TableIO.parse(args.bed, 'bed'): if i.strand not in ['+','-']: continue else: OverlapGene = getOverlapFeatures(i, gene) Overlap_dict = Classify_Overlap(i, OverlapGene) overlap_string = '' for k,v in Overlap_dict.iteritems(): if v: overlap_string += "".join([ str(k+'_'+each)+';' for each in v]) if not overlap_string: overlap_string = 'intergenic' print i, overlap_string
def Main():
    '''
    Assign each input record a location class relative to every
    overlapping gene (MiRNA, Non-coding, 5-UTR, 3-UTR or the result of
    judge_exon) and write per-class totals to "<output stem>.cisStat".

    Records with no overlapping gene are reported as "Intergenic".
    '''
    global args
    args=ParseArg()
    out=sys.stdout
    if args.output!="stdout":
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    dbi=DBI.init(args.db,"genebed")
    count={"Intergenic":0}
    for x in TableIO.parse(args.input,args.input_format):
        found=False
        gene=""
        for hit in dbi.query(x):
            found=True
            # consecutive hits with the same align_id are reported once
            if hit.align_id==gene:
                continue
            gene=hit.align_id
            if hit.cds_start==hit.cds_stop:
                if hit.align_id[0:3]=="Mir":
                    loc="MiRNA"
                else:
                    loc="Non-coding"
            else:
                # the UTR labels swap sides on the minus strand
                if hit.strand=="+":
                    before_cds,after_cds="5-UTR","3-UTR"
                else:
                    before_cds,after_cds="3-UTR","5-UTR"
                if x.stop<=hit.cds_start:
                    loc=before_cds
                elif x.start>=hit.cds_stop:
                    loc=after_cds
                else:
                    loc=judge_exon(x,hit)
            fields=[x.chr,x.start,x.stop,x.id,x.score,x.strand,hit.align_id,loc]
            print >>out,"\t".join([str(f) for f in fields])
            count[loc]=count.get(loc,0)+1
        if not found:
            fields=[x.chr,x.start,x.stop,x.id,x.score,x.strand,"None","Intergenic"]
            print >>out,"\t".join([str(f) for f in fields])
            count["Intergenic"]+=1
    out2=open(args.output.split(".")[0]+".cisStat","w")
    for key in sorted(count.keys()):
        print >>out2,key+"\t"+str(count[key])
def Main():
    """
    Append RPKM columns for both ends of every interaction in args.input.

    For each data set in args.data (named by args.name, typed by
    args.format) the total read count is collected and a DBI handle is
    opened.  Each non-empty input line is split on tabs; columns 0-2 and
    7-9 give the two regions.  Lines touching "chrM" are skipped.  The
    output row is columns 0-6, the RPKMs of region 1, columns 7-13, the
    RPKMs of region 2, then columns 14 and 15.
    """
    args = ParseArg()
    if len(args.data) != len(args.name):
        print >> sys.stderr, "ERROR: Number of data is not the same as number of names!"
        # Fix: report the failure through a non-zero exit status.
        sys.exit(1)
    # store data information
    data = {}
    total_reads = {}
    for i in range(len(args.data)):
        temp_name = args.name[i]
        print >> sys.stderr, "\n Reading data file:" + temp_name + "..."
        total_reads[temp_name] = 0
        if args.format[i] == "bam":
            # third column of each idxstats line
            total_reads[temp_name] = sum([
                int(l.rstrip('\n').split('\t')[2])
                for l in pysam.idxstats(args.data[i])
            ])
        else:
            Format = "bed"
            for b in TableIO.parse(args.data[i], Format):
                total_reads[temp_name] += 1
                if total_reads[temp_name] % 50000 == 0:
                    print >> sys.stderr, " reading %d reads..\r" % (
                        total_reads[temp_name]),
        data[temp_name] = DBI.init(args.data[i], args.format[i])
    # Fix: both files are now closed (they used to be leaked).
    output = open(args.output, 'w')
    try:
        Input = open(args.input, 'r')
        try:
            lines = Input.read().split("\n")
        finally:
            Input.close()
        # header
        names = data.keys()
        header = ["chr", "start", "end", "type", "name", "subtype", "count"
                  ] + names
        print >> output, "\t".join(g + "_%d" % (f) for f in [1, 2]
                                   for g in header) + "\tinteraction\tp-value"
        num = 0
        print >> sys.stderr, "Start process interactions:"
        for l in lines:
            if l.strip() == '':
                continue
            l = l.strip().split('\t')
            num = num + 1
            if l[0] == "chrM" or l[7] == "chrM":
                continue
            C1 = Bed([l[0], int(l[1]), int(l[2])])
            C2 = Bed([l[7], int(l[8]), int(l[9])])
            rpkm1 = "\t".join(
                [str(RPKM(C1, data[n], total_reads[n], n)) for n in names])
            rpkm2 = "\t".join(
                [str(RPKM(C2, data[n], total_reads[n], n)) for n in names])
            print >> output, "\t".join(
                str(f)
                for f in l[:7] + [rpkm1] + l[7:14] + [rpkm2, l[14], l[15]])
            if num % 1000 == 0:
                print >> sys.stderr, " Output interaction: %d\r" % (num),
    finally:
        output.close()
def main(): ''' main scripts ''' args = parse_argument() bed = args.bed gene = readIntoBinIndex(TableIO.parse(args.genefile, "genebed")) for i in TableIO.parse(args.bed, 'bed'): if i.strand not in ['+', '-']: continue else: OverlapGene = getOverlapFeatures(i, gene) Overlap_dict = Classify_Overlap(i, OverlapGene) overlap_string = '' for k, v in Overlap_dict.iteritems(): if v: overlap_string += "".join( [str(k + '_' + each) + ';' for each in v]) if not overlap_string: overlap_string = 'intergenic' print i, overlap_string
def toBed12Tuple(self,chr="chr",strand="read2"):
    '''
    test now

    Return a list of everything TableIO yields for self.reads in the
    "bam2bed12tuple" format; chr is passed on as `references`.
    '''
    from xplib import TableIO
    parsed=TableIO.parse(self.reads,"bam2bed12tuple",references=chr,strand=strand)
    return list(parsed)
def Main(): ''' IO TEMPLATE ''' global args, out args = ParseArg() if args.output == "stdout": out = sys.stdout else: try: out = open(args.output, "w") except IOError: print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead" out = sys.stdout if args.input == "stdin": fin = sys.stdin else: try: x = args.input.split(".") if x[-1] == "gz": fin = gzip.open(args.input, "r") else: fin = open(args.input, "r") except IOError: print >> sys.stderr, "can't read file", args.input fin = sys.stdin ''' END OF IO TEMPLATE ''' print >> out, "# This data was generated by program ", sys.argv[ 0], " (version: %s)" % VERSION, print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )" print >> out, "# Date: ", time.asctime() print >> out, "# The command line is :" print >> out, "#\t", " ".join(sys.argv) enhancer_dbi = DBI.init(args.enhancer_tabix, "tabix", tabix="metabed", header=re.sub(".gz$", ".header", args.enhancer_tabix)) promoter_dbi = DBI.init(args.promoter_tabix, "tabix", tabix="metabed", header=re.sub(".gz$", ".header", args.promoter_tabix)) for i in TableIO.parse(fin, args.format): tss = i.tss() tss.start -= args.size tss.stop += args.size if tss.start < 0: tss.start = 0 tss.id += "_near" + str(args.size) print "QR\t", tss for e in enhancer_dbi.query(tss): print "EH\t", e for p in promoter_dbi.query(tss): print "PM\t", p
def query(self,x,**kwargs):
    '''
    yield the overlap feature in tabix index files

    Fetches x.chr:x.start-x.stop from self.data and yields each record
    parsed with self.tabix_format and self.header.  Any ordinary error
    during the fetch or parsing ends the iteration silently (best effort,
    as before).
    '''
    try:
        for item in TableIO.parse(self.data.fetch(x.chr,x.start,x.stop),format=self.tabix_format,header=self.header):
            yield item
    except Exception:
        # Fix: a bare except also trapped KeyboardInterrupt/SystemExit,
        # and "raise StopIteration" inside a generator becomes a
        # RuntimeError under PEP 479; a plain return ends it cleanly.
        return
def Main():
    '''
    Join the input table with args.input2 on a key column: column
    args.c1 of the first table against column args.c2 of the second
    (1-based).  Matching pairs are printed first-table row, tab,
    second-table row.
    '''
    # IO TEMPLATE
    global args,out
    args=ParseArg()
    out=sys.stdout
    if args.output!="stdout":
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    if args.input=="stdin":
        fin=sys.stdin
    else:
        try:
            name_parts=args.input.split(".")
            if name_parts[-1]=="gz":
                fin=gzip.open(args.input,"r")
            else:
                fin=open(args.input,"r")
        except IOError:
            print >>sys.stderr,"can't read file",args.input
            fin=sys.stdin
    # END OF IO TEMPLATE
    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)
    h1={}
    h2={}
    key1=args.c1-1
    for row in TableIO.parse(fin):
        h1[row[key1]]=row
    key2=args.c2-1
    for row in TableIO.parse(args.input2):
        h2[row[key2]]=row
        if row[key2] in h1:
            print >>out,TableIO.format_string(h1[row[key2]])+"\t"+TableIO.format_string(row)
def __init__(self,file,**dict):
    '''
    Wrapped in xplib.DBI.init()

    Build self.data as a binindex over `file`: a plain list is used as
    given, anything else is parsed with the format named by the required
    'format' keyword.
    '''
    if type(file) is list:
        records=file
    else:
        records=TableIO.parse(file,dict['format'])
    self.data=binindex(records)
def test(): if len(sys.argv)==1: print >>sys.stderr,"Usage: Utils.py file.bed" exit() a=TableIO.parse(sys.argv[1],'bed') data=readIntoBinIndex(a) bed=Bed(["chr1",100000,200000,".",".","."]) g=getOverlapFeatures(bed,data) print "Overlap with",bed for i in g: print i
def test(): if len(sys.argv) == 1: print >> sys.stderr, "Usage: Utils.py file.bed" exit() a = TableIO.parse(sys.argv[1], 'bed') data = readIntoBinIndex(a) bed = Bed(["chr1", 100000, 200000, ".", ".", "."]) g = getOverlapFeatures(bed, data) print "Overlap with", bed for i in g: print i
def query(self,x):
    '''
    yield the overlap feature in tabix index files

    The record format is self.dict["tabix"] when present, else "simple".
    Records fetched for x.chr:x.start-x.stop are parsed with that format
    and self.header.  Ordinary errors while fetching or parsing end the
    iteration silently (best effort, as before).
    '''
    f=self.dict.get("tabix","simple")
    try:
        for item in TableIO.parse(self.data.fetch(x.chr,x.start,x.stop),format=f,header=self.header):
            yield item
    except Exception:
        # Fix: no longer traps KeyboardInterrupt/SystemExit, and ends the
        # generator with return instead of "raise StopIteration", which
        # PEP 479 turns into a RuntimeError.
        return
def parseIterRegion(fn):
    '''
    yield each region ( gap < args.gap)

    Yields (chrom, positions, x2s, matrix_x2, snps) for each run of
    records on one chromosome whose successive positions are less than
    args.gap apart.

    Each record is unpacked as (chrom, pos, snp, x2, x2_matrix, nt_dist),
    with x2_matrix given as text "( a b c d )".  With CV = a+b+c+d and
    MR = (b+d)/CV, records with CV > args.reads or MR < 0.05 are skipped.
    The pending group is always yielded at the end, so an input without
    accepted records yields (None, [], [], [], []) once.
    '''
    last_chrom = None
    last_position = 0
    positions = []
    x2s = []
    snps = []
    matrix_x2 = []
    fin = open(fn)
    # Fix: close the input file when parsing ends (it used to be leaked).
    try:
        for x in TableIO.parse(fin, 'simple'):
            (chrom, pos, snp, x2, x2_matrix, nt_dist) = x
            # add filter here
            b = x2_matrix.replace("( ", "").replace(" )", "")
            x2_matrix = [int(y) for y in b.split(" ")]
            t = x2_matrix
            CV = t[0] + t[1] + t[2] + t[3]
            # Fix: with zero counts MR is undefined and the division
            # raised ZeroDivisionError; such a record is skipped.
            if CV == 0:
                continue
            MR = (float(t[1]) + float(t[3])) / CV
            if (CV > args.reads or MR < 0.05):
                continue
            # end of filter
            extends_region = (last_chrom is None) or (
                chrom == last_chrom and pos - last_position < args.gap)
            if not extends_region:
                yield last_chrom, positions, x2s, matrix_x2, snps
                positions = []
                x2s = []
                matrix_x2 = []
                snps = []
            matrix_x2.append(x2_matrix)
            positions.append(pos)
            x2s.append(x2)
            snps.append(snp)
            last_chrom = chrom
            last_position = pos
    finally:
        fin.close()
    yield last_chrom, positions, x2s, matrix_x2, snps
def Main():
    """
    Parse args.input as "oddsratiosnp" and write every record to the
    selected output (args.output, or stdout).
    """
    global args, out
    args = ParseArg()
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout
    for i in TableIO.parse(args.input, "oddsratiosnp"):
        # Fix: records went to stdout even though `out` had been opened
        # (and truncated), so the output option had no effect.
        print >> out, i
def parseAnnotationFile(fn):
    '''
    Add one to data[chrom][bin] for every bin from start>>SHIFTSIZE to
    stop>>SHIFTSIZE (inclusive) of each record in fn.

    The format is args.format, with "bam" read as "bam2bed".  Records on
    chromosomes missing from `data` are reported on stderr and skipped.
    '''
    fmt=args.format
    if fmt=="bam":
        fmt="bam2bed"
    for x in TableIO.parse(fn,fmt):
        if x.chr not in data:
            print >>sys.stderr,"ignore",x
            print >>sys.stderr,"since this chromosome size is not in ",args.chr_length_file
            continue
        first_bin=x.start>>SHIFTSIZE
        last_bin=x.stop>>SHIFTSIZE
        index=first_bin
        while index<=last_bin:
            data[x.chr][index]+=1
            index+=1
def parseAnnotationFile(fn):
    """
    Count, per bin of `data`, the records of fn that touch it.

    A record touches bins start >> SHIFTSIZE through stop >> SHIFTSIZE.
    args.format selects the parser ("bam" is mapped to "bam2bed");
    records whose chromosome is not a key of `data` are logged to stderr
    and ignored.
    """
    parser_format = args.format
    if parser_format == "bam":
        parser_format = "bam2bed"
    for x in TableIO.parse(fn, parser_format):
        if x.chr in data:
            lo = x.start >> SHIFTSIZE
            hi = x.stop >> SHIFTSIZE
            for slot in range(lo, hi + 1):
                data[x.chr][slot] += 1
        else:
            print >> sys.stderr, "ignore", x
            print >> sys.stderr, "since this chromosome size is not in ", args.chr_length_file
def Main():
    '''
    Parse args.input as 'transunit' and write each record to the output.
    '''
    global args,out
    args=ParseArg()
    out=sys.stdout
    if args.output!="stdout":
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    for unit in TableIO.parse(args.input,'transunit'):
        print >>out,unit
def parseIterRegion(fn):
    '''
    yield each region ( gap < args.gap)

    Each yielded item is (chrom, positions, x2s, matrix_x2, snps) for a
    run of accepted records on the same chromosome in which every
    position is less than args.gap after the previous one.

    Records are unpacked as (chrom, pos, snp, x2, x2_matrix, nt_dist);
    x2_matrix is text "( a b c d )".  A record is dropped when
    CV = a+b+c+d exceeds args.reads or MR = (b+d)/CV is below 0.05.
    The last pending group is always yielded, even when it is empty.
    '''
    last_chrom=None
    last_position=0
    positions=[]
    x2s=[]
    snps=[]
    matrix_x2=[]
    fin=open(fn)
    # Fix: the file is now closed when the generator finishes.
    try:
        for x in TableIO.parse(fin,'simple'):
            (chrom,pos,snp,x2,x2_matrix,nt_dist)=x
            # add filter here
            b=x2_matrix.replace("( ","")
            b=b.replace(" )","")
            x2_matrix=[int(y) for y in b.split(" ")]
            t=x2_matrix
            CV=t[0]+t[1]+t[2]+t[3]
            # Fix: zero total counts made the MR division raise
            # ZeroDivisionError; skip such records.
            if CV==0:
                continue
            MR=(float(t[1])+float(t[3]))/CV
            if CV>args.reads or MR<0.05:
                continue
            # end of filter
            if last_chrom is not None and not (chrom==last_chrom and pos-last_position < args.gap):
                # gap too large or chromosome changed: close the region
                yield last_chrom,positions,x2s,matrix_x2,snps
                positions=[]
                x2s=[]
                matrix_x2=[]
                snps=[]
            matrix_x2.append(x2_matrix)
            positions.append(pos)
            x2s.append(x2)
            snps.append(snp)
            last_chrom=chrom
            last_position=pos
    finally:
        fin.close()
    yield last_chrom,positions,x2s,matrix_x2,snps
def Main(): ''' IO TEMPLATE ''' global args, out args = ParseArg() fin = IO.fopen(args.input, "r") out = IO.fopen(args.output, "w") ''' END OF IO TEMPLATE ''' print >> out, "# This data was generated by program ", sys.argv[ 0], " (version: %s)" % VERSION, print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )" print >> out, "# Date: ", time.asctime() print >> out, "# The command line is :" print >> out, "#\t", " ".join(sys.argv) dbi = DBI.init(args.db, Tools.guess_format(args.db)) references = dbi.bamfiles[0].references for i in TableIO.parse(fin, args.format): print i n = 0 c_count = 0 reads = dbi.query(i, args.method) for read in reads: compatible = Tools.compatible_with_transcript( read, i, references=references, strand=args.strand) print "HT:" for i0, r in enumerate( TableIO.parse(read.reads, "bam2bed12", references=references)): print "READ" + str(i0) + "\t", r print "COMPATIBLE:", compatible, "\n\n" if compatible: c_count += 1 n += 1 print "COMPATIBLE / ALL OVERLAP READS = ", c_count, "/", n print "RATIO\t%.4f" % float(c_count) / n
def Main():
    '''
    This program is a test for TableIO.parse(file.bam,"bam2bed")

    Parses args.input in args.format (guessed from the file name when the
    format is "guess") and writes every record to the output.
    '''
    global args,out
    args=ParseArg()
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    if args.format=="guess":
        args.format=Tools.guess_format(args.input)
    for record in TableIO.parse(args.input,args.format):
        print >>out,record
def test(): if len(sys.argv)==1: print >>sys.stderr,"Usage: Utils.py file.bed" exit() a=TableIO.parse(sys.argv[1],'genebed') data=readIntoBinIndex(a) bed=Bed( ["chr12", 54380000, 54392000, "HOXC", 0, "+"] ) g=getOverlapFeatures(bed,data) Overlap_dict = Classify_Overlap(bed, g) overlap_string = '' for k, v in Overlap_dict.iteritems(): if v: overlap_string += "".join([ str(k+'_'+each)+';' for each in v]) print bed, overlap_string
def test(): if len(sys.argv) == 1: print >> sys.stderr, "Usage: Utils.py file.bed" exit() a = TableIO.parse(sys.argv[1], 'genebed') data = readIntoBinIndex(a) bed = Bed(["chr12", 54380000, 54392000, "HOXC", 0, "+"]) g = getOverlapFeatures(bed, data) Overlap_dict = Classify_Overlap(bed, g) overlap_string = '' for k, v in Overlap_dict.iteritems(): if v: overlap_string += "".join( [str(k + '_' + each) + ';' for each in v]) print bed, overlap_string
def Main(): ''' IO TEMPLATE ''' global args,out args=ParseArg() if args.output=="stdout": out=sys.stdout else: try: out=open(args.output,"w") except IOError: print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead" out=sys.stdout if args.input=="stdin": fin=sys.stdin else: try: x=args.input.split(".") if x[-1]=="gz": fin=gzip.open(args.input,"r") else: fin=open(args.input,"r") except IOError: print >>sys.stderr,"can't read file",args.input fin=sys.stdin ''' END OF IO TEMPLATE ''' print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION, print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )" print >>out,"# Date: ",time.asctime() print >>out,"# The command line is :" print >>out,"#\t"," ".join(sys.argv) enhancer_dbi=DBI.init(args.enhancer_tabix,"tabix",tabix="metabed",header=re.sub(".gz$",".header",args.enhancer_tabix)) promoter_dbi=DBI.init(args.promoter_tabix,"tabix",tabix="metabed",header=re.sub(".gz$",".header",args.promoter_tabix)) for i in TableIO.parse(fin,args.format): tss=i.tss() tss.start-=args.size tss.stop+=args.size if tss.start<0: tss.start=0 tss.id+="_near"+str(args.size) print "QR\t",tss for e in enhancer_dbi.query(tss): print "EH\t",e for p in promoter_dbi.query(tss): print "PM\t",p
def Main():
    '''
    Add RPKM values for both regions of every interaction in args.input.

    Every data set (args.data / args.name / args.format) contributes its
    total read count and a DBI handle.  Input lines are tab separated:
    columns 0-2 and 7-9 describe the two regions; lines involving "chrM"
    are skipped.  Output: columns 0-6, RPKMs of region 1, columns 7-13,
    RPKMs of region 2, columns 14 and 15.
    '''
    args=ParseArg()
    if len(args.data)!=len(args.name):
        print >> sys.stderr, "ERROR: Number of data is not the same as number of names!"
        # Fix: exit with a failure status instead of 0.
        sys.exit(1)
    # store data information
    data={}
    total_reads={}
    for i in range(len(args.data)):
        temp_name=args.name[i]
        print >> sys.stderr, "\n Reading data file:"+temp_name+"..."
        total_reads[temp_name]=0
        if args.format[i]=="bam":
            # sum of the third idxstats column
            total_reads[temp_name]=sum([int(l.rstrip('\n').split('\t')[2]) for l in pysam.idxstats(args.data[i])])
        else:
            Format="bed"
            for b in TableIO.parse(args.data[i],Format):
                total_reads[temp_name]+=1
                if total_reads[temp_name]%50000==0:
                    print >> sys.stderr, " reading %d reads..\r"%(total_reads[temp_name]),
        data[temp_name]=DBI.init(args.data[i],args.format[i])
    # Fix: input and output files are closed now (previously leaked).
    output=open(args.output,'w')
    try:
        Input=open(args.input,'r')
        try:
            lines=Input.read().split("\n")
        finally:
            Input.close()
        # header
        names=data.keys()
        header=["chr","start","end","type","name","subtype","count"]+names
        print >> output, "\t".join(g+"_%d"%(f) for f in [1,2] for g in header)+"\tinteraction\tp-value"
        num=0
        print >> sys.stderr, "Start process interactions:"
        for l in lines:
            if l.strip()=='':
                continue
            l=l.strip().split('\t')
            num=num+1
            if l[0]=="chrM" or l[7]=="chrM":
                continue
            C1=Bed([l[0],int(l[1]),int(l[2])])
            C2=Bed([l[7],int(l[8]),int(l[9])])
            rpkm1="\t".join([str(RPKM(C1,data[n],total_reads[n],n)) for n in names])
            rpkm2="\t".join([str(RPKM(C2,data[n],total_reads[n],n)) for n in names])
            print >> output, "\t".join(str(f) for f in l[:7]+[rpkm1]+l[7:14]+[rpkm2,l[14],l[15]])
            if num%1000==0:
                print >> sys.stderr, " Output interaction: %d\r"%(num),
    finally:
        output.close()
def __init__(self,bamfiles,**dict):
    '''
    Collect the bam handles this object works on into self.bamfiles.

    bamfiles: either an iterable of items, or a single string naming a
              text file; in the latter case the file is parsed with
              TableIO "simple" and field 0 of every row is taken as one
              item.  String items are opened with pysam.Samfile(path,"rb");
              any other item is stored unchanged.
    dict:     extra keyword options; not used in this constructor.
    '''
    # basestring covers both str and unicode paths.
    if isinstance(bamfiles,basestring):
        filename=bamfiles
        bamfiles=[row[0] for row in TableIO.parse(filename,"simple")]
    self.bamfiles=[]
    for bamfile in bamfiles:
        if isinstance(bamfile,basestring):
            try:
                bamfile=pysam.Samfile(bamfile,"rb")
            except Exception:
                # Best effort: warn and carry on.  Exception (rather than a
                # bare except) lets Ctrl-C and SystemExit propagate.
                # NOTE(review): the unopened path string is still appended
                # below, as before - confirm that later users of
                # self.bamfiles tolerate a str entry.
                print >>sys.stderr,"WARNING: Can't init the bam file",bamfile
        self.bamfiles.append(bamfile)
def ParseInput(fin):
    '''
    Group the rows read from `fin` into one tuple per "VCF" row.

    Rows are obtained from TableIO.parse(fin) and dispatched on field 0:
      "VCF"  starts a new group; the remaining fields build a VCF object
      "HM "  the remaining fields become the group's hm list
      "DIS " the remaining fields become the group's hits list
    (the trailing blank in "HM " and "DIS " is part of the tag).

    Yields (vcf, hm, hits) for each group.  Nothing is yielded when the
    input holds no "VCF" row.
    '''
    started=False
    for x in TableIO.parse(fin):
        if x[0]=="VCF":
            if started:
                yield (vcf,hm,hits)
            vcf=VCF(x[1:])
            hm=[]
            hits=[]
            started=True
        elif x[0]=="HM ":
            hm=x[1:]
        elif x[0]=="DIS ":
            hits=x[1:]
    # Only flush a group that was actually opened; an input without any
    # "VCF" row used to fail here with an unbound local variable.
    if started:
        yield (vcf,hm,hits)
def Main():
    '''
    Extend every bed record of the input by args.extend on both sides.

    start is decreased and stop increased by args.extend (start clamped at
    0), "_extend_<N>bp" is appended to the record id, and the record is
    printed to the output stream.

    Sets the module globals `args` and `out`.
    '''
    global args,out
    args=ParseArg()
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    extend=args.extend
    suffix="_extend_"+str(extend)+"bp"
    # The with-block closes the input handle even if parsing fails.
    with open(args.input) as f:
        for bed in TableIO.parse(f,"bed"):
            bed.start-=extend
            # A record near the chromosome start must not get a negative
            # coordinate.
            if bed.start<0:
                bed.start=0
            bed.stop+=extend
            bed.id+=suffix
            print >>out,bed
def parse(fin):
    '''
    Group the rows read from `fin` into one tuple per "QR" row.

    Rows are obtained from TableIO.parse(fin) and dispatched on field 0:
      "QR"   starts a new group; the remaining fields build a VCF object
      "HT"   field 1 is evaluated and stored in hits under the `hm` name
             taken from the most recent row of the third kind
      other  field 0 is split on "_" into exactly (cell, hm); hm is
             recorded for the group

    Yields (vcf, sorted hm names, hits) for each group.  Nothing is
    yielded when the input holds no "QR" row.
    '''
    started=False
    for x in TableIO.parse(fin):
        if x[0]=="QR":
            if started:
                yield (vcf,sorted(HM),hits)
            vcf=VCF(x[1:])
            HM={}
            hits={}
            started=True
        elif x[0]=="HT":
            # SECURITY NOTE: eval() runs whatever expression the input
            # file contains.  Only feed this parser files from a trusted
            # source; consider ast.literal_eval if the field is always a
            # plain literal.
            hit=eval(x[1])
            hits[hm]=hit
        else:
            (cell,hm)=x[0].split("_")
            HM[hm]=1
    # Only flush a group that was actually opened; an input without any
    # "QR" row used to fail here with an unbound local variable.
    if started:
        yield (vcf,sorted(HM),hits)
def Main():
    '''
    Exercise TableIO.parse(file.bam,"bam2bed"): print every parsed record.

    Sets the module globals `args` and `out`.
    '''
    global args, out
    args = ParseArg()
    if args.output != "stdout":
        try:
            out = open(args.output, "w")
        except IOError:
            # Unwritable target: report it and keep going on stdout.
            print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout
    else:
        out = sys.stdout
    for record in TableIO.parse(args.input, "bam2bed"):
        print >> out, record
def Main():
    '''
    Extend every bed record of the input by args.extend on both sides.

    start is decreased and stop increased by args.extend (start clamped at
    0), "_extend_<N>bp" is appended to the record id, and the record is
    printed to the output stream.

    Sets the module globals `args` and `out`.
    '''
    global args, out
    args = ParseArg()
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout
    extend = args.extend
    suffix = "_extend_" + str(extend) + "bp"
    # The with-block closes the input handle even if parsing fails.
    with open(args.input) as f:
        for bed in TableIO.parse(f, "bed"):
            bed.start -= extend
            # A record near the chromosome start must not get a negative
            # coordinate.
            if bed.start < 0:
                bed.start = 0
            bed.stop += extend
            bed.id += suffix
            print >> out, bed
def Main():
    '''
    Add one weighted read-count column per bed file to a nucleosome table.

    Every file in args.beds is indexed under the matching args.name entry.
    For each record of args.nucleosome (parsed as "metabed" with header),
    records on chrX, chrY and chrM are skipped.  Otherwise the bed entries
    overlapping a window around the record's centre are fetched, each
    contributes a weight between 0 and 1 that depends on the distance of
    its own centre (from find_center) to the record centre, and the
    per-file sums are appended to the record's line.

    The first line of args.nucleosome, extended with the names, is
    written as the header of args.output.
    '''
    args = ParseArg()

    #store bed files with indexing and count information:
    bed = {}
    print >> sys.stderr, "Starting index bed files:"
    for i in range(len(args.beds)):
        temp_name = args.name[i]
        print >> sys.stderr, " #Indexing for bed file of", temp_name, "\r",
        bed[temp_name] = DBI.init(args.beds[i], 'bed')
    # Count columns must follow the same order as the header below, which
    # lists args.name; iterating the dict would give an arbitrary order.
    names = args.name[:len(args.beds)]

    half_len = int(args.len)
    print >> sys.stderr
    print >> sys.stderr, "Reading nucleosome peak xls file from Danpos."
    nucleosomes = TableIO.parse(args.nucleosome, 'metabed', header=True)

    print >> sys.stderr, "Start Counting..."
    with open(args.output, "w") as out:
        with open(args.nucleosome, 'r') as head:
            line_head = head.readline().strip()
        line_head = line_head + "\t" + "\t".join(str(f) for f in args.name)
        print >> out, line_head
        # NOTE(review): `ma` is not defined in this function; it is
        # presumably a module-level constant - confirm.
        flank = ma + (half_len - 75)
        for i in nucleosomes:
            chrom = i.chr
            if chrom in ('chrY', 'chrX', 'chrM'):
                continue
            # Integer midpoint (floor division, as "/" does for ints here).
            center = int(i.start + i.end) // 2
            count = np.zeros(len(args.beds), dtype="float")
            line = str(i)
            for k, name in enumerate(names):
                for j in bed[name].query(Bed([chrom, center - flank, center + flank])):
                    j_center = find_center(j, half_len)
                    # Weight is clipped to the range [0, 1].
                    weight = max(min(1, (ma - abs(j_center - center)) / 25.0), 0)
                    count[k] += weight
            line = line + "\t" + "\t".join(str(f) for f in count)
            print >> out, line
def Main():
    '''
    Write the cDNA sequence of every input record in fasta layout.

    Each record parsed from the input gets a ">" + id + "_cDNA" title line
    followed by the sequence returned by the genome's get_cdna_seq().  The
    sequence is printed as is when args.line is set, otherwise it is
    passed through seq_wrapper first.

    Sets the module globals `args` and `out`.
    '''
    global args, out
    args = ParseArg()
    # Output stream; an unwritable file falls back to stdout.
    if args.output != "stdout":
        try:
            out = open(args.output, "w")
        except IOError:
            print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"
            out = sys.stdout
    else:
        out = sys.stdout
    # Input stream; a "gz" suffix selects gzip, an unreadable file falls
    # back to stdin.
    if args.input != "stdin":
        try:
            suffix = args.input.split(".")[-1]
            opener = gzip.open if suffix == "gz" else open
            fin = opener(args.input, "r")
        except IOError:
            print >> sys.stderr, "can't read file", args.input
            fin = sys.stdin
    else:
        fin = sys.stdin
    # Provenance header.
    print >> out, "# This data was generated by program ", sys.argv[0], " (version: %s)" % VERSION, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >> out, "# Date: ", time.asctime()
    print >> out, "# The command line is :"
    print >> out, "#\t", " ".join(sys.argv)
    genome = GenomeI(args.genome)
    for record in TableIO.parse(fin, args.format):
        print >> out, ">" + record.id + "_cDNA"
        cdna = genome.get_cdna_seq(record)
        print >> out, cdna if args.line else seq_wrapper(cdna)
def Main():
    '''
    Load all queries, split them into args.thread slices and start one
    fquery thread per slice.

    Every thread receives (its slice of queries, the shared database
    handle, its index).  The last thread also takes whatever remains after
    the even split.

    Sets the module globals `args` and `out`; sys.argv[0] is shortened to
    its part after the last "/".
    '''
    global args,out
    args=ParseArg()
    if args.output!="stdout":
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    else:
        out=sys.stdout
    # argv aliases sys.argv, so the shortened program name is stored there.
    argv=sys.argv
    argv[0]=argv[0].split("/")[-1]
    print >>out,"# This data was generated by program ",argv[0],"(version %s)"%VERSION,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :\n#\t"," ".join(argv)
    dbi=DBI.init(args.db,args.dbformat)
    source=sys.stdin if args.input=="stdin" else args.input
    query_sets=list(TableIO.parse(source,args.input_format))
    length=len(query_sets)
    size=length/args.thread
    for i in range(args.thread):
        begin=i*size
        if (i+1)==args.thread:
            # Final slice runs to the end of the query list.
            end=length
        else:
            end=min((i+1)*size,length)
        thread.start_new_thread(fquery,(query_sets[begin:end],dbi,i))
def __init__(self,handle=None,**kwargs):
    '''
    Create an empty index and optionally fill it.

    handle: when not None, it is passed to self.read().
    kwargs: "file"   - a source that is parsed with TableIO.parse and
                       passed to self.read()
            "format" - format name handed to TableIO.parse ("bed" when
                       absent)
            any other keyword is forwarded to TableIO.parse.

    Example:
        data=binindex(file="file.bed")
        or
        data=binindex(file="file.vcf",format="vcf")
    '''
    self.data={}
    if handle is not None:
        self.read(handle)
    # "format" is always removed from kwargs, even when no file is given.
    fmt=kwargs.pop("format","bed")
    if "file" in kwargs:
        source=kwargs.pop("file")
        self.read(TableIO.parse(source,fmt,**kwargs))
def Main():
    '''
    Parse the input in args.format and print every record to the output.

    Input and output streams are obtained from IO.fopen; a provenance
    header (program, version, date, command line) is written first.

    Sets the module globals `args` and `out`.
    '''
    global args, out
    args = ParseArg()
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    # Provenance header.
    print >> out, "# This data was generated by program ", sys.argv[0], " (version: %s)" % VERSION, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >> out, "# Date: ", time.asctime()
    print >> out, "# The command line is :"
    print >> out, "#\t", " ".join(sys.argv)
    # Echo the records one per line.
    for record in TableIO.parse(fin, args.format):
        print >> out, record