def Main():
    global args, out
    args = ParseArg()
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >>sys.stderr, "can't open file", args.output, "to write. Using stdout instead"
            out = sys.stdout
    count = {}
    dbi1 = DBI.init(args.db, "bed")         # the DBI init file for bed6 file of all kinds of RNA
    dbi2 = DBI.init(args.db_detail, "bed")  # the DBI init file for bed12 file of lincRNA and mRNA with intron, exon, UTR
    genome = Genome('mouse', Release=67, account=None)
    for bed in TableIO.parse(args.input, args.format):
        [typ, name, subtype] = annotation(bed, dbi1, dbi2, genome)
        if count.has_key(typ):
            count[typ] += 1
        else:
            count[typ] = 1
        print >>out, "\t".join(str(f) for f in [bed.chr, bed.start, bed.stop, bed.id, name, bed.strand, typ, subtype])
    print >>out, "\n".join("#" + typ + "\t%d" % (count[typ]) for typ in count.keys())
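# A minimal sketch (not part of the original script): the has_key/else counting
# idiom above can be written with collections.defaultdict, which initializes
# missing keys to zero automatically.
from collections import defaultdict

count = defaultdict(int)
for typ in ["exon", "intron", "exon"]:
    count[typ] += 1
# count == {"exon": 2, "intron": 1}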
def Main():
    ''' IO TEMPLATE '''
    global args, out
    args = ParseArg()
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >>sys.stderr, "can't open file", args.output, "to write. Using stdout instead"
            out = sys.stdout
    if args.input == "stdin":
        fin = sys.stdin
    else:
        try:
            x = args.input.split(".")
            if x[-1] == "gz":
                fin = gzip.open(args.input, "r")
            else:
                fin = open(args.input, "r")
        except IOError:
            print >>sys.stderr, "can't read file", args.input
            fin = sys.stdin
    ''' END OF IO TEMPLATE '''
    print >>out, "# This data was generated by program ", sys.argv[0], " (version: %s)" % VERSION,
    print >>out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out, "# Date: ", time.asctime()
    print >>out, "# The command line is :"
    print >>out, "#\t", " ".join(sys.argv)
    enhancer_dbi = DBI.init(args.enhancer_tabix, "tabix", tabix="metabed",
                            header=re.sub(".gz$", ".header", args.enhancer_tabix))
    promoter_dbi = DBI.init(args.promoter_tabix, "tabix", tabix="metabed",
                            header=re.sub(".gz$", ".header", args.promoter_tabix))
    for i in TableIO.parse(fin, args.format):
        tss = i.tss()
        tss.start -= args.size
        tss.stop += args.size
        if tss.start < 0:
            tss.start = 0
        tss.id += "_near" + str(args.size)
        print "QR\t", tss
        for e in enhancer_dbi.query(tss):
            print "EH\t", e
        for p in promoter_dbi.query(tss):
            print "PM\t", p
def Main():
    ''' IO TEMPLATE '''
    global args, out
    args = ParseArg()
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    ''' END OF IO TEMPLATE '''
    print >>out, "# This data was generated by program ", sys.argv[0], " (version: %s)" % VERSION,
    print >>out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out, "# Date: ", time.asctime()
    print >>out, "# The command line is :"
    print >>out, "#\t", " ".join(sys.argv)
    dbi = DBI.init(args.db, Tools.guess_format(args.db))
    references = dbi.bamfiles[0].references
    for i in TableIO.parse(fin, args.format):
        print i
        n = 0
        c_count = 0
        reads = dbi.query(i, args.method)
        for read in reads:
            compatible = Tools.compatible_with_transcript(read, i, references=references, strand=args.strand)
            print "HT:"
            for i0, r in enumerate(TableIO.parse(read.reads, "bam2bed12", references=references)):
                print "READ" + str(i0) + "\t", r
            print "COMPATIBLE:", compatible, "\n\n"
            if compatible:
                c_count += 1
            n += 1
        print "COMPATIBLE / ALL OVERLAP READS = ", c_count, "/", n
        if n > 0:
            print "RATIO\t%.4f" % (float(c_count) / n)  # parenthesized: '%' binds tighter than '/'
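# Note (illustrative, not from the original script): the string-format operator
# '%' binds tighter than '/', so  "RATIO\t%.4f" % float(c_count) / n  formats
# first and then tries to divide a *string* by n, raising a TypeError.
# Parenthesizing the division gives the intended result:
c_count, n = 3, 4
print "RATIO\t%.4f" % (float(c_count) / n)   # -> RATIO	0.7500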
def Main():
    global args, out
    CellLine = ["H1"]
    HM = ("input", "H3K27ac", "H3K27me3", "H3K36me3", "H3K4me1", "H3K4me3", "H3K9me3")
    marks = []
    dbi = {}
    args = ParseArg()
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >>sys.stderr, "can't open file", args.output, "to write. Using stdout instead"
            out = sys.stdout
    for cell in CellLine:
        for hm in HM:
            mark = cell + "_" + hm
            marks.append(mark)
            dbi[mark] = DBI.init("/data/zhuxp/bam2x/data/bamlist/" + mark + ".bamlist", "bamlist")
    for i, x in enumerate(TableIO.parse(args.input, args.input_format)):
        print >>out, "QR\t", x
        if i % 100 == 0:
            print >>sys.stderr, "query %d entries\r" % i,
        for mark in marks:
            print >>out, mark, "\t"
            for j in DBI.query(x, dbi[mark]):
                print >>out, "HT\t", j
def Main():
    ''' IO TEMPLATE '''
    global args, out
    args = ParseArg()
    fin = IO.fopen(args.input, "r")
    out = IO.fopen(args.output, "w")
    ''' END OF IO TEMPLATE '''
    print >>out, "# This data was generated by program ", sys.argv[0], " (version: %s)" % VERSION,
    print >>out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out, "# Date: ", time.asctime()
    print >>out, "# The command line is :"
    print >>out, "#\t", " ".join(sys.argv)
    dbi = []
    for i, bam in enumerate(args.bams):
        print >>out, "# SAMPLE_" + str(i + 1) + " BAM File:", bam
        dbi.append(DBI.init(bam, "bam"))
    print >>out, "#", VCF.header(),
    for i, bam in enumerate(args.bams):
        print >>out, "\t", "Sample_" + str(i + 1),
    print >>out, ""
    for i, vcf in enumerate(TableIO.parse(fin, "vcf")):
        vcf.chr = args.chr_prefix + vcf.chr
        if i % 100 == 0:
            print >>sys.stderr, "processing", i, "vcf\r",
        print >>out, vcf,
        for d in dbi:
            print >>out, "\t",
            for r in d.query(vcf):
                print >>out, format(r),
        print >>out, ""
def Main():
    args = ParseArg()
    if len(args.data) != len(args.name):
        print >>sys.stderr, "ERROR: Number of data is not the same as number of names!"
        sys.exit(0)
    # store data information
    data = {}
    total_reads = {}
    for i in range(len(args.data)):
        temp_name = args.name[i]
        print >>sys.stderr, "\n Reading data file:" + temp_name + "..."
        total_reads[temp_name] = 0
        if args.format[i] == "bam":
            total_reads[temp_name] = reduce(lambda x, y: x + y,
                                            [int(l.rstrip('\n').split('\t')[2]) for l in pysam.idxstats(args.data[i])])
        else:
            Format = "bed"
            for b in TableIO.parse(args.data[i], Format):
                total_reads[temp_name] += 1
                if total_reads[temp_name] % 50000 == 0:
                    print >>sys.stderr, "  reading %d reads..\r" % (total_reads[temp_name]),
        data[temp_name] = DBI.init(args.data[i], args.format[i])
    output = open(args.output, 'w')
    Input = open(args.input, 'r')
    lines = Input.read().split("\n")
    # header
    header = ["chr", "start", "end", "type", "name", "subtype", "count"] + data.keys()
    print >>output, "\t".join(g + "_%d" % (f) for f in [1, 2] for g in header) + "\tinteraction\tp-value"
    num = 0
    print >>sys.stderr, "Start process interactions:"
    for l in lines:
        if l.strip() == '':
            continue
        l = l.strip().split('\t')
        num = num + 1
        if l[0] == "chrM" or l[7] == "chrM":
            continue
        C1 = Bed([l[0], int(l[1]), int(l[2])])
        C2 = Bed([l[7], int(l[8]), int(l[9])])
        rpkm1 = "\t".join(str(f) for f in [RPKM(C1, data[n], total_reads[n], n) for n in data.keys()])
        rpkm2 = "\t".join(str(f) for f in [RPKM(C2, data[n], total_reads[n], n) for n in data.keys()])
        print >>output, "\t".join(str(f) for f in l[:7] + [rpkm1] + l[7:14] + [rpkm2, l[14], l[15]])
        if num % 1000 == 0:
            print >>sys.stderr, "  Output interaction: %d\r" % (num),
def Main():
    global args
    args = ParseArg()
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >>sys.stderr, "can't open file", args.output, "to write. Using stdout instead"
            out = sys.stdout
    dbi = DBI.init(args.db, "genebed")
    count = {}
    count["Intergenic"] = 0
    for x in TableIO.parse(args.input, args.input_format):
        flag = 0
        gene = ""
        for hit in dbi.query(x):
            flag = 1
            if hit.align_id == gene:
                continue
            gene = hit.align_id
            #print hit
            #print hit.cds_start,hit.cds_stop
            if (hit.cds_start == hit.cds_stop):
                if hit.align_id[0:3] == "Mir":
                    loc = "MiRNA"
                else:
                    loc = "Non-coding"
            elif hit.strand == "+":
                if x.stop <= hit.cds_start:
                    loc = "5-UTR"
                elif x.start >= hit.cds_stop:
                    loc = "3-UTR"
                else:
                    loc = judge_exon(x, hit)
            else:
                if x.stop <= hit.cds_start:
                    loc = "3-UTR"
                elif x.start >= hit.cds_stop:
                    loc = "5-UTR"
                else:
                    loc = judge_exon(x, hit)
            print >>out, "\t".join(str(f) for f in [x.chr, x.start, x.stop, x.id, x.score, x.strand, hit.align_id, loc])
            if count.has_key(loc):
                count[loc] += 1
            else:
                count[loc] = 1
        if flag == 0:
            print >>out, "\t".join(str(f) for f in [x.chr, x.start, x.stop, x.id, x.score, x.strand, "None", "Intergenic"])
            count["Intergenic"] += 1
    out2 = open(args.output.split(".")[0] + ".cisStat", "w")
    for key in sorted(count.keys()):
        print >>out2, key + "\t" + str(count[key])
def ReadHistones(fhist_name):
    sp1 = []
    sp2 = []
    with open(fhist_name, "r") as fhist:
        fhist.readline()
        while True:
            line = fhist.readline().strip()
            if "@" in line:
                break
            line = line.split()
            sp1.append(DBI.init(line[0], "bed"))
        while True:
            line = fhist.readline().strip()
            if line == "":
                break
            line = line.split()
            sp2.append(DBI.init(line[0], "bed"))
    return sp1, sp2
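# Illustrative input layout for ReadHistones (inferred from the parsing logic
# above, not taken from any original documentation): one header line, then one
# bed path per line for species 1, an '@' separator line, then bed paths for
# species 2, terminated by a blank line. File names here are hypothetical.
#
#   header line (skipped)
#   sp1_H3K4me3.bed
#   sp1_H3K27ac.bed
#   @
#   sp2_H3K4me3.bed
#   sp2_H3K27ac.bed
#   <blank line>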
def Main():
    args = ParseArg()
    # store bed files with indexing and count information:
    bed = {}
    print >>sys.stderr, "Starting index bed files:"
    for i in range(len(args.beds)):
        temp_name = args.name[i]
        print >>sys.stderr, "  #Indexing for bed file of", temp_name, "\r",
        bed[temp_name] = DBI.init(args.beds[i], 'bed')
    half_len = int(args.len)
    print >>sys.stderr
    print >>sys.stderr, "Reading nucleosome peak xls file from Danpos."
    nucleosomes = TableIO.parse(args.nucleosome, 'metabed', header=True)
    print >>sys.stderr, "Start Counting..."
    count_matrix = []
    out = open(args.output, "w")
    line_head = open(args.nucleosome, 'r').readline().strip()
    line_head = line_head + "\t" + "\t".join(str(f) for f in args.name)
    print >>out, line_head
    for i in nucleosomes:
        chrom = i.chr
        if chrom == 'chrY' or chrom == 'chrX' or chrom == 'chrM':
            continue
        center = int(i.start + i.end) / 2
        count = np.zeros(len(args.beds), dtype="float")
        line = str(i)
        for k, name in enumerate(bed.keys()):
            for j in bed[name].query(Bed([chrom, center - ma - (half_len - 75), center + ma + (half_len - 75)])):
                j_center = find_center(j, half_len)
                weight = max(min(1, (ma - abs(j_center - center)) / 25.0), 0)
                count[k] += weight
        line = line + "\t" + "\t".join(str(f) for f in count)
        print >>out, line
        count_matrix.append(count)
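# A standalone sketch of the weighting scheme used above ('ma' is assumed to be
# the module-level half-window constant of the original script; 100 here is a
# hypothetical value): reads whose centers fall within (ma - 25) bp of the
# nucleosome center get full weight 1, decaying linearly to 0 over 25 bp.
def triangular_weight(read_center, nuc_center, ma=100):
    return max(min(1, (ma - abs(read_center - nuc_center)) / 25.0), 0)

assert triangular_weight(50, 50) == 1.0
assert triangular_weight(150, 50) == 0.0   # 100 bp away: zero weight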
def Main():
    global args, out
    args = ParseArg()
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >>sys.stderr, "can't open file", args.output, "to write. Using stdout instead"
            out = sys.stdout
    dbi = DBI.init(args.db, args.dbformat)
    for x in TableIO.parse(args.input, args.input_format):
        print "QR\t", x
        for j in DBI.query(x, dbi):
            print "HT\t", j
        x.chr = x.chr.replace("chr", "")
        for j in DBI.query(x, dbi):
            print "HT\t", j
def Main():
    global args, out
    args = ParseArg()
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >>sys.stderr, "can't open file", args.output, "to write. Using stdout instead"
            out = sys.stdout
    argv = sys.argv
    argv[0] = argv[0].split("/")[-1]
    print >>out, "# This data was generated by program ", argv[0], "(version %s)" % VERSION,
    print >>out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out, "# Date: ", time.asctime()
    print >>out, "# The command line is :\n#\t", " ".join(argv)
    dbi = DBI.init(args.db, args.dbformat)
    hits = 0
    query = 0
    if args.input == "stdin":
        input = sys.stdin
    else:
        input = args.input
    query_length = 0
    hits_number = 0
    query_sets = []
    for x in TableIO.parse(input, args.input_format):
        query_sets.append(x)
    length = len(query_sets)
    size = length / args.thread
    #results=[[] for i in range(args.thread)]
    for i in range(args.thread):
        end = (i + 1) * size if (i + 1) * size < length else length
        end = end if (i + 1) != args.thread else length
        querys = query_sets[i * size:end]
        # print i*size,end
        # for j in querys: print j
        thread.start_new_thread(fquery, (querys, dbi, i))
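# A hedged alternative (not in the original): thread.start_new_thread gives no
# way to wait for workers, so the process can exit before all queries finish.
# threading.Thread plus join() keeps the same fan-out but blocks until done;
# fquery is the worker function from the original script.
import threading

def run_chunks(query_sets, dbi, nthread):
    size = max(1, len(query_sets) // nthread)
    workers = []
    for i in range(nthread):
        chunk = query_sets[i * size:(i + 1) * size if i + 1 < nthread else len(query_sets)]
        t = threading.Thread(target=fquery, args=(chunk, dbi, i))
        t.start()
        workers.append(t)
    for t in workers:
        t.join()  # wait for every worker before returning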
def Main():
    args = ParseArg()
    anno = DBI.init(args.annotation, "bed")
    ext_dis = args.ext_dis
    target_num = args.target_num
    with open(args.input, "r") as fin, open(args.output, "w") as fout:
        for line in fin:
            bed_region = Bed(line.strip().split())
            mid_point = (bed_region.start + bed_region.stop) / 2
            ori_start = bed_region.start
            ori_stop = bed_region.stop
            bed_region.start = mid_point - ext_dis
            bed_region.stop = mid_point + ext_dis
            gene_list = findNearbyGene(bed_region, anno, ori_start, ori_stop, target_num)
            for gene in gene_list:
                print >>fout, "\t".join([bed_region.id, gene[1], str(gene[0])])
def Main():
    global args, out
    args = ParseArg()
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >>sys.stderr, "can't open file", args.output, "to write. Using stdout instead"
            out = sys.stdout
    argv = sys.argv
    argv[0] = argv[0].split("/")[-1]
    print >>out, "# This data was generated by program ", argv[0], "(version %s)" % VERSION,
    print >>out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out, "# Date: ", time.asctime()
    print >>out, "# The command line is :\n#\t", " ".join(argv)
    dbi = DBI.init(args.bam, "bam")
    if args.input == "stdin":
        input = sys.stdin
    else:
        input = args.input
    for x in TableIO.parse(input, args.input_format):
        promoter = x.core_promoter(1000, 1000)
        print >>out, x
        print >>out, promoter
        retv = []
        for (i, r) in enumerate(dbi.query(promoter)):
            retv.append(sum(r))
        if x.strand == "-":
            retv = retv[::-1]
        for i in retv:
            print >>out, i,
        print >>out, ""
def Main():
    t1 = time()
    args = ParseArg()
    inp = open(args.input, 'r')
    min_clusterS = args.min_clusterS
    min_interaction = args.min_interaction
    p_value = args.p_value
    output = open(args.output, 'w')
    outputIntra = open(args.output_intra, 'w')
    hasAnnotation = False
    if args.annotation:
        dbi = DBI.init(args.annotation, "bed")
        hasAnnotation = True
    else:
        dbi = False
    if args.annotation_repeat:
        dbirepeat = DBI.init(args.annotation_repeat, "bed")
        hasAnnotationRepeat = True
    else:
        dbirepeat = False
        hasAnnotationRepeat = False  # was left undefined in the original when no repeat annotation is given
    # store count of RNA for part1 and part2
    part = {}
    k = 0
    sgcount = 0  # single fragment count
    print >>sys.stderr, "# Inputing data..."
    interaction = {}  # store number of interactions for different RNA
    selfinteraction = {}
    #Types = ["snoRNA","protein_coding","snRNA","lincRNA","tRNA","misc_RNA","pseudogene","miRNA","antisense","sense_intronic","non_coding","processed_transcript","sense_overlapping","rRNA_repeat","rRNA"]
    for line in inp.read().split('\n'):
        if line == '':
            continue
        line = line.strip().split('\t')
        p1 = annotated_bed_proper(line[0:10], id=k, cluster=1)
        p2 = annotated_bed_proper(line[11:], id=k, cluster=1)
        if isinstance(p1.start, list):
            p1.start = int(p1.start[0])
            p1.end = int(p1.end[-1])
        if isinstance(p2.start, list):
            p2.start = int(p2.start[0])
            p2.end = int(p2.end[-1])
        if SingleFragment(p1, p2):
            sgcount += 1
            continue
        k += 1
        #if p1.subtype=="intron" or p2.subtype=="intron": continue
        #if p1.type in Types:
        try:
            p1_name = GetAnnotationName(p1, hasAnnotation, dbi, hasAnnotationRepeat, dbirepeat)
            if p1_name not in part:
                part[p1_name] = 1
            else:
                part[p1_name] += 1
            #if p2.type in Types:
            p2_name = GetAnnotationName(p2, hasAnnotation, dbi, hasAnnotationRepeat, dbirepeat)
            if not p1_name == p2_name:  # count once for self-interaction
                if p2_name not in part:
                    part[p2_name] = 1
                else:
                    part[p2_name] += 1
            #if p1.type in Types and p2.type in Types:
            if p1_name == p2_name:
                if p1_name not in selfinteraction:
                    selfinteraction[p1_name] = copy.deepcopy(p1)
                else:
                    selfinteraction[p1_name].Update(p1.start, p1.end)
                    selfinteraction[p1_name].Update(p2.start, p2.end)
                    selfinteraction[p1_name].cluster += 1
            else:
                if p1_name > p2_name:  # order the pair by name so A--B and B--A collapse
                    p1, p2 = p2, p1
                    p1_name, p2_name = p2_name, p1_name
                inter_name = p1_name + "--" + p2_name
                if inter_name not in interaction:
                    interaction[inter_name] = [copy.deepcopy(p1), copy.deepcopy(p2)]
                else:
                    interaction[inter_name][0].Update(p1.start, p1.end)
                    interaction[inter_name][1].Update(p2.start, p2.end)
                    interaction[inter_name][0].cluster += 1
        except Exception as e:
            print >>sys.stderr, e
        if k % 20000 == 0:
            print >>sys.stderr, "  Reading %d pairs of segments\r" % (k),
    print >>sys.stdout, "Get total %d pairs." % (k)
    print >>sys.stdout, "Single fragment count: %d." % (sgcount)
    print >>sys.stdout, "  number of different RNAs is %d " % (len(part))
    total = k  # total pairs used
    n = 0
    k = 0  # record number of strong interactions
    for i in interaction:
        n += 1
        count = interaction[i][0].cluster
        if count < min_interaction:
            continue
        p1_name = i.split("--")[0]
        p2_name = i.split("--")[1]
        P1 = interaction[i][0]
        P2 = interaction[i][1]
        P1.cluster = part[p1_name]
        P2.cluster = part[p2_name]
        if part[p1_name] < min_clusterS or part[p2_name] < min_clusterS:
            continue
        real_p = 1 - hypergeom.cdf(count, total, part[p1_name], part[p2_name])
        if real_p <= p_value:
            k = k + 1
            try:
                log_p = math.log(real_p)
            except:
                log_p = -float("Inf")
            print >>output, str(P1) + '\t' + str(P2) + '\t%d\t%.4f' % (count, log_p)
        if n % 500 == 0:
            print >>sys.stderr, "  Progress ( %d / %d )\r" % (n, len(interaction)),
    k1 = 0
    for i in selfinteraction:
        n += 1
        count = selfinteraction[i].cluster
        if count < min_interaction:
            continue
        p1_name = i
        P1 = selfinteraction[i]
        P1.cluster = part[p1_name]
        if part[p1_name] < min_clusterS:
            continue
        k1 = k1 + 1
        print >>outputIntra, str(P1) + '\t%d' % (count)
        if n % 500 == 0:
            print >>sys.stderr, "  Progress ( %d / %d )\r" % (n, len(interaction)),
    print >>sys.stdout, "# Find %d strong and %d self interactions. Cost time: %.2f s" % (k, k1, time() - t1)
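# Worked example (illustrative numbers) of the enrichment test used above:
# scipy's hypergeom.cdf(k, M, n, N) is the probability of seeing <= k
# co-occurrences when drawing N pairs out of M total, n of which involve the
# first RNA, so 1 - cdf is the chance of at least this many by luck.
from scipy.stats import hypergeom

count, total = 30, 100000          # observed pairs linking RNA A and RNA B; total pairs
nA, nB = 400, 500                  # pairs involving A; pairs involving B
real_p = 1 - hypergeom.cdf(count, total, nA, nB)
print "p-value: %.3g" % real_p     # small p => more A--B pairs than chance predicts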
def genome_annotation(outputbam, annotationfile, detail, readfilename, unmapfilename,
                      strandenforced=False, posstrand=True, requireUnique=False,
                      results_dict=dict()):
    # annotationfile is the annotation file; detail is the db_detail file
    if annotationfile:
        dbi1 = DBI.init(annotationfile, "bed")
        dbi2 = DBI.init(detail, "bed")
        dbi3 = DBI.init("/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt", "bed")
    newdict = dict()
    funmap = open(unmapfilename, 'w')
    for record in outputbam:
        # print >> sys.stderr, record.qname
        IsMapped = False
        if Included(record, requireUnique):
            strandactual = ("+" if posstrand else "-")
            strand = "+"
            if record.is_reverse:
                strandactual = ("-" if posstrand else "+")
                strand = "-"
            if annotationfile:
                bed = Bed([outputbam.getrname(record.tid), record.pos, record.aend, '.', 0.0, strandactual])
                [typ, name, subtype, strandcol] = annotation(bed, dbi1, dbi2, dbi3)
                if (not strandenforced) or strandcol == 'ProperStrand':
                    curr_anno_arr = (str(f) for f in [outputbam.getrname(record.tid), record.pos, record.aend,
                                                      strand, record.seq, 'genome', typ, name, subtype, strandcol])
                    if not record.qname in newdict:
                        newdict[record.qname] = '\t'.join(curr_anno_arr)
                        if not Included(record, True):  # not unique
                            newdict[record.qname] = [newdict[record.qname]]
                    else:
                        if type(newdict[record.qname]) is str:
                            newdict[record.qname] = [newdict[record.qname]]
                        newdict[record.qname].append('\t'.join(curr_anno_arr))
                    IsMapped = True
            else:
                strandcol = '.'
                curr_anno_arr = (str(f) for f in [outputbam.getrname(record.tid), record.aend - record.alen + 1,
                                                  record.aend, strand, record.seq, 'genome', strandcol])
                if not record.qname in newdict:
                    newdict[record.qname] = '\t'.join(curr_anno_arr)
                    if not Included(record, True):  # not unique
                        newdict[record.qname] = [newdict[record.qname]]
                else:
                    if type(newdict[record.qname]) is str:
                        newdict[record.qname] = [newdict[record.qname]]
                    newdict[record.qname].append('\t'.join(curr_anno_arr))
                IsMapped = True
        if not IsMapped:
            # output all pairs that cannot be mapped on both sides as unmapped pairs into a fasta file
            seq = record.seq
            if record.is_reverse:
                seq = revcomp(record.seq, rev_table)
            unmap_rec = SeqRecord(Seq(seq, IUPAC.unambiguous_dna), id=record.qname, description='')
            SeqIO.write(unmap_rec, funmap, "fasta")
    funmap.close()
    newanno = dict(results_dict.items() + newdict.items())
    return newanno
def Main():
    global args, out
    args = ParseArg()
    dict = {}
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >>sys.stderr, "can't open file", args.output, "to write. Using stdout instead"
            out = sys.stdout
    argv = sys.argv
    argv[0] = argv[0].split("/")[-1]
    print >>out, "# This data was generated by program ", argv[0], "(version %s)" % VERSION,
    print >>out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out, "# Date: ", time.asctime()
    print >>out, "# The command line is :\n#\t", " ".join(argv)
    if args.query_method:
        dict["method"] = args.query_method
    dbi = DBI.init(args.db, args.dbformat)
    hits = 0
    query = 0
    if args.input == "stdin":
        input = sys.stdin
    else:
        input = args.input
    query_length = 0
    hits_number = 0
    for (i0, x) in enumerate(TableIO.parse(input, args.input_format)):
        if i0 % 10 == 0:
            print >>sys.stderr, "query ", i0, " entries\r",
        print >>out, "QR\t", x
        hit = 0
        query += 1
        query_length += len(x)
        results = dbi.query(x, **dict)
        compatible = 0
        #print >>sys.stderr,type(results)
        if isinstance(results, numpy.ndarray) or isinstance(results, list):
            if not args.silence:
                print >>out, "HT\t",
                for value in results:
                    print >>out, str(value) + ",",
                print >>out, ""
            hit = 1
            hits_number += 1
        elif isinstance(results, str):
            if not args.silence:
                print >>out, "HT\t",
                print >>out, results
            hit = 1
            hits_number += 1
        else:
            this_query_hits = 0
            for j in results:
                if not args.silence:
                    print >>out, "HT\t", j,
                hit = 1
                hits_number += 1
                this_query_hits += 1
                if isinstance(j, xplib.Annotation.Bed12) and isinstance(x, xplib.Annotation.Bed12):
                    compatible_binary = Tools.compatible_with_transcript(j, x)
                    if not args.silence:
                        print >>out, "\tCompatible:", compatible_binary
                    if compatible_binary:
                        compatible += 1
                else:
                    if not args.silence:
                        print >>out, ""
            print >>out, "HN\t", this_query_hits
            if compatible > 0:
                print >>out, "CP\t", compatible
        if args.dbformat == "tabix":
            x.chr = x.chr.replace("chr", "")
            for j in dbi.query(x, **dict):
                print >>out, "HT\t", j
                hit = 1
                hits_number += 1
        hits += hit
    print >>out, "# Query Number:", query, "\n# Query Have Hits:", hits
    print >>out, "# Query Length:", query_length
    print >>out, "# Hits Number:", hits_number
    s: start column number for second part of interaction
    '''
    a = open(File, 'r')
    for l in a.read().split('\n'):
        if l.strip() == "":
            continue
        lsep = l.split('\t')
        if lsep[3] in ['+', '-']:
            bed1 = Bed(lsep[0:3], strand=lsep[3])
            bed2 = Bed(lsep[s:(s + 3)], strand=lsep[s + 3])
        else:
            bed1 = Bed(lsep[0:3])
            bed2 = Bed(lsep[s:(s + 3)])
        yield (bed1, bed2, lsep)

# annotation files
db = "/home/yu68/bharat-interaction/new_lincRNA_data/all_RNAs-rRNA_repeat.txt"
db_detail = "/home/yu68/bharat-interaction/new_lincRNA_data/Ensembl_mm9.genebed"
db_repeat = "/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt"
print >>sys.stderr, "Indexing annotation files..."
ref_allRNA = DBI.init(db, "bed")         # the DBI init file for bed6 file of all kinds of RNA
ref_detail = DBI.init(db_detail, "bed")  # the DBI init file for bed12 file of lincRNA and mRNA with intron, exon, UTR
ref_repeat = DBI.init(db_repeat, "bed")
print >>sys.stderr, "Start to update..."
for l in read_interaction(sys.argv[1], 7):
    l[2][3:6] = annotation(l[0], ref_allRNA, ref_detail, ref_repeat)
    l[2][10:13] = annotation(l[1], ref_allRNA, ref_detail, ref_repeat)
    print "\t".join(l[2])
def Main():
    args = ParseArg()
    # store bam files with indexing and count information:
    bam = {}
    print >>sys.stderr, "Starting index bed files:"
    for i in range(len(args.bams)):
        temp_name = args.name[i]
        print >>sys.stderr, "  #Indexing for bed file of", temp_name, "\r",
        bam[temp_name] = DBI.init(args.bams[i], args.fmt)
    print >>sys.stderr
    print >>sys.stderr, "Reading nucleosome peak xls file from Danpos."
    nucleosomes = TableIO.parse(args.nucleosome, 'metabed', header=True)
    print >>sys.stderr, "Initial output files..."
    out = open(args.output, "w")
    # -- for verbose ---
    if args.verbose:
        out_mark = []
        for n in args.name:
            out_mark.append(open(n + '_shift_nucleosomes.bed', 'w'))
    # ------------------
    line_head = open(args.nucleosome, 'r').readline().strip()
    line_head = line_head + "\t" + "\t".join(str(f) for f in args.name) + '\t' + "\t".join(str(f) + '_off' for f in args.name)
    print >>out, line_head
    print >>sys.stderr, "Start Counting..."
    num = 0
    t0 = time()
    for i in nucleosomes:
        chrom = i.chr
        if i.smt_pval > 0.01 or i.fuzziness_pval > 0.01:
            continue  # only choose nucleosomes with high value and low fuzziness
        if chrom == 'chrY' or chrom == 'chrX' or chrom == 'chrM':
            continue
        num = num + 1
        center = int(i.start + i.end) / 2
        count = np.zeros(len(args.bams), dtype="float")
        offset = np.zeros(len(args.bams), dtype='int')
        line = str(i)
        for k, name in enumerate(bam.keys()):
            if args.fmt == 'bam':
                query = bam[name].query(Bed([chrom, center - ma - (half_len - 75) - rangeS,
                                             center + ma + (half_len - 75) + rangeS]), method='fetch')
            else:
                query = bam[name].query(Bed([chrom, center - ma - (half_len - 75) - rangeS,
                                             center + ma + (half_len - 75) + rangeS]))
            read_centers = []
            for j in query:
                read_centers.append(find_center(j, args.fmt))
            [o, c] = getCount(read_centers, center)
            count[k] = c
            offset[k] = o
            # -- for verbose ---
            if args.verbose:
                print >>out_mark[k], chrom + '\t%d\t%d' % (i.start + o, i.end + o)
            # ------------------
        line = line + "\t" + "\t".join(str(f) for f in count) + '\t' + "\t".join(str(f) for f in offset)
        if num % 20000 == 0:
            t1 = time()
            print >>sys.stderr, "processing %dth nucleosome..., time: %.2fs." % (num, t1 - t0), '\r',
            t0 = time()
        print >>out, line
    print
    out.close()
    # -- for verbose ---
    if args.verbose:
        for k in out_mark:
            k.close()
def Main(): """ IO TEMPLATE """ global args, out args = ParseArg() fin = IO.fopen(args.input, "r") out = IO.fopen(args.output, "w") """ END OF IO TEMPLATE """ print >> out, "# This data was generated by program ", sys.argv[0], " (version: %s)" % VERSION, print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )" print >> out, "# Date: ", time.asctime() print >> out, "# The command line is :" print >> out, "#\t", " ".join(sys.argv) hSites = {} donorSites = {} acceptorSites = {} if args.genome is not None: genome = DBI.init(args.genome, "genome") else: genome = None j = 0 for j, i in enumerate(TableIO.parse(fin, "bam2bed12", references=fin.references, strand=args.strand)): # print >>out,i if j % 1000 == 0: print >>sys.stderr, "processing ", j, "reads \r", for intron in i.Introns(): if len(intron) < args.intron_min_length: continue donor = intron.head() # print >>sys.stderr,intron # print >>sys.stderr,donor donorID = bedToID(donor) if donorSites.has_key(donorID): donorSites[donorID] += 1 else: donorSites[donorID] = 1 acceptor = intron.tail() acceptorID = bedToID(acceptor) if acceptorSites.has_key(acceptorID): acceptorSites[acceptorID] += 1 else: acceptorSites[acceptorID] = 1 """ if genome is not None: s=genome.query(intron.head()).upper()+".."+genome.query(intron.tail()).upper() if hSites.has_key(s): hSites[s]+=1 else: hSites[s]=1 """ donors = [] for key in donorSites.keys(): a = key.split("\t") donors.append(Bed([a[0], a[1], a[2], "noname_donor", donorSites[key], a[3]])) donors.sort() for i, x in enumerate(donors): x.id = "donor_" + str(i) print >> out, x, "\t", genome.query(x).upper() acceptors = [] for key in acceptorSites.keys(): a = key.split("\t") acceptors.append(Bed([a[0], a[1], a[2], "noname_acceptor", acceptorSites[key], a[3]])) acceptors.sort() for i, x in enumerate(acceptors): x.id = "acceptor_" + str(i) print >> out, x, "\t", genome.query(x).upper()
def genome_annotation(outputbam, annotationfile, detail, annotationRepeat, mapq_thred,
                      strandenforced=False, posstrand=True, requireUnique=False,
                      results_dict=dict()):
    # annotationfile is the annotation file; detail is the db_detail file
    if annotationfile:
        dbi1 = DBI.init(annotationfile, "bed")
        dbi2 = DBI.init(detail, "bed")
        dbi3 = DBI.init(annotationRepeat, "bed")
    newdict = dict()
    # funmap = open(unmapfilename, 'w')
    for record in outputbam:
        # print >> sys.stderr, record.qname
        if "N" not in record.cigarstring:
            anno_start = record.pos
            anno_end = record.aend
            bed_start = record.pos
            bed_end = record.aend
        else:
            bed_list, anno_start, anno_end = Exon_junction(record)
            bed_start = ",".join([str(f[0]) for f in bed_list])
            bed_end = ",".join([str(f[1]) for f in bed_list])
        # print anno_start,anno_end,bed_start,bed_end
        IsMapped = False
        if Included(record, requireUnique, mapq_thred):
            strandactual = ("+" if posstrand else "-")
            strand = "+"
            if record.is_reverse:
                strandactual = ("-" if posstrand else "+")
                strand = "-"
            if annotationfile:
                bed = Bed([outputbam.getrname(record.tid), anno_start, anno_end, '.', 0.0, strandactual])
                [typ, name, subtype, strandcol] = annotation(bed, dbi1, dbi2, dbi3)
                if (not strandenforced) or strandcol == 'ProperStrand':
                    curr_anno_arr = (str(f) for f in [outputbam.getrname(record.tid), bed_start, bed_end,
                                                      strand, record.seq, 'genome', typ, name, subtype, strandcol])
                    if not record.qname in newdict:
                        newdict[record.qname] = '\t'.join(curr_anno_arr)
                        if not Included(record, True, mapq_thred):  # not unique
                            newdict[record.qname] = [newdict[record.qname]]
                    else:
                        if type(newdict[record.qname]) is str:
                            newdict[record.qname] = [newdict[record.qname]]
                        newdict[record.qname].append('\t'.join(curr_anno_arr))
                    IsMapped = True
            else:
                strandcol = '.'
                curr_anno_arr = (str(f) for f in [outputbam.getrname(record.tid), record.aend - record.alen + 1,
                                                  record.aend, strand, record.seq, 'genome', strandcol])
                if not record.qname in newdict:
                    newdict[record.qname] = '\t'.join(curr_anno_arr)
                    if not Included(record, True, mapq_thred):  # not unique
                        newdict[record.qname] = [newdict[record.qname]]
                else:
                    if type(newdict[record.qname]) is str:
                        newdict[record.qname] = [newdict[record.qname]]
                    newdict[record.qname].append('\t'.join(curr_anno_arr))
                IsMapped = True
    newanno = dict(results_dict.items() + newdict.items())
    return newanno
def Main():
    ''' IO TEMPLATE '''
    global args, out, isoforms_set, selected_isoforms_set, reads_set, selected_reads_set, dbi
    args = ParseArg()
    if args.output == "stdout":
        out = sys.stdout
    else:
        try:
            out = open(args.output, "w")
        except IOError:
            print >>sys.stderr, "can't open file", args.output, "to write. Using stdout instead"
            out = sys.stdout
    if args.input == "stdin":
        fin = sys.stdin
    else:
        try:
            x = args.input.split(".")
            if x[-1] == "gz":
                fin = gzip.open(args.input, "r")
            else:
                fin = open(args.input, "r")
        except IOError:
            print >>sys.stderr, "can't read file", args.input
            fin = sys.stdin
    ''' END OF IO TEMPLATE '''
    print >>out, "# This data was generated by program ", sys.argv[0], " (version: %s)" % VERSION,
    print >>out, "in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out, "# Date: ", time.asctime()
    print >>out, "# The command line is :"
    print >>out, "#\t", " ".join(sys.argv)
    dbi = DBI.init(args.bam, args.format)
    ''' reading all the isoforms '''
    isoforms = []
    iterator = TableIO.parse(fin, "bed")
    beds = []
    for i in iterator:
        beds.append(i)
    beds.sort()
    if len(beds) == 0:
        print >>sys.stderr, "error in reading file", args.input
        exit(1)
    bed = beds[0]
    chr = bed.chr
    min_start = bed.start
    max_stop = bed.stop
    j = 0
    for i in beds:
        j += 1
        if (j % 10 == 0):
            print >>sys.stderr, "processed %d entries\r" % j,
        if Tools.overlap(bed, i):
            if bed.stop < i.stop:
                bed.stop = i.stop
            isoforms.append(i)
        else:
            compare(isoforms)
            isoforms = [i]
            bed = i
    if len(isoforms) > 0:
        compare(isoforms)
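# A minimal, standalone sketch of the sweep used above: after sorting, keep a
# growing "current window" and flush a group whenever the next interval no
# longer overlaps it. (Pure illustration; the original delegates overlap tests
# to Tools.overlap and group processing to compare().)
def group_overlapping(intervals):
    """intervals: sorted list of (start, stop); yields lists of overlapping ones."""
    group, window_stop = [], None
    for start, stop in intervals:
        if window_stop is None or start < window_stop:  # overlaps current window
            group.append((start, stop))
            window_stop = stop if window_stop is None else max(window_stop, stop)
        else:
            yield group
            group, window_stop = [(start, stop)], stop
    if group:
        yield group

# list(group_overlapping([(0, 10), (5, 20), (30, 40)])) -> [[(0,10),(5,20)], [(30,40)]]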
def Main():
    args = ParseArg()
    hist_n = args.hist_n
    clu_n = args.clu_n
    File = args.input
    # read emission matrix and store in Rpy2
    print "#Reading emission matrix from"
    emission = args.emission
    print '\t' + emission
    robjects.r("emission=read.table('" + emission + "',header=T,sep='\t')")
    robjects.r("emission=emission[c(12,11,13,8,7,10,6,9,4,5,2,1,3,15,14),match(c('H3K4me3','H3K4me2','H3K4me1','H3K27me3','H3K36me3','H3K27ac','H2AZ'),colnames(emission))]")
    state_n = robjects.r("dim(emission)[1]")[0]  # number of chromatin states
    color_state = ['red', 'pink', 'purple', 'DarkOrange', 'Orange', 'Gold', 'yellow', 'DeepSkyBlue',
                   'ForestGreen', 'Green', 'Lime', 'GreenYellow', 'LightCyan', 'white', 'white']
    # Find overall distribution of all chromatin states
    print "Counting distribution of chromatin states..."
    chromHMM_segment = TableIO.parse(args.segment, 'bed')
    # count represents the overall probability distribution of all chromatin states
    count = np.zeros(state_n)
    num = 0
    for segment in chromHMM_segment:
        num = num + 1
        i = int(segment.id[1:])
        count[i - 1] += (segment.stop - segment.start) / 200
        print 'Reading %d segments... [for distribution of chromatin states]' % (num), '\r',
    print
    ## read and index histone pattern data for single nucleosomes in all populations
    print "Indexing histone pattern data for single nucleosomes in all populations..."
    data = TableIO.parse(File, 'metabed', header=True)
    ## generate bed file for chromatin states in nucleosomes to be uploaded to the UCSC genome browser
    if args.bed:
        name = os.path.basename(File).split('.')[0]
        outbed = open(name + "_State_browser.bed", 'w')
        print "## Start generate BED9 file for uploading..."
        print >>outbed, 'track name="ChromatinState" description="' + name + '" visibility=2 itemRgb="On"'
        #print >>outbed,'chr\tstart\tend\t'+'\t'.join('P_%d'%(s+1) for s in range(clu_n))
        for n, i in enumerate(data):
            # matrix of histone patterns, row: histone, column: population
            matrix = np.array(str(i).split('\t')[8:(8 + hist_n * clu_n)], dtype="int").reshape(hist_n, clu_n, order="F")
            if n % 50000 == 0:
                print "\tWriting %dth nucleosomes into BED9 file,\r" % (n),
            line = '\t'.join(str(f) for f in [i.chr, i.start, i.stop])
            for k in range(clu_n):
                state = histone2state(matrix.T[k], count)
                color_code = ','.join(str(int(f)) for f in
                                      np.array(matplotlib.colors.colorConverter.to_rgb(color_state[state - 1])) * 255)
                print >>outbed, '\t'.join(str(f) for f in [i.chr, i.start, i.stop, 'P_%d_%d' % (k + 1, state),
                                                           0, '.', i.start, i.stop, color_code])
                line = line + '\t%d' % (state)
            #print >>outbed,line
        outbed.close()
        sys.exit(1)
    # read region information
    region = args.region
    chro = region.split(":")[0]
    start = int(region.split(":")[1].split("-")[0])
    end = int(region.split(":")[1].split("-")[1])
    print "#Query region:[" + chro + ": %d-%d]" % (start, end)
    y_nucle = 0.47  # y location of the nucleosome line
    ## query data in region
    dbi = binindex(data)
    query = dbi.query(Bed([chro, start, end]))
    ## initialize figure
    fig = plt.figure(figsize=(10, 6))
    ax = plt.subplot(111, frameon=False, yticks=[])
    ax.set_xlim(start - (end - start) / 6, end)
    n = 0
    print "##Start draw nucleosomes:"
    #################################################
    ## draw genes from y = y_nucle + 0.04*(clu_n+1)
    #### index the gene.tab file
    print " ## drawing gene track ..."
    print " ## Indexing gene.tab ..."
    gene_dbi = DBI.init(args.genetab, 'genebed')
    print " ## query regions from gene.tab"
    query_gene = gene_dbi.query(Bed([chro, start, end]))
    #### determine height of gene track
    bottoms = [0 for i in range(100)]
    max_index = 0
    for i in query_gene:
        index = 0
        while (1):
            if i.start > bottoms[index]:
                bottoms[index] = i.stop
                if max_index < index:
                    max_index = index
                break
            else:
                index += 1
    gene_track_number = max_index + 1
    gene_track_height = 0.03 * gene_track_number + 0.02
    ax.set_ylim(0.05, 1 + gene_track_height + 0.01)
    print " ## start draw gene track"
    # add frame for gene track
    rect = matplotlib.patches.Rectangle((start, y_nucle + 0.04), end - start, gene_track_height,
                                        edgecolor='black', fill=False)
    ax.add_patch(rect)
    bottoms = [0 for i in range(100)]
    for i in gene_dbi.query(Bed([chro, start, end])):
        index = 0
        while (1):
            if i.start > bottoms[index]:
                addGeneToFig(i, ax, start, end, 1, 0.03 * index + y_nucle + 0.05)
                bottoms[index] = i.stop
                break
            index += 1
    #################################################
    top_heatmap_y = 0.71 + gene_track_height  # the y axis value for bottom of top heatmaps
    print "## Draw nucleosome tracks..."
    for i in query:
        n = n + 1
        print "  Nucleosome %d\t at " % (n) + chro + ": %d-%d" % (i.start, i.stop)
        # matrix of histone patterns, row: histone, column: population
        matrix = np.array(str(i).split('\t')[8:(8 + hist_n * clu_n)], dtype="int").reshape(hist_n, clu_n, order="F")
        prob = np.array(str(i).split('\t')[(8 + hist_n * clu_n):], dtype=float)
        ax.plot([i.smt_pos, i.smt_pos], [y_nucle + 0.03, y_nucle], color='r')  # red nucleosome midpoint
        rect = matplotlib.patches.Rectangle((i.start, y_nucle), i.stop - i.start, 0.03, color='#EB70AA')  # pink nucleosome region
        ax.add_patch(rect)
        for j in range(clu_n):
            state = histone2state(matrix.T[j], count)
            state_rect = matplotlib.patches.Rectangle((i.start, y_nucle + 0.04 * (j + 1) + gene_track_height + 0.01),
                                                      i.stop - i.start, 0.03, color=color_state[state - 1])
            ax.add_patch(state_rect)
        im = OffsetImage(matrix, interpolation='nearest', zoom=10 / (1 + gene_track_height + 0.01),
                         cmap=plt.cm.binary, alpha=0.5)
        if n <= 9:
            xybox = ((n + 0.5) / 10.0, top_heatmap_y)
            xy = [i.smt_pos, y_nucle + 0.04 * clu_n + 0.03 + gene_track_height + 0.01]
            xytext = ((n + 0.7) / 10.0, top_heatmap_y)
            c_style = "bar,angle=180,fraction=-0.1"
        elif n <= 18:
            xybox = ((n - 9 + 0.5) / 10.0, 0.2)
            xy = [i.smt_pos, y_nucle]
            xytext = ((n - 9 + 0.7) / 10.0, 0.40)
            c_style = "bar,angle=180,fraction=-0.1"
        else:
            print "WARN: nucleosome number larger than 18 in this region, only plot the pattern for first 18 nucleosomes"
            break
        ab = AnnotationBbox(im, xy, xybox=xybox, xycoords='data', boxcoords=("axes fraction", "data"),
                            box_alignment=(0., 0.), pad=0.1)
        ax.annotate("", xy, xytext=xytext, xycoords='data', textcoords=("axes fraction", "data"),
                    arrowprops=dict(arrowstyle="->", connectionstyle=c_style))  # arrowprops=None
        ax.add_artist(ab)
        # add mark for histone marks and regions with low confidence
        for i in range(hist_n):
            if prob[i] < 0.6:
                xy_star = tuple(map(sum, zip(xybox, (0.065, 0.03 * (hist_n - 1 - i) - 0.01))))
                ax.annotate("*", xy=xy_star, xycoords=("axes fraction", "data"), color='red')
    ax.annotate('Nucleosome:', xy=(start - (end - start) / 6, y_nucle), xycoords='data', size=12)
    ax.annotate('Epigenetic Pattern:', xy=(start - (end - start) / 6, 0.23 + top_heatmap_y), xycoords='data', size=12)
    ax.annotate(chro, xy=(start - (end - start) / 6, 0.1), xycoords='data', size=12)
    name = open(File).readline().split('\t')[8:(8 + hist_n)]
    for n, i in enumerate(name):
        ax.annotate(i.split("_")[0], xy=(start - (end - start) / 8, top_heatmap_y + 0.03 * (hist_n - 1 - n)), xycoords='data', size=10)
        ax.annotate(i.split("_")[0], xy=(start - (end - start) / 8, 0.2 + 0.03 * (hist_n - 1 - n)), xycoords='data', size=10)
    # frame for nucleosome and chromatin state tracks
    rect = matplotlib.patches.Rectangle((start, y_nucle), end - start, 0.03, edgecolor='black', fill=False)
    ax.add_patch(rect)
    for k in range(clu_n):
        rect = matplotlib.patches.Rectangle((start, y_nucle + 0.04 * (k + 1) + gene_track_height + 0.01),
                                            end - start, 0.03, edgecolor='grey', fill=False)
        ax.add_patch(rect)
        ax.annotate('Population%d' % (k + 1), xy=(start - (end - start) / 6, y_nucle + 0.04 * (k + 1) + gene_track_height + 0.01),
                    xycoords='data', size=12)
    # chromatin state legend
    for s in range(state_n):
        dist = (end - start) * 1.0 / state_n
        length = dist * 0.75
        rect = matplotlib.patches.Rectangle((start + dist * s, 0.1), length, 0.03, color=color_state[s])
        ax.add_patch(rect)
        ax.annotate(s + 1, xy=(start + dist * s + length / 3, 0.075), xycoords='data', size=10)
    ax.annotate("Chromatin states:", xy=(start, 0.14), xycoords='data', size=12)
    ax.add_patch(matplotlib.patches.Rectangle((start - length / 6, 0.07), end - start, 0.1, edgecolor='grey', fill=False))
    plt.title("Region: [" + chro + ": %d-%d]" % (start, end), size=14)
    plt.savefig(args.output)
    plt.close()
def Main():
    args = ParseArg()
    pair_dist = args.pair_dist
    step = args.step
    print "\nChecking if linkedPair file is tabixed..."
    if not os.path.isfile(args.linkedPair):
        print "LinkedPair file does not exist, please check!!"
        sys.exit(0)
    if not os.path.isfile(args.linkedPair + ".tbi"):
        print "  tabix-ing..."
        os.system("sort -k1,1 -k2,2n " + args.linkedPair + " > temp_linkedPair.txt")
        os.system("bgzip temp_linkedPair.txt")
        os.system("tabix -p bed temp_linkedPair.txt.gz")
        linkedPair = 'temp_linkedPair.txt.gz'
    else:
        linkedPair = args.linkedPair
    print "  linkedPair file is tabixed."
    print "\nTabixing the interaction file..."
    os.system("sort -k1,1 -k2,2n " + args.interaction + " > temp_interaction.txt")
    os.system("bgzip temp_interaction.txt")
    os.system("tabix -p bed temp_interaction.txt.gz")
    print "  interaction file is tabixed."
    # start column number for second regions
    # s1 for interaction file and s2 for linkedPair file
    (s1, s2) = args.start
    print "\nGet region information."
    if args.r:
        Region = read_region(args.r)
    elif args.name:
        os.system('grep "%s" %s > temp2.txt' % (args.name, args.genebed))
        g = open("temp2.txt").read().split('\t')
        if len(g) < 2:
            print >>sys.stderr, "Error: the gene name is not found in database"
            sys.exit(0)
        s = int(g[1])
        e = int(g[2])
        Region = Bed([g[0], s - (e - s) / 10, e + (e - s) / 10, "region", ".", "."])
    else:
        print >>sys.stderr, "Error: Need to specify the region by '-r' or specify the gene name by '-n'"
        sys.exit(0)
    print "\n  Start plot heatmaps on region: " + Region.str_region()
    fig = plt.figure(figsize=(8, 6))
    ax = plt.subplot(111, frameon=False, yticks=[])
    start = Region.start
    end = Region.stop
    ax.set_xlim(start, end)
    # set x ticks without offset
    locs = ax.get_xticks()
    ax.set_xticklabels(map(lambda x: "%i" % x, locs), fontsize=6)
    print "\nStart draw gene track"
    gene_dbi = DBI.init(args.genebed, "bed")
    print "  genebed indexed!"
    print "  Plot gene track"
    gene_top = Genetrack(Region, gene_dbi, ax, 0.08)
    h = 1.5 * step / (end - start)  # unit height for triangles or polygons in heatmap
    print "\nQuery linkedPairs within specified region"
    os.system("tabix " + linkedPair + " %s:%i-%i > temp2.txt" % (Region.chr, Region.start, Region.stop))
    Count = {}
    for b in read_interaction("temp2.txt", s2):
        col = 'k'
        if args.Slim and SingleFragment(b[0], b[1], pair_dist):
            continue
        if Region.overlap(b[0], 0) and Region.overlap(b[1], 0):
            if b[0].strand == '-':
                i = b[0].start
            else:
                i = b[0].stop
            if b[1].strand == '-':
                j = b[1].start
            else:
                j = b[1].stop
            i = (i / step + 1) * step  # approximate to the nearest central point
            j = (j / step + 1) * step
            if i > j:
                i, j = j, i
            if (i, j) not in Count:
                Count[(i, j)] = 1
            else:
                Count[(i, j)] += 1
    print Count
    patches = []
    colors = []
    for i in range(start, end + 1):
        if i % step != 0:
            continue
        for j in range(i, end + 1):
            if j % step != 0 or (i, j) not in Count:
                continue
            patches.append(PatchGen(i, j, h, step, gene_top + 0.01))
            colors.append(np.log(Count[(i, j)] + 1))
    p = PatchCollection(patches, cmap=matplotlib.cm.Reds, alpha=0.7, edgecolor='k', linewidths=0.1)
    p.set_array(np.array(colors))
    ax.add_collection(p)
    ax.set_ylim(0, ((end - start) / step + 2) * h + gene_top + 0.01)
    plt.colorbar(p)
    if not args.SI:
        plt.savefig(args.output)
        plt.show()
        os.system("rm temp_interaction.txt.gz*")
        if not os.path.isfile(args.linkedPair + ".tbi"):
            os.system("rm temp_linkedPair.txt.gz*")
        os.system("rm temp2.txt")
        sys.exit(0)
    print "\nQuery interactions"
    os.system("tabix temp_interaction.txt.gz %s:%i-%i > temp2.txt" % (Region.chr, Region.start, Region.stop))
    print "\nList of interactions plotted: "
    k = 1
    cmap = cm.get_cmap('Paired', 10)
    cmap = cmap(range(10))
    bottom = gene_top + 0.01
    for b in read_interaction("temp2.txt", s1):
        if b[0].overlap(b[1], 0):
            continue
        if Region.overlap(b[1], 0):
            k += 1
            if b[1].stop > b[0].stop:
                start1 = b[0].start
                end1 = b[0].stop
                start2 = b[1].start
                end2 = b[1].stop
            else:
                start1 = b[1].start
                end1 = b[1].stop
                start2 = b[0].start
                end2 = b[0].stop
            P1 = Polygon([[start1, bottom], [end1, bottom],
                          [(end1 + end2) * 0.5, (end2 - end1) * h / step + bottom],
                          [(start1 + end2) * 0.5, (end2 - start1) * h / step + bottom]],
                         "True", facecolor='none', edgecolor=cmap[k % 10], alpha=0.4, lw=0.5)
            P2 = Polygon([[start2, bottom], [end2, bottom],
                          [(start1 + end2) * 0.5, (end2 - start1) * h / step + bottom],
                          [(start1 + start2) * 0.5, (start2 - start1) * h / step + bottom]],
                         "True", facecolor='none', edgecolor=cmap[k % 10], alpha=0.4, lw=0.5)
            ax.add_patch(P1)
            ax.add_patch(P2)
            print "  " + b[0].str_region() + " <-> " + b[1].str_region()
    plt.savefig(args.output)
    plt.show()
    # remove temp files
    os.system("rm temp_interaction.txt.gz*")
    if not os.path.isfile(args.linkedPair + ".tbi"):
        os.system("rm temp_linkedPair.txt.gz*")
    os.system("rm temp2.txt")
def Main():
    t1 = time()
    global min_interaction, p_value
    args = ParseArg()
    inp = open(args.input, 'r')
    min_clusterS = args.min_clusterS
    min_interaction = args.min_interaction
    p_value = args.p_value
    output = open(args.output, 'w')
    ncpus = args.parallel
    # store genomic locations of part1 and part2
    part1 = []
    part2 = []
    k = 0
    print >>sys.stderr, "# Inputing data..."
    chr_list = []
    for line in inp.read().split('\n'):
        if line == '':
            continue
        line = line.strip().split('\t')
        p1 = annotated_bed(line[0:10], id=k)
        p2 = annotated_bed(line[11:], id=k)
        if isinstance(p1.start, list):
            p1.start = int(p1.start[0])
            p1.end = int(p1.end[-1])
        if isinstance(p2.start, list):
            p2.start = int(p2.start[0])
            p2.end = int(p2.end[-1])
        if SingleFragment(p1, p2):
            continue
        k += 1
        part1.append(p1)
        part2.append(p2)
        if p1.chr not in chr_list:
            chr_list.append(p1.chr)
        if p2.chr not in chr_list:
            chr_list.append(p2.chr)
        if k % 20000 == 0:
            print >>sys.stderr, "  Reading %d pairs of segments\r" % (k),
    print >>sys.stderr, "Get total %d pairs." % (k)
    if len(part1) != len(part2):
        print >>sys.stderr, "## ERROR: the numbers of regions in the two parts do not match!!"
        sys.exit(0)
    # sort in genomic order, easy for clustering
    part1 = sorted(part1, key=attrgetter('start'))
    part1 = sorted(part1, key=attrgetter('chr'))
    part2 = sorted(part2, key=attrgetter('start'))
    part2 = sorted(part2, key=attrgetter('chr'))
    # for parallel computing
    print >>sys.stderr, "# Generating clusters for two parts..."
    ppservers = ()  # tuple of all parallel python servers to connect with
    job_server = pp.Server(ncpus, ppservers=ppservers)
    jobs1 = []
    jobs2 = []
    for chro in chr_list:
        part1_temp = filter(lambda p: p.chr == chro, part1)
        if len(part1_temp) > 0:
            jobs1.append(job_server.submit(cluster_regions, (part1_temp, min_clusterS),
                                           (annotated_bed,), ("UnionFind", "copy",)))
        part2_temp = filter(lambda p: p.chr == chro, part2)
        if len(part2_temp) > 0:
            jobs2.append(job_server.submit(cluster_regions, (part2_temp, min_clusterS),
                                           (annotated_bed,), ("UnionFind", "copy",)))
    cluster_pool1 = {}
    part1 = []
    for job in jobs1:
        try:
            part1 = part1 + job()[1]
            cluster_pool1.update(job()[0])
        except:
            print >>sys.stderr, "Wrong in %s, part1" % (job()[2])
            continue
    cluster_pool2 = {}
    part2 = []
    for job in jobs2:
        try:
            part2 = part2 + job()[1]
            cluster_pool2.update(job()[0])
        except:
            continue
    print >>sys.stderr, "  cluster number for part1 is %d " % (len(cluster_pool1))
    print >>sys.stderr, "  cluster number for part2 is %d " % (len(cluster_pool2))
    # sort back to pair the two parts
    part1 = sorted(part1, key=attrgetter('id'))
    part2 = sorted(part2, key=attrgetter('id'))
    print >>sys.stderr, "size of part1&2:", len(part1), len(part2)
    c_interaction = {}
    for i in range(len(part1)):
        region1 = str(part1[i])
        region2 = str(part2[i])
        try:
            inter = part1[i].cluster + "--" + part2[i].cluster
        except:
            print >>sys.stderr, i, part1[i].cluster, part2[i].cluster
            sys.exit()
        if c_interaction.has_key(inter):
            c_interaction[inter] += 1
        else:
            c_interaction[inter] = 1
    # annotation files
    print >>sys.stderr, "# Indexing annotation files"
    dbi_all = DBI.init(args.annotation, "bed")
    dbi_detail = DBI.init(args.db_detail, "bed")
    dbi_repeat = DBI.init("/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt", "bed")
    print >>sys.stderr, "# finding strong interactions from clusters..."
    k = 0  # record for strong interactions
    n = 0
    for interaction in c_interaction:
        n = n + 1
        count = c_interaction[interaction]
        if count < min_interaction:
            continue
        i = interaction.split("--")[0]
        j = interaction.split("--")[1]
        try:
            # we select clusters with size no less than the cutoff, so some
            # interactions cannot be found in the cluster pools
            count1 = cluster_pool1[i].cluster
            count2 = cluster_pool2[j].cluster
        except:
            continue
        real_p = 1 - hypergeom.cdf(count, len(part1), count1, count2)
        if real_p <= p_value:
            k = k + 1
            cluster_pool1[i].Annotate(dbi_all, dbi_detail, dbi_repeat)
            cluster_pool2[j].Annotate(dbi_all, dbi_detail, dbi_repeat)
            try:
                log_p = math.log(real_p)
            except:
                log_p = -float("Inf")
            print >>output, str(cluster_pool1[i]) + '\t' + str(cluster_pool2[j]) + '\t%d\t%.4f' % (count, log_p)
        if n % 1000 == 0:
            print >>sys.stderr, "  Progress ( %d / %d )\r" % (n, len(c_interaction)),
    print >>sys.stderr, "# Find %d strong interactions. Cost time: %.2f s" % (k, time() - t1)
    if args.FDR:
        print >>sys.stderr, "# Permutated results:"
        for i in range(10):
            shuffle(part2)
            [n_r_I, n_r_SI] = Random_strongInteraction(part1, part2, cluster_pool1, cluster_pool2)
            print >>sys.stderr, "  ", i, n_r_I, n_r_SI, n_r_SI * 1.0 / n_r_I
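# Illustrative note (not in the original): shuffle(part2) breaks the true
# pairing, so interactions that still pass the cutoff are false positives.
# The printed ratio n_r_SI / n_r_I is the fraction of candidate interactions
# called "strong" under this null; averaging it over the 10 permutations gives
# a rough empirical false-discovery estimate. The values below are hypothetical.
null_ratios = [0.012, 0.009, 0.011]
print "mean null ratio: %.4f" % (sum(null_ratios) / len(null_ratios))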
def Main():
    t1 = time()
    args = ParseArg()
    inp = open(args.input, 'r')
    min_clusterS = args.min_clusterS
    min_interaction = args.min_interaction
    p_value = args.p_value
    output = open(args.output, 'w')
    outputIntra = open(args.output_intra, 'w')
    hasAnnotation = False
    if args.annotation:
        dbi = DBI.init(args.annotation, "bed")
        hasAnnotation = True
    else:
        dbi = False
    hasAnnotationRepeat = False  # default when no repeat annotation is supplied
    if args.annotation_repeat:
        dbirepeat = DBI.init(args.annotation_repeat, "bed")
        hasAnnotationRepeat = True
    else:
        dbirepeat = False
    # counts of each RNA seen in part1/part2
    part = {}
    k = 0
    sgcount = 0  # single-fragment count
    print >> sys.stderr, "# Reading data..."
    interaction = {}  # number of interactions for each RNA pair
    selfinteraction = {}
    #Types = ["snoRNA","protein_coding","snRNA","lincRNA","tRNA","misc_RNA","pseudogene","miRNA","antisense","sense_intronic","non_coding","processed_transcript","sense_overlapping","rRNA_repeat","rRNA"]
    for line in inp.read().split('\n'):
        if line == '': continue
        line = line.strip().split('\t')
        p1 = annotated_bed_proper(line[0:10], id=k, cluster=1)
        p2 = annotated_bed_proper(line[11:], id=k, cluster=1)
        if isinstance(p1.start, list):
            p1.start = int(p1.start[0])
            p1.end = int(p1.end[-1])
        if isinstance(p2.start, list):
            p2.start = int(p2.start[0])
            p2.end = int(p2.end[-1])
        if SingleFragment(p1, p2):
            sgcount += 1
            continue
        k += 1
        #if p1.subtype=="intron" or p2.subtype=="intron": continue
        #if p1.type in Types:
        try:
            p1_name = GetAnnotationName(p1, hasAnnotation, dbi, hasAnnotationRepeat, dbirepeat)
            if p1_name not in part:
                part[p1_name] = 1
            else:
                part[p1_name] += 1
            #if p2.type in Types:
            p2_name = GetAnnotationName(p2, hasAnnotation, dbi, hasAnnotationRepeat, dbirepeat)
            if not p1_name == p2_name:  # count self-interactions only once
                if p2_name not in part:
                    part[p2_name] = 1
                else:
                    part[p2_name] += 1
            #if p1.type in Types and p2.type in Types:
            if p1_name == p2_name:
                if p1_name not in selfinteraction:
                    selfinteraction[p1_name] = copy.deepcopy(p1)
                else:
                    selfinteraction[p1_name].Update(p1.start, p1.end)
                    selfinteraction[p1_name].Update(p2.start, p2.end)
                    selfinteraction[p1_name].cluster += 1
            else:
                if p1_name > p2_name:  # order each pair by name
                    p1, p2 = p2, p1
                    p1_name, p2_name = p2_name, p1_name
                inter_name = p1_name + "--" + p2_name
                if inter_name not in interaction:
                    interaction[inter_name] = [copy.deepcopy(p1), copy.deepcopy(p2)]
                else:
                    interaction[inter_name][0].Update(p1.start, p1.end)
                    interaction[inter_name][1].Update(p2.start, p2.end)
                    interaction[inter_name][0].cluster += 1
        except Exception as e:
            print >> sys.stderr, e
        if k % 20000 == 0:
            print >> sys.stderr, "  Reading %d pairs of segments\r" % (k),
    print >> sys.stdout, "Read %d pairs in total." % (k)
    print >> sys.stdout, "Single fragment count: %d." % (sgcount)
    print >> sys.stdout, "  Number of different RNAs: %d" % (len(part))
    total = k  # total number of pairs used
    n = 0
    k = 0  # number of strong interactions
    for i in interaction:
        n += 1
        count = interaction[i][0].cluster
        if count < min_interaction: continue
        p1_name = i.split("--")[0]
        p2_name = i.split("--")[1]
        P1 = interaction[i][0]
        P2 = interaction[i][1]
        P1.cluster = part[p1_name]
        P2.cluster = part[p2_name]
        if part[p1_name] < min_clusterS or part[p2_name] < min_clusterS: continue
        real_p = 1 - hypergeom.cdf(count, total, part[p1_name], part[p2_name])
        if real_p <= p_value:
            k = k + 1
            try:
                log_p = math.log(real_p)
            except:
                log_p = -float("Inf")
            print >> output, str(P1) + '\t' + str(P2) + '\t%d\t%.4f' % (count, log_p)
        if n % 500 == 0:
            print >> sys.stderr, "  Progress ( %d / %d )\r" % (n, len(interaction)),
    k1 = 0
    for i in selfinteraction:
        n += 1
        count = selfinteraction[i].cluster
        if count < min_interaction: continue
        p1_name = i
        P1 = selfinteraction[i]
        P1.cluster = part[p1_name]
        if part[p1_name] < min_clusterS: continue
        k1 = k1 + 1
        print >> outputIntra, str(P1) + '\t%d' % (count)
        if n % 500 == 0:
            print >> sys.stderr, "  Progress ( %d / %d )\r" % (n, len(interaction)),
    print >> sys.stdout, "# Found %d strong and %d self interactions. Time cost: %.2f s" % (k, k1, time() - t1)
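# A minimal, self-contained sketch of the enrichment test used above: the
# number of ligated pairs linking two RNAs is compared against a
# hypergeometric null. The function name is illustrative, not part of bam2x.
from scipy.stats import hypergeom

def interaction_pvalue(count, total, count1, count2):
    # P(X > count) for X ~ Hypergeom(M=total pairs, n=pairs touching RNA1,
    # N=pairs touching RNA2); sf(k) equals 1 - cdf(k) as used above, but is
    # numerically stabler for very small tail probabilities.
    return hypergeom.sf(count, total, count1, count2)

# e.g. 30 pairs between two RNAs seen 200 and 150 times among 100000 pairs:
#   interaction_pvalue(30, 100000, 200, 150)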
def Main(): """ IO TEMPLATE """ global args, out args = ParseArg() if args.output == "stdout": out = sys.stdout else: try: out = open(args.output, "w") except IOError: print >>sys.stderr, "can't open file ", args.output, "to write. Using stdout instead" out = sys.stdout if args.input == "stdin": fin = sys.stdin else: try: x = args.input.split(".") if x[-1] == "gz": fin = gzip.open(args.input, "r") else: fin = open(args.input, "r") except IOError: print >>sys.stderr, "can't read file", args.input fin = sys.stdin """ END OF IO TEMPLATE """ print >> out, "# This data was generated by program ", sys.argv[0], " (version: %s)" % VERSION, print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )" print >> out, "# Date: ", time.asctime() print >> out, "# The command line is :" print >> out, "#\t", " ".join(sys.argv) gene = DBI.init(args.genetab, args.gene_format) upstream_list = [] downstream_list = [] exons_list = [] introns_list = [] utr3_list = [] utr5_list = [] for g in gene: upstream_list.append(g.upstream(args.upstream)) downstream_list.append(g.downstream(args.downstream)) for e in g.Exons(): exons_list.append(e) for i in g.Introns(): introns_list.append(i) if not (g.utr3() is None): utr3_list.append(g.utr3()) if not (g.utr5() is None): utr5_list.append(g.utr5()) upstream = DBI.init(upstream_list, "bed") downstream = DBI.init(downstream_list, "bed") exons = DBI.init(exons_list, "bed") introns = DBI.init(introns_list, "bed") utr3 = DBI.init(utr3_list, "genebed") utr5 = DBI.init(utr5_list, "genebed") if args.format == "guess": args.format = Tools.guess_format(args.input) for (i0, i) in enumerate(TableIO.parse(fin, args.format)): if i0 == 0: if isinstance(i, Bed12): print >> out, "#chr\tstart\tend\tname\tscore\tstrand\tthick_start\tthick_end\titem_rgb\tblock_count\tblock_sizes\tblock_starts\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5" elif isinstance(i, GeneBed): print >> out, "#name\tchr\tstrand\tstart\tend\tcds_start\texon_count\texon_starts\texont_ends\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5" else: print >> out, "#chr\tstart\tend\tname\tscore\tstrand\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5" print >> out, i, print >> out, "\t", toIDs(gene.query(i)), print >> out, "\t", toIDs(upstream.query(i)), print >> out, "\t", toIDs(downstream.query(i)), print >> out, "\t", toIDs(exons.query(i)), print >> out, "\t", toIDs(introns.query(i)), print >> out, "\t", toIDs(utr3.query(i)), print >> out, "\t", toIDs(utr5.query(i))
def Main():
    args = ParseArg()
    distance = args.distance * 1000
    pair_dist = args.pair_dist
    print "\nChecking if linkedPair file is tabixed..."
    if not os.path.isfile(args.linkedPair):
        print "LinkedPair file does not exist, please check!"
        sys.exit(1)
    if not os.path.isfile(args.linkedPair + ".tbi"):
        print "  tabix-ing..."
        os.system("sort -k1,1 -k2,2n " + args.linkedPair + " > temp_linkedPair.txt")
        os.system("bgzip temp_linkedPair.txt")
        os.system("tabix -p bed temp_linkedPair.txt.gz")
        linkedPair = 'temp_linkedPair.txt.gz'
    else:
        linkedPair = args.linkedPair
        print "  linkedPair file is tabixed."
    print "\nTabixing the interaction file..."
    os.system("sort -k1,1 -k2,2n " + args.interaction + " > temp_interaction.txt")
    os.system("bgzip temp_interaction.txt")
    os.system("tabix -p bed temp_interaction.txt.gz")
    print "  interaction file is tabixed."
    # start column numbers of the second region:
    # s1 for the interaction file, s2 for the linkedPair file
    (s1, s2) = args.start
    print "\nExtracting interaction information..."
    if args.n:
        Interactions = open(args.interaction, 'r')
        l = Interactions.read().split('\n')[args.n - 1].split('\t')
        part1 = Bed(l[0:6])
        part2 = Bed(l[s1:(s1 + 6)])
    elif len(args.r) == 2:
        part1 = read_region(args.r[0])
        part2 = read_region(args.r[1])
    else:
        print >> sys.stderr, "need to specify two regions using '-r'"
        sys.exit(1)
    if "chr" not in part1.chr or "chr" not in part2.chr:
        print >> sys.stderr, "This program only works for genomic regions."
        sys.exit(1)
    start1 = part1.start - distance
    end1 = part1.stop + distance
    start2 = part2.start - distance
    end2 = part2.stop + distance
    # if the search regions of part1 and part2 overlap, use their union for both
    if part1.overlap(part2, -2 * distance):
        start1 = start2 = min(start1, start2)
        end1 = end2 = max(end1, end2)
    # initialize the figure
    print "\n Start plotting interaction: " + part1.str_region() + " <-> " + part2.str_region()
    col1 = "#4F81BD"
    col2 = "#C0504D"
    fig = plt.figure(figsize=(8, 4))
    ax1 = plt.subplot(111, frameon=False, yticks=[])
    plt.tick_params(axis="y", which="both", left="off", right="off", labelleft="off")  # remove y ticks
    plt.subplots_adjust(top=0.75)
    ax2 = ax1.twiny()
    ax1.set_xlim(start1, end1)
    ax2.set_xlim(start2, end2)
    ax1.set_ylim(0, 1)
    ax2.set_ylim(0, 1)
    # set x tick labels without offset notation
    locs = ax1.get_xticks()
    ax1.set_xticklabels(map(lambda x: "%i" % x, locs), fontsize=8)
    locs = ax2.get_xticks()
    ax2.set_xticklabels(map(lambda x: "%i" % x, locs), fontsize=8)
    # input bigWig file for phyloP score
    bw_phyloP = BigWigFile(open(args.phyloP_wig))
    print "\nStart drawing gene track"
    gene_dbi = DBI.init(args.genebed, "bed")
    print "  genebed indexed!"
    print "  Plot gene track for Part1"
    gene1_top = Genetrack(Bed([part1.chr, start1, end1]), gene_dbi, ax1, 0.08)
    wig1_top = Wigtrack(Bed([part1.chr, start1, end1]), bw_phyloP, ax1, gene1_top, col1)
    y_1 = wig1_top + 0.1
    y_2 = y_1 + 0.2
    print "  Plot gene track for Part2"
    gene2_top = Genetrack(Bed([part2.chr, start2, end2]), gene_dbi, ax2, y_2 + 0.08)
    wig2_top = Wigtrack(Bed([part2.chr, start2, end2]), bw_phyloP, ax2, gene2_top, col2)
    print "\nQuery interactions within +-%dkbp of interaction" % (distance / 1000)
    os.system("tabix temp_interaction.txt.gz %s:%i-%i > temp2.txt" % (part1.chr, start1, end1))
    print "\nList of interactions plotted: "
    k = 1
    cmap = cm.get_cmap('Paired', 10)
    cmap = cmap(range(10))
    for b in read_interaction("temp2.txt", s1):
        #if args.Slim and b[0].overlap(b[1], -pair_dist): continue
        if Bed([part2.chr, start2, end2]).overlap(b[1], 0):
            k += 1
            x1_2_start = transform(b[0].start, start1, end1, start2, end2)
            x1_2_end = transform(b[0].stop, start1, end1, start2, end2)
            ax2.add_patch(matplotlib.patches.Polygon([[x1_2_start, y_1 + 0.04], [x1_2_end, y_1 + 0.04], [b[1].stop, y_2], [b[1].start, y_2]], color=cmap[k % 10], alpha=0.4, lw=0.5))
            ax1.add_patch(matplotlib.patches.Rectangle((b[0].start, y_1), b[0].stop - b[0].start, 0.04, color=col1, lw=0.5))
            ax2.add_patch(matplotlib.patches.Rectangle((b[1].start, y_2), b[1].stop - b[1].start, 0.04, color=col2, lw=0.5))
            print "  " + b[0].str_region() + " <-> " + b[1].str_region()
    ax1.plot([start1, end1], [y_1 + 0.02, y_1 + 0.02], color=col1, linewidth=1, alpha=0.7)
    ax2.plot([start2, end2], [y_2 + 0.02, y_2 + 0.02], color=col2, linewidth=1, alpha=0.7)
    print "\nQuery linkedPairs within +-%dkbp of interaction" % (distance / 1000)
    os.system("tabix " + linkedPair + " %s:%i-%i > temp2.txt" % (part1.chr, start1, end1))
    print "\nList of linked pairs plotted: "
    for b in read_interaction("temp2.txt", s2):
        col = 'k'
        if args.Slim and SingleFragment(b[0], b[1], pair_dist): continue
        if SingleFragment(b[0], b[1], pair_dist): col = '#03C03C'
        if part1.overlap(b[0], -distance) and part2.overlap(b[1], -distance):
            x1_2_start = transform(b[0].start, start1, end1, start2, end2)
            x1_2_end = transform(b[0].stop, start1, end1, start2, end2)
            if b[0].strand == '-':
                connect1 = x1_2_start
            else:
                connect1 = x1_2_end
            if b[1].strand == "-":
                connect2 = b[1].start
            else:
                connect2 = b[1].stop
            ax2.plot([connect1, connect2], [y_1 + 0.02, y_2 + 0.02], color=col, alpha=0.3, lw=0.5)
            ax1.plot([b[0].start, b[0].stop], [y_1 + 0.02, y_1 + 0.02], color=col, alpha=0.3, lw=0.8)
            ax2.plot([b[1].start, b[1].stop], [y_2 + 0.02, y_2 + 0.02], color=col, alpha=0.3, lw=0.8)
            # print "  " + b[0].str_region() + " <-> " + b[1].str_region()
    plt.text(0.5, 1.15, part1.str_region() + " <-> " + part2.str_region(), horizontalalignment='center', fontsize=10, transform=ax1.transAxes)
    plt.text(0.5, 1.10, "Distance: +-%dkbp of interaction" % (distance / 1000), horizontalalignment='center', fontsize=8, transform=ax1.transAxes)
    ax1.text(part1.center, y_1 - 0.03, "|".join([part1.type, part1.name, part1.subtype]), verticalalignment='center', horizontalalignment='center', fontsize=8, color=col1)
    ax2.text(part2.center, y_2 + 0.07, "|".join([part2.type, part2.name, part2.subtype]), verticalalignment='center', horizontalalignment='center', fontsize=8, color=col2)
    ax1.set_ylim(0, wig2_top + 0.1)
    ax2.set_ylim(0, wig2_top + 0.1)
    ax1.text(start1, 0.05, part1.chr, horizontalalignment='left', fontsize=8)
    ax2.text(start2, wig2_top + 0.04, part2.chr, horizontalalignment='left', fontsize=8)
    plt.savefig(args.output)
    plt.show()
    # remove temp files
    os.system("rm temp_interaction.txt.gz*")
    if not os.path.isfile(args.linkedPair + ".tbi"):
        os.system("rm temp_linkedPair.txt.gz*")
    os.system("rm temp2.txt")
def Main():
    args = ParseArg()
    fastq1 = open("simulated_" + str(args.num) + "_read_R1.fastq", "w")
    fastq2 = open("simulated_" + str(args.num) + "_read_R2.fastq", "w")
    RNA = TableIO.parse(args.annotation, 'bed')
    # build a pool of RNAs per type, excluding rRNA and chrM/chrNT entries
    RNAs = {}
    for b in RNA:
        if b.id.startswith('rRNA'): continue
        if b.chr.startswith('chrM') or b.chr.startswith('chrNT'): continue
        Type = b.id.split(".")[0]
        if Type in RNAs:
            RNAs[Type].append(b)
        else:
            RNAs[Type] = [b]
    #---------------- read linker seq ------------------
    linkers = []
    for i in open(args.linker, 'r'):
        linkers.append(i.strip())
    #---------------- read barcode ---------------------
    barcodes = []
    for i in open(args.barcode, 'r'):
        barcodes.append(i.strip())
    # sample the five fragment classes:
    # LinkerOnly, NoLinker, RNA1-linker, linker-RNA2, RNA1-linker-RNA2
    xk = range(5)
    pk = args.parameter
    custm = stats.rv_discrete(name='custm', values=(xk, pk))
    Class_index = custm.rvs(size=args.num)
    # specify output
    out = open(args.output, 'w')
    # initiate the annotation database
    if args.db_detail:
        print >> sys.stderr, " # Index for the annotation database"
        dbi1 = DBI.init(args.annotation, "bed")
        dbi2 = DBI.init(args.db_detail, "bed")
        dbi3 = DBI.init("/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt", "bed")
    print >> sys.stderr, " # Start to simulate reads"
    t0 = time.time()
    for i in range(0, args.num):
        pair_id = "read_" + str(i)
        # barcode with random flanking nucleotides
        randSeq = "".join([random.choice("ACGT") for x in range(6)])
        barcode = randSeq[0:4] + barcodes[0] + randSeq[4:6]
        index = Class_index[i]  # class of this fragment
        # sample RNA1 and RNA2
        RNA1_len = random.randrange(15, 150)
        b, Type = randRegion(RNA1_len, RNAs)
        RNA1_seq = fetchSeq(b.chr, b.start, b.stop, b.strand, args.genomeFa, args.spath)
        if args.db_detail:
            [name1, typ1, subtype1] = annotation(b, dbi1, dbi2, dbi3)
            RNA1_str = "\t".join(str(f) for f in [b.chr, b.start, b.stop, b.strand, name1, typ1, subtype1])
        else:
            RNA1_str = "\t".join(str(f) for f in [b.chr, b.start, b.stop, b.strand, Type])
        RNA2_len = random.randrange(15, 150)
        b, Type = randRegion(RNA2_len, RNAs)
        RNA2_seq = fetchSeq(b.chr, b.start, b.stop, b.strand, args.genomeFa, args.spath)
        if args.db_detail:
            [name2, typ2, subtype2] = annotation(b, dbi1, dbi2, dbi3)
            RNA2_str = "\t".join(str(f) for f in [b.chr, b.start, b.stop, b.strand, name2, typ2, subtype2])
        else:
            RNA2_str = "\t".join(str(f) for f in [b.chr, b.start, b.stop, b.strand, Type])
        # fragment is the recovered cDNA fragment
        if index == 1:  # no linker: either RNA1-RNA2 or a single RNA
            if random.choice([0, 1]) == 0:  # RNA1-RNA2
                fragment = barcode + RNA1_seq + RNA2_seq
                print >> out, pair_id + "\t%d\tRNA1-RNA2\t0" % (len(fragment)) + "\t" + RNA1_str + '\t' + RNA2_str
            else:  # single RNA
                fragment = barcode + RNA1_seq
                print >> out, pair_id + "\t%d\tsingleRNA\t0" % (len(fragment)) + "\t" + RNA1_str
        else:
            linker_n = random.choice([1, 2])  # number of linkers in fragment
            linker = "".join([linkers[0]] * linker_n)
            if index == 0:
                fragment = barcode + linker
                print >> out, pair_id + "\t%d\tlinkerOnly\t%d" % (len(fragment), linker_n)
            elif index == 2:
                fragment = barcode + RNA1_seq + linker
                print >> out, pair_id + "\t%d\tRNA1-linker\t%d" % (len(fragment), linker_n) + "\t" + RNA1_str
            elif index == 3:
                fragment = barcode + linker + RNA2_seq
                print >> out, pair_id + "\t%d\tlinker-RNA2\t%d" % (len(fragment), linker_n) + "\t" + RNA2_str
            elif index == 4:
                fragment = barcode + RNA1_seq + linker + RNA2_seq
                print >> out, pair_id + "\t%d\tRNA1-linker-RNA2\t%d" % (len(fragment), linker_n) + "\t" + RNA1_str + "\t" + RNA2_str
        read1, read2 = generatePairs(fragment, args.len, args.errorRate)
        score = [random.randrange(10, 40) for j in range(0, args.len)]
        record1 = SeqRecord(Seq(read1, generic_dna), id=pair_id)
        record1.letter_annotations["phred_quality"] = score
        record2 = SeqRecord(Seq(read2, generic_dna), id=pair_id)
        record2.letter_annotations["phred_quality"] = score
        SeqIO.write(record1, fastq1, "fastq")
        SeqIO.write(record2, fastq2, "fastq")
        if i % 100 == 0:
            print >> sys.stderr, "generate pairs %d\r" % (i),
    fastq1.close()
    fastq2.close()
    out.close()
    print time.time() - t0
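# generatePairs() above is an external helper; a sketch of one plausible
# implementation, assuming read1 is the fragment's 5' end and read2 the
# reverse complement of its 3' end, with uniform substitution errors.
# All names here are illustrative, not the bam2x implementation.
import random

COMPLEMENT = {"A": "T", "C": "G", "G": "C", "T": "A", "N": "N"}

def generate_pairs_sketch(fragment, read_len, error_rate):
    def mutate(seq):
        out = []
        for c in seq:
            if random.random() < error_rate and c in "ACGT":
                out.append(random.choice("ACGT".replace(c, "")))  # substitute
            else:
                out.append(c)
        return "".join(out)
    read1 = fragment[:read_len]
    tail = fragment[-read_len:]
    read2 = "".join(COMPLEMENT.get(c, "N") for c in reversed(tail))
    return mutate(read1), mutate(read2)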
def Main(): ''' IO TEMPLATE ''' global args,out args=ParseArg() if args.output=="stdout": out=sys.stdout else: try: out=open(args.output,"w") except IOError: print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead" out=sys.stdout if args.input=="stdin": fin=sys.stdin else: try: x=args.input.split(".") if x[-1]=="gz": fin=gzip.open(args.input,"r") else: fin=open(args.input,"r") except IOError: print >>sys.stderr,"can't read file",args.input fin=sys.stdin ''' END OF IO TEMPLATE ''' print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION, print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )" print >>out,"# Date: ",time.asctime() print >>out,"# The command line is :" print >>out,"#\t"," ".join(sys.argv) gene=DBI.init(args.genetab,args.gene_format); upstream_list=[] downstream_list=[] exons_list=[] introns_list=[] utr3_list=[] utr5_list=[] for g in gene: upstream_list.append(g.upstream(args.upstream)); downstream_list.append(g.downstream(args.downstream)); for e in g.Exons(): exons_list.append(e) for i in g.Introns(): introns_list.append(i) if not (g.utr3() is None): utr3_list.append(g.utr3()) if not (g.utr5() is None): utr5_list.append(g.utr5()) upstream=DBI.init(upstream_list,"bed") downstream=DBI.init(downstream_list,"bed") exons=DBI.init(exons_list,"bed") introns=DBI.init(introns_list,"bed") utr3=DBI.init(utr3_list,"genebed") utr5=DBI.init(utr5_list,"genebed") if args.format=="guess": args.format=Tools.guess_format(args.input) for (i0,i) in enumerate(TableIO.parse(fin,args.format)): if i0==0: if isinstance(i,Bed12): print >>out,"#chr\tstart\tend\tname\tscore\tstrand\tthick_start\tthick_end\titem_rgb\tblock_count\tblock_sizes\tblock_starts\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5" elif isinstance(i,GeneBed): print >>out,"#name\tchr\tstrand\tstart\tend\tcds_start\texon_count\texon_starts\texont_ends\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5" else: print >>out,"#chr\tstart\tend\tname\tscore\tstrand\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5" print >>out,i, print >>out,"\t",toIDs(gene.query(i)), print >>out,"\t",toIDs(upstream.query(i)), print >>out,"\t",toIDs(downstream.query(i)), print >>out,"\t",toIDs(exons.query(i)), print >>out,"\t",toIDs(introns.query(i)), print >>out,"\t",toIDs(utr3.query(i)), print >>out,"\t",toIDs(utr5.query(i))
def Main():
    args=ParseArg()
    fastq1=open("simulated_"+str(args.num)+"_read_R1.fastq","w")
    fastq2=open("simulated_"+str(args.num)+"_read_R2.fastq","w")
    RNA=TableIO.parse(args.annotation,'bed')
    # build a pool of RNAs per type, excluding rRNA and chrM/chrNT entries
    RNAs = {}
    for b in RNA:
        if b.id.startswith('rRNA'): continue
        if b.chr.startswith('chrM') or b.chr.startswith('chrNT'): continue
        Type = b.id.split(".")[0]
        if Type in RNAs:
            RNAs[Type].append(b)
        else:
            RNAs[Type]=[b]
    #---------------- read linker seq ------------------
    linkers=[]
    for i in open(args.linker,'r'):
        linkers.append(i.strip())
    #---------------- read barcode ---------------------
    barcodes=[]
    for i in open(args.barcode,'r'):
        barcodes.append(i.strip())
    # sample the five fragment classes:
    # LinkerOnly, NoLinker, RNA1-linker, linker-RNA2, RNA1-linker-RNA2
    xk = range(5)
    pk = args.parameter
    custm = stats.rv_discrete(name='custm', values=(xk, pk))
    Class_index = custm.rvs(size=args.num)
    # specify output
    out = open(args.output,'w')
    # initiate the annotation database
    if args.db_detail:
        print >> sys.stderr, " # Index for the annotation database"
        dbi1=DBI.init(args.annotation,"bed")
        dbi2=DBI.init(args.db_detail,"bed")
        dbi3=DBI.init("/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt","bed")
    print >> sys.stderr, " # Start to simulate reads"
    t0 = time.time()
    for i in range(0,args.num):
        pair_id = "read_"+str(i)
        # barcode with random flanking nucleotides
        randSeq = "".join([random.choice("ACGT") for x in range(6)])
        barcode = randSeq[0:4]+barcodes[0]+randSeq[4:6]
        index = Class_index[i]  # class of this fragment
        # sample RNA1 and RNA2
        RNA1_len = random.randrange(15,150)
        b,Type = randRegion(RNA1_len,RNAs)
        RNA1_seq = fetchSeq(b.chr,b.start,b.stop,b.strand,args.genomeFa,args.spath)
        if args.db_detail:
            [name1,typ1,subtype1]=annotation(b,dbi1,dbi2,dbi3)
            RNA1_str = "\t".join(str(f) for f in [b.chr,b.start,b.stop,b.strand,name1,typ1,subtype1])
        else:
            RNA1_str = "\t".join(str(f) for f in [b.chr,b.start,b.stop,b.strand,Type])
        RNA2_len = random.randrange(15,150)
        b,Type = randRegion(RNA2_len,RNAs)
        RNA2_seq = fetchSeq(b.chr,b.start,b.stop,b.strand,args.genomeFa,args.spath)
        if args.db_detail:
            [name2,typ2,subtype2]=annotation(b,dbi1,dbi2,dbi3)
            RNA2_str = "\t".join(str(f) for f in [b.chr,b.start,b.stop,b.strand,name2,typ2,subtype2])
        else:
            RNA2_str = "\t".join(str(f) for f in [b.chr,b.start,b.stop,b.strand,Type])
        # fragment is the recovered cDNA fragment
        if index == 1:  # no linker: either RNA1-RNA2 or a single RNA
            if random.choice([0,1])==0:  # RNA1-RNA2
                fragment = barcode+RNA1_seq+RNA2_seq
                print >> out, pair_id+"\t%d\tRNA1-RNA2\t0"%(len(fragment))+"\t"+RNA1_str+'\t'+RNA2_str
            else:  # single RNA
                fragment = barcode+RNA1_seq
                print >> out, pair_id+"\t%d\tsingleRNA\t0"%(len(fragment))+"\t"+RNA1_str
        else:
            linker_n = random.choice([1,2])  # number of linkers in fragment
            linker = "".join([linkers[0]]*linker_n)
            if index == 0:
                fragment = barcode+linker
                print >> out, pair_id+"\t%d\tlinkerOnly\t%d"%(len(fragment),linker_n)
            elif index == 2:
                fragment = barcode+RNA1_seq+linker
                print >> out, pair_id+"\t%d\tRNA1-linker\t%d"%(len(fragment),linker_n)+"\t"+RNA1_str
            elif index == 3:
                fragment = barcode+linker+RNA2_seq
                print >> out, pair_id+"\t%d\tlinker-RNA2\t%d"%(len(fragment),linker_n)+"\t"+RNA2_str
            elif index == 4:
                fragment = barcode+RNA1_seq+linker+RNA2_seq
                print >> out, pair_id+"\t%d\tRNA1-linker-RNA2\t%d"%(len(fragment),linker_n)+"\t"+RNA1_str+"\t"+RNA2_str
        read1,read2 = generatePairs(fragment,args.len,args.errorRate)
        score=[random.randrange(10,40) for j in range(0,args.len)]
        record1 = SeqRecord(Seq(read1,generic_dna),id=pair_id)
        record1.letter_annotations["phred_quality"] = score
        record2 = SeqRecord(Seq(read2,generic_dna),id=pair_id)
        record2.letter_annotations["phred_quality"] = score
        SeqIO.write(record1,fastq1,"fastq")
        SeqIO.write(record2,fastq2,"fastq")
        if i%100==0:
            print >>sys.stderr, "generate pairs %d\r"%(i),
    fastq1.close()
    fastq2.close()
    out.close()
    print time.time()-t0
def Main():
    args=ParseArg()
    # store bed/bam handles with indexing and count information
    bam={}
    print >>sys.stderr,"Start indexing bam/bed files:"
    for i in range(len(args.bams)):
        temp_name=args.name[i]
        print >>sys.stderr,"  # Indexing bam/bed file of",temp_name,"\r",
        bam[temp_name]=DBI.init(args.bams[i],args.fmt)
    print >>sys.stderr
    print >>sys.stderr,"Reading nucleosome peak xls file from Danpos."
    nucleosomes=TableIO.parse(args.nucleosome,'metabed',header=True)
    print >>sys.stderr,"Initialize output files..."
    out=open(args.output,"w")
    # -- for verbose ---
    if args.verbose:
        out_mark=[]
        for n in args.name:
            out_mark.append(open(n+'_shift_nucleosomes.bed','w'))
    # ------------------
    line_head=open(args.nucleosome,'r').readline().strip()
    line_head=line_head+"\t"+"\t".join(str(f) for f in args.name)+'\t'+"\t".join(str(f)+'_off' for f in args.name)
    print >>out,line_head
    print >>sys.stderr,"Start counting..."
    num=0
    t0 = time()
    for i in nucleosomes:
        chrom=i.chr
        # only keep nucleosomes with significant summits and low fuzziness
        if i.smt_pval>0.01 or i.fuzziness_pval>0.01: continue
        if "random" in chrom or chrom == 'chrM': continue
        num=num+1
        center=int((i.start+i.end)/2)
        count=np.zeros(len(args.bams),dtype="float")
        offset=np.zeros(len(args.bams),dtype='int')
        line=str(i)
        for k,name in enumerate(args.name):
            if args.fmt=='bam':
                query=bam[name].query(Bed([chrom,center-ma-(half_len-75)-rangeS,center+ma+(half_len-75)+rangeS]),method='fetch')
            else:
                query=bam[name].query(Bed([chrom,center-ma-(half_len-75)-rangeS,center+ma+(half_len-75)+rangeS]))
            read_centers=[]
            for j in query:
                read_centers.append(find_center(j,args.fmt))
            [o,c]=getCount(read_centers,center)
            count[k]=c
            offset[k]=o
            # -- for verbose ---
            if args.verbose:
                print >>out_mark[k],chrom+'\t%d\t%d'%(i.start+o,i.end+o)
            # ------------------
        line = line + "\t" + "\t".join(str(f) for f in count) + '\t' + "\t".join(str(f) for f in offset)
        if num%20000==0:
            t1 = time()
            print >>sys.stderr,"processing %dth nucleosome..., time: %.2fs."%(num,t1-t0),'\r',
            t0 = time()
        print >>out,line
    print
    out.close()
    # -- for verbose ---
    if args.verbose:
        for k in out_mark:
            k.close()
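# find_center() above reduces each read to a single midpoint coordinate; a
# sketch under the assumption that BAM records expose pos/aend (old pysam
# AlignedRead API) and BED-like records expose start/stop:
def find_center_sketch(read, fmt):
    if fmt == "bam":
        return (read.pos + read.aend) / 2   # pysam alignment coordinates
    return (read.start + read.stop) / 2     # BED-like interval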
def Main():
    global args,chrs,lengths,out
    args=ParseArg()
    if args.out=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.out,"w")
        except IOError:
            print >>sys.stderr,"can't open file",args.out,"to write, using stdout instead"
            out=sys.stdout
    if args.bamlistA:
        dbi_A=DBI.init(args.bamlistA,"bamlist")
    else:
        dbi_A=DBI.init(args.bamA,"bamlist")
    if args.bamlistB:
        dbi_B=DBI.init(args.bamlistB,"bamlist")
    else:
        dbi_B=DBI.init(args.bamB,"bamlist")
    print_header()
    '''
    Priority:  Region > Annotations > chromSize
    '''
    if args.region:
        # query a single region
        i=parseRegion(args.region)
        for aps in QueryBed(i,dbi_A,dbi_B):
            print >>out,aps
    elif args.annotations:
        # query regions from a BED or VCF file etc.
        for i in TableIO.parse(args.annotations,args.annotation_format):
            for aps in QueryBed(i,dbi_A,dbi_B):
                print >>out,aps
    elif args.chromsize:
        # query the whole genome in bins; a chromsize file looks like:
        #   chr1  249250621
        #   chr2  243199373
        #   ...
        for x in TableIO.parse(args.chromsize):
            (chr,size)=x
            binsize=1000000
            chr=chr.strip()
            for i in xrange(0,size,binsize):
                start=i
                stop=min(i+binsize,size)
                bed=Bed([chr,start,stop,".",".","."])
                for aps in QueryBed(bed,dbi_A,dbi_B):
                    print >>out,aps
    else:
        print >>sys.stderr," at least one of the options -r, -g, -a is required"
        exit(1)
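# The chromsize branch above walks the genome in fixed-size bins; the same
# loop as a reusable generator (illustrative helper, not part of bam2x):
def genome_bins(chromsize, binsize=1000000):
    # chromsize: iterable of (chrom, length) tuples
    for chrom, length in chromsize:
        for start in xrange(0, length, binsize):
            yield (chrom, start, min(start + binsize, length))

# e.g. for chrom, start, stop in genome_bins([("chr1", 249250621)]): ...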
def Main(): global args,out args=ParseArg() if args.output=="stdout": out=sys.stdout else: try: out=open(args.output,"w") except IOError: print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead" out=sys.stdout db_format=args.db_format if len(db_format)==0: db_format=["bed" for i in range(len(args.db))] if len(db_format)==1: db_format=[db_format[0] for i in range(len(args.db))] if len(db_format)!=len(args.db): print >>sys.stderr,"the number of annotation files is not same with the number of annotation formats" print >>sys.stderr,"db format",db_format print >>sys.stderr,"db ",args.db exit(0) print >>out,"# Input:",args.input dbis=[] hits=[] #count the hits hcode={} for i,f in enumerate(args.db): print >>out,"# Features File No."+str(i+1),":",f dbis.append(DBI.init(f,db_format[i])) hits.append(0) query_num=0 for bed in TableIO.parse(args.input,args.input_format): if not args.m: print >>out,"QR\t",bed query_num+=1 code="@" for i,dbi in enumerate(dbis): flag=0 for hit in dbi.query(bed): if not args.m: print >>out,"\tDB"+str(i+1)+" HT\t",hit flag=1 hits[i]+=flag code+=str(flag) if hcode.has_key(code): hcode[code]+=1 else: hcode[code]=1 if not args.m: print >>out,"CD "+code,"\t",bed print >>out,"" print >>out,"" else: print >>out,bed,"\t","CD "+code for i,x in enumerate(hits): print >>out,"#",x,"/",query_num,"overlap with No."+str(i+1),args.db[i] for key in sorted(hcode.keys()): print >>out,"# code:"+key,"\t",hcode[key]
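# The per-query code built above is "@" followed by one 0/1 flag per
# annotation database, so the trailing "# code:@101" summary lines count
# queries by overlap pattern. The same encoding in isolation (dbis are
# assumed to be bam2x DBI handles as above):
def overlap_code(bed, dbis):
    # one flag per annotation source: 1 if the query hits it at least once
    return "@" + "".join("1" if any(True for _ in dbi.query(bed)) else "0" for dbi in dbis)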
def Main(): """ IO TEMPLATE """ global args, out args = ParseArg() if args.output == "stdout": out = sys.stdout else: try: out = open(args.output, "w") except IOError: print >>sys.stderr, "can't open file ", args.output, "to write. Using stdout instead" out = sys.stdout """ END OF IO TEMPLATE """ print >> out, "# QUERY (VCF A):", args.VCF_A print >> out, "# DATA (VCF B):", args.VCF_B print >> out, "# A11 VCF in A and B, and alt nt is the same : A VCF entry" print >> out, "# B11 VCF in A and B, and alt nt is the same : B VCF entry" print >> out, "# A12 VCF position in A and B, and alt nt is not the same : A VCF entry" print >> out, "# B12 VCF position in A and B, and alt nt is not the same : B VCF entry" print >> out, "# A10 VCF, only exists in A" print >> out, "# B01 VCF, only exists in B" print >>sys.stderr, "Initialize data: reading ", args.VCF_A VCF_A_DBI = DBI.init(args.VCF_A, "vcf") print >>sys.stderr, "Initialize data: reading ", args.VCF_B VCF_B_DBI = DBI.init(args.VCF_B, "vcf") A11 = 0 A12 = 0 A10 = 0 B01 = 0 i0 = 0 print >>sys.stderr, "Query ", args.VCF_A for (x, i) in enumerate(VCF_A_DBI): if x % 1000 == 0: print >>sys.stderr, x, " entries\r", flag = 0 hit = None for j in VCF_B_DBI.query(i): if i == j: hit = j flag = 1 continue else: hit = j flag = 2 if flag == 1: print >> out, "A11_%08d\t" % A11, i print >> out, "B11_%08d\t" % A11, hit print >> out, "" A11 += 1 elif flag == 2: print >> out, "A12_%08d\t" % A12, i print >> out, "B12_%08d\t" % A12, hit print >> out, "" A12 += 1 else: print >> out, "A10_%08d\t" % A10, i print >> out, "" print >> out, "" A10 += 1 print >>sys.stderr, "Query ", args.VCF_B for (x, i) in enumerate(VCF_B_DBI): if x % 1000 == 0: print >>sys.stderr, x, " entries\r", flag = 0 for j in VCF_A_DBI.query(i): flag = 1 if flag == 0: print >> out, "B01_%08d\t" % B01, i print >> out, "" print >> out, "" B01 += 1 print >> out, "# [AB]11 number:", A11 print >> out, "# [AB]12 number:", A12 print >> out, "# A10 number:", A10 print >> out, "# B01 number:", B01
def Main():
    t1 = time()
    global min_interaction, p_value
    args = ParseArg()
    inp = open(args.input, 'r')
    min_clusterS = args.min_clusterS
    min_interaction = args.min_interaction
    p_value = args.p_value
    output = open(args.output, 'w')
    ncpus = args.parallel
    # store genomic locations of part1 and part2
    part1 = []
    part2 = []
    k = 0
    print >> sys.stderr, "# Reading data..."
    chr_list = []
    for line in inp.read().split('\n'):
        if line == '': continue
        line = line.strip().split('\t')
        p1 = annotated_bed(line[0:10], id=k)
        p2 = annotated_bed(line[11:], id=k)
        if isinstance(p1.start, list):
            p1.start = int(p1.start[0])
            p1.end = int(p1.end[-1])
        if isinstance(p2.start, list):
            p2.start = int(p2.start[0])
            p2.end = int(p2.end[-1])
        if SingleFragment(p1, p2): continue
        k += 1
        part1.append(p1)
        part2.append(p2)
        if p1.chr not in chr_list: chr_list.append(p1.chr)
        if p2.chr not in chr_list: chr_list.append(p2.chr)
        if k % 20000 == 0:
            print >> sys.stderr, "  Reading %d pairs of segments\r" % (k),
    print >> sys.stderr, "Read %d pairs in total." % (k)
    if len(part1) != len(part2):
        print >> sys.stderr, "## ERROR: the numbers of regions in the two parts do not match!"
        sys.exit(1)
    # sort in genomic order, which makes clustering easy
    part1 = sorted(part1, key=attrgetter('start'))
    part1 = sorted(part1, key=attrgetter('chr'))
    part2 = sorted(part2, key=attrgetter('start'))
    part2 = sorted(part2, key=attrgetter('chr'))
    # parallel clustering, one job per chromosome
    print >> sys.stderr, "# Generating clusters for two parts..."
    ppservers = ()  # tuple of all parallel python servers to connect with
    job_server = pp.Server(ncpus, ppservers=ppservers)
    jobs1 = []
    jobs2 = []
    for chro in chr_list:
        part1_temp = filter(lambda p: p.chr == chro, part1)
        if len(part1_temp) > 0:
            jobs1.append(job_server.submit(cluster_regions, (part1_temp, min_clusterS), (annotated_bed,), ("UnionFind", "copy",)))
        part2_temp = filter(lambda p: p.chr == chro, part2)
        if len(part2_temp) > 0:
            jobs2.append(job_server.submit(cluster_regions, (part2_temp, min_clusterS), (annotated_bed,), ("UnionFind", "copy",)))
    cluster_pool1 = {}
    part1 = []
    for job in jobs1:
        try:
            part1 = part1 + job()[1]
            cluster_pool1.update(job()[0])
        except:
            print >> sys.stderr, "Error in %s, part1" % (job()[2])
            continue
    cluster_pool2 = {}
    part2 = []
    for job in jobs2:
        try:
            part2 = part2 + job()[1]
            cluster_pool2.update(job()[0])
        except:
            continue
    print >> sys.stderr, "  cluster number for part1 is %d" % (len(cluster_pool1))
    print >> sys.stderr, "  cluster number for part2 is %d" % (len(cluster_pool2))
    # sort back by id to re-pair the two parts
    part1 = sorted(part1, key=attrgetter('id'))
    part2 = sorted(part2, key=attrgetter('id'))
    print >> sys.stderr, "size of part1&2:", len(part1), len(part2)
    c_interaction = {}
    for i in range(len(part1)):
        try:
            inter = part1[i].cluster + "--" + part2[i].cluster
        except:
            print >> sys.stderr, i, part1[i].cluster, part2[i].cluster
            sys.exit()
        if c_interaction.has_key(inter):
            c_interaction[inter] += 1
        else:
            c_interaction[inter] = 1
    # index annotation files
    print >> sys.stderr, "# Indexing annotation files"
    dbi_all = DBI.init(args.annotation, "bed")
    dbi_detail = DBI.init(args.db_detail, "bed")
    dbi_repeat = DBI.init("/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt", "bed")
    print >> sys.stderr, "# Finding strong interactions from clusters..."
    k = 0  # number of strong interactions reported
    n = 0  # number of candidate interactions processed
    for interaction in c_interaction:
        n = n + 1
        count = c_interaction[interaction]
        if count < min_interaction: continue
        i = interaction.split("--")[0]
        j = interaction.split("--")[1]
        try:
            # clusters smaller than min_clusterS were discarded earlier, so
            # some interactions cannot be found in the cluster pools
            count1 = cluster_pool1[i].cluster
            count2 = cluster_pool2[j].cluster
        except:
            continue
        real_p = 1 - hypergeom.cdf(count, len(part1), count1, count2)
        if real_p <= p_value:
            k = k + 1
            cluster_pool1[i].Annotate(dbi_all, dbi_detail, dbi_repeat)
            cluster_pool2[j].Annotate(dbi_all, dbi_detail, dbi_repeat)
            try:
                log_p = math.log(real_p)
            except:
                log_p = -float("Inf")
            print >> output, str(cluster_pool1[i]) + '\t' + str(cluster_pool2[j]) + '\t%d\t%.4f' % (count, log_p)
        if n % 1000 == 0:
            print >> sys.stderr, "  Progress ( %d / %d )\r" % (n, len(c_interaction)),
    print >> sys.stderr, "# Found %d strong interactions. Time cost: %.2f s" % (k, time() - t1)
    if args.FDR:
        print >> sys.stderr, "# Permuted results:"
        for i in range(10):
            shuffle(part2)
            [n_r_I, n_r_SI] = Random_strongInteraction(part1, part2, cluster_pool1, cluster_pool2)
            print >> sys.stderr, "  ", i, n_r_I, n_r_SI, n_r_SI * 1.0 / n_r_I
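# The FDR loop above estimates significance empirically: shuffling part2
# breaks the pairing while preserving both marginals, so the fraction of
# "strong" calls among permuted pairs approximates the false-positive rate.
# One common way to turn those ten permutations into a single FDR estimate
# (a sketch with illustrative names, not the bam2x definition):
def empirical_fdr(observed_strong, observed_total, permuted_ratios):
    # permuted_ratios: the n_r_SI / n_r_I value from each shuffle
    expected = sum(permuted_ratios) / float(len(permuted_ratios))
    return expected / (float(observed_strong) / observed_total)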
def Main():
    ''' IO TEMPLATE '''
    global args,out
    args=ParseArg()
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    ''' END OF IO TEMPLATE '''
    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)
    hSites={}
    donorSites={}
    acceptorSites={}
    if args.genome is not None:
        genome=DBI.init(args.genome,"genome")
    else:
        genome=None
    j=0
    for j,i in enumerate(TableIO.parse(fin,"bam2bed12",references=fin.references,strand=args.strand)):
        if j%1000==0:
            print >>sys.stderr,"processing ",j,"reads \r",
        for intron in i.Introns():
            if len(intron)<args.intron_min_length: continue
            donor=intron.head()
            donorID=bedToID(donor)
            if donorSites.has_key(donorID):
                donorSites[donorID]+=1
            else:
                donorSites[donorID]=1
            acceptor=intron.tail()
            acceptorID=bedToID(acceptor)
            if acceptorSites.has_key(acceptorID):
                acceptorSites[acceptorID]+=1
            else:
                acceptorSites[acceptorID]=1
            '''
            if genome is not None:
                s=genome.query(intron.head()).upper()+".."+genome.query(intron.tail()).upper()
                if hSites.has_key(s): hSites[s]+=1
                else: hSites[s]=1
            '''
    donors=[]
    for key in donorSites.keys():
        a=key.split("\t")
        donors.append(Bed([a[0],a[1],a[2],"noname_donor",donorSites[key],a[3]]))
    donors.sort()
    for i,x in enumerate(donors):
        x.id="donor_"+str(i)
        if genome is not None:  # guard: -g/--genome may be omitted
            print >>out,x,"\t",genome.query(x).upper()
        else:
            print >>out,x
    acceptors=[]
    for key in acceptorSites.keys():
        a=key.split("\t")
        acceptors.append(Bed([a[0],a[1],a[2],"noname_acceptor",acceptorSites[key],a[3]]))
    acceptors.sort()
    for i,x in enumerate(acceptors):
        x.id="acceptor_"+str(i)
        if genome is not None:
            print >>out,x,"\t",genome.query(x).upper()
        else:
            print >>out,x
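# intron.head()/intron.tail() above are assumed to be the short intervals at
# the intron's 5' and 3' ends; querying genome sequence there should recover
# the canonical GT...AG dinucleotides on the plus strand. A sketch over
# plain coordinates (illustrative only):
def splice_dinucleotides(chrom_seq, intron_start, intron_end):
    # chrom_seq: chromosome string; 0-based, half-open intron coordinates
    donor = chrom_seq[intron_start:intron_start + 2]    # usually "GT"
    acceptor = chrom_seq[intron_end - 2:intron_end]     # usually "AG"
    return donor.upper(), acceptor.upper()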
def Main():
    args=ParseArg()
    pair_dist=args.pair_dist
    step=args.step
    print "\nChecking if linkedPair file is tabixed..."
    if not os.path.isfile(args.linkedPair):
        print "LinkedPair file does not exist, please check!"
        sys.exit(1)
    if not os.path.isfile(args.linkedPair+".tbi"):
        print "  tabix-ing..."
        os.system("sort -k1,1 -k2,2n "+args.linkedPair+" > temp_linkedPair.txt")
        os.system("bgzip temp_linkedPair.txt")
        os.system("tabix -p bed temp_linkedPair.txt.gz")
        linkedPair='temp_linkedPair.txt.gz'
    else:
        linkedPair=args.linkedPair
        print "  linkedPair file is tabixed."
    print "\nTabixing the interaction file..."
    os.system("sort -k1,1 -k2,2n "+args.interaction+" > temp_interaction.txt")
    os.system("bgzip temp_interaction.txt")
    os.system("tabix -p bed temp_interaction.txt.gz")
    print "  interaction file is tabixed."
    # start column numbers of the second region:
    # s1 for the interaction file, s2 for the linkedPair file
    (s1,s2)=args.start
    print "\nGet region information."
    if args.r:
        Region = read_region(args.r)
    elif args.name:
        os.system('grep "%s" %s > temp2.txt'%(args.name,args.genebed))
        g = open("temp2.txt").read().split('\t')
        if len(g)<2:
            print >> sys.stderr, "Error: the gene name is not found in the database"
            sys.exit(1)
        s = int(g[1])
        e = int(g[2])
        Region = Bed([g[0],s-(e-s)/10,e+(e-s)/10,"region",".","."])
    else:
        print >> sys.stderr, "Error: need to specify a region with '-r' or a gene name with '-n'"
        sys.exit(1)
    print "\n Start plotting heatmaps on region: "+Region.str_region()
    fig = plt.figure(figsize=(8,6))
    ax = plt.subplot(111,frameon=False,yticks=[])
    start = Region.start
    end = Region.stop
    ax.set_xlim(start,end)
    # set x tick labels without offset notation
    locs=ax.get_xticks()
    ax.set_xticklabels(map(lambda x: "%i"%x, locs),fontsize=6)
    print "\nStart drawing gene track"
    gene_dbi=DBI.init(args.genebed,"bed")
    print "  genebed indexed!"
    print "  Plot gene track"
    gene_top=Genetrack(Region,gene_dbi,ax,0.08)
    h = 1.5*step/(end-start)  # unit height of the triangles/polygons in the heatmap
    print "\nQuery linkedPairs within the specified region"
    os.system("tabix "+linkedPair+" %s:%i-%i > temp2.txt"%(Region.chr,Region.start,Region.stop))
    Count = {}
    for b in read_interaction("temp2.txt",s2):
        if args.Slim and SingleFragment(b[0],b[1],pair_dist): continue
        if Region.overlap(b[0],0) and Region.overlap(b[1],0):
            if b[0].strand=='-':
                i = b[0].start
            else:
                i = b[0].stop
            if b[1].strand=='-':
                j = b[1].start
            else:
                j = b[1].stop
            # snap both ends up to the enclosing grid point
            i = (i/step+1) * step
            j = (j/step+1) * step
            if i > j:
                i, j = j, i
            if (i,j) not in Count:
                Count[(i,j)] = 1
            else:
                Count[(i,j)] += 1
    print Count
    patches = []
    colors = []
    for i in range(start,end+1):
        if i%step!=0: continue
        for j in range(i,end+1):
            if j%step!=0 or (i,j) not in Count: continue
            patches.append(PatchGen(i,j,h,step,gene_top+0.01))
            colors.append(np.log(Count[(i,j)]+1))
    p = PatchCollection(patches, cmap=matplotlib.cm.Reds, alpha=0.7, edgecolor='k', linewidths=0.1)
    p.set_array(np.array(colors))
    ax.add_collection(p)
    ax.set_ylim(0,((end-start)/step+2)*h+gene_top+0.01)
    plt.colorbar(p)
    if not args.SI:
        plt.savefig(args.output)
        plt.show()
        # remove temp files
        os.system("rm temp_interaction.txt.gz*")
        if not os.path.isfile(args.linkedPair+".tbi"):
            os.system("rm temp_linkedPair.txt.gz*")
        os.system("rm temp2.txt")
        sys.exit(0)
    print "\nQuery interactions"
    os.system("tabix temp_interaction.txt.gz %s:%i-%i > temp2.txt"%(Region.chr,Region.start,Region.stop))
    print "\nList of interactions plotted: "
    k=1
    cmap=cm.get_cmap('Paired', 10)
    cmap=cmap(range(10))
    bottom = gene_top+0.01
    for b in read_interaction("temp2.txt",s1):
        if b[0].overlap(b[1],0): continue
        if Region.overlap(b[1],0):
            k+=1
            if b[1].stop > b[0].stop:
                start1, end1 = b[0].start, b[0].stop
                start2, end2 = b[1].start, b[1].stop
            else:
                start1, end1 = b[1].start, b[1].stop
                start2, end2 = b[0].start, b[0].stop
            P1=Polygon([[start1,bottom],[end1,bottom],[(end1+end2)*0.5,(end2-end1)*h/step+bottom],[(start1+end2)*0.5,(end2-start1)*h/step+bottom]],True,facecolor='none',edgecolor=cmap[k%10],alpha=0.4,lw=0.5)
            P2=Polygon([[start2,bottom],[end2,bottom],[(start1+end2)*0.5,(end2-start1)*h/step+bottom],[(start1+start2)*0.5,(start2-start1)*h/step+bottom]],True,facecolor='none',edgecolor=cmap[k%10],alpha=0.4,lw=0.5)
            ax.add_patch(P1)
            ax.add_patch(P2)
            print "  "+b[0].str_region()+" <-> "+b[1].str_region()
    plt.savefig(args.output)
    plt.show()
    # remove temp files
    os.system("rm temp_interaction.txt.gz*")
    if not os.path.isfile(args.linkedPair+".tbi"):
        os.system("rm temp_linkedPair.txt.gz*")
    os.system("rm temp2.txt")
def Main(): global args,out args=ParseArg() dict={} if args.output=="stdout": out=sys.stdout else: try: out=open(args.output,"w") except IOError: print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead" out=sys.stdout argv=sys.argv argv[0]=argv[0].split("/")[-1] print >>out,"# This data was generated by program ",argv[0],"(version %s)"%VERSION, print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )" print >>out,"# Date: ",time.asctime() print >>out,"# The command line is :\n#\t"," ".join(argv) init_dict={} if args.dbformat=="guess": if Tools.suffix(args.db)=="gz": args.dbformat="tabix" args.tabix_format=Tools.guess_format(args.db) else: args.dbformat=Tools.guess_format(args.db) if args.query_method: dict["method"]=args.query_method if args.tabix_format: init_dict["tabix"]=args.tabix_format dbi=DBI.init(args.db,args.dbformat,**init_dict) hits=0 query=0 if args.input=="stdin": input=sys.stdin else: input=args.input query_length=0 hits_number=0 if (args.input_format=="guess"): args.input_format=Tools.guess_format(args.input) for (i0,x) in enumerate(TableIO.parse(input,args.input_format)): if i0%100==0: print >>sys.stderr,"query ",i0," entries\r", print >>out,"QR\t",x hit=0 query+=1 query_length+=len(x) #print dbi;#debug results=dbi.query(x,**dict) #results=dbi.query(x) #DEBUG #print >>sys.stderr,type(results) if isinstance(results,numpy.ndarray) or isinstance(results,list): print >>out,"HT\t", for value in results: print >>out,str(value)+",", print >>out,"" hit=1 hits_number+=1 elif isinstance(results,str): print >>out,"HT\t", print >>out,results hit=1 hits_number+=1 else: for j in results: print >>out,"HT\t",j hit=1 hits_number+=1 if args.dbformat=="tabix": x.chr=x.chr.replace("chr","") for j in dbi.query(x,**dict): print >>out,"HT\t",j hit=1 hits_number+=1 hits+=hit print >>out,"# Query Number:",query,"\n# Query Have Hits:",hits print >>out,"# Query Length:",query_length print >>out,"# Hits Number:",hits_number
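# Tools.suffix()/Tools.guess_format() above drive the dispatch: a ".gz"
# database is opened through tabix and its record type guessed from the
# remaining file suffix. A sketch of suffix-based guessing; the mapping is
# an assumption, and the real rules in bam2x may differ:
def guess_format_sketch(path):
    if path.endswith(".gz"):
        path = path[:-3]
    suffix = path.split(".")[-1].lower()
    known = {"bed": "bed", "bam": "bam", "vcf": "vcf", "gtf": "gtf"}
    return known.get(suffix, "bed")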
def Main(): ''' IO TEMPLATE ''' global args,out args=ParseArg() if args.output=="stdout": out=sys.stdout else: try: out=open(args.output,"w") except IOError: print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead" out=sys.stdout if args.input=="stdin": fin=sys.stdin else: try: x=args.input.split(".") if x[-1]=="gz": fin=gzip.open(args.input,"r") else: fin=open(args.input,"r") except IOError: print >>sys.stderr,"can't read file",args.input fin=sys.stdin ''' END OF IO TEMPLATE ''' print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION, print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )" print >>out,"# Date: ",time.asctime() print >>out,"# The command line is :" print >>out,"#\t"," ".join(sys.argv) gene=DBI.init(args.genetab,args.gene_format); upstream_list=[] downstream_list=[] exons_list=[] introns_list=[] utr3_list=[] utr5_list=[] for g in gene: upstream_list.append(g.upstream(args.upstream)); downstream_list.append(g.downstream(args.downstream)); for e in g.Exons(): exons_list.append(e) for i in g.Introns(): introns_list.append(i) if not (g.utr3() is None): utr3_list.append(g.utr3()) if not (g.utr5() is None): utr5_list.append(g.utr5()) upstream=DBI.init(upstream_list,"bed") downstream=DBI.init(downstream_list,"bed") exons=DBI.init(exons_list,"bed") introns=DBI.init(introns_list,"bed") utr3=DBI.init(utr3_list,"genebed") utr5=DBI.init(utr5_list,"genebed") print >>out,"#chr\tstart\tend\tname\tscore\tstrand\tgene\tupstream\tdownstream\texon\tintron\tutr3\tutr5" for i in TableIO.parse(fin,args.format): print >>out,i, print >>out,"\t",toIDs(gene.query(i)), print >>out,"\t",toIDs(upstream.query(i)), print >>out,"\t",toIDs(downstream.query(i)), print >>out,"\t",toIDs(exons.query(i)), print >>out,"\t",toIDs(introns.query(i)), print >>out,"\t",toIDs(utr3.query(i)), print >>out,"\t",toIDs(utr5.query(i))
def Main():
    ''' IO TEMPLATE '''
    global args,out
    args=ParseArg()
    if args.output=="stdout":
        out=sys.stdout
    else:
        try:
            out=open(args.output,"w")
        except IOError:
            print >>sys.stderr,"can't open file ",args.output,"to write. Using stdout instead"
            out=sys.stdout
    ''' END OF IO TEMPLATE '''
    print >>out,"# QUERY (VCF A):",args.VCF_A
    print >>out,"# DATA (VCF B):",args.VCF_B
    print >>out,"# A11 VCF in A and B, and alt nt is the same : A VCF entry"
    print >>out,"# B11 VCF in A and B, and alt nt is the same : B VCF entry"
    print >>out,"# A12 VCF position in A and B, and alt nt is not the same : A VCF entry"
    print >>out,"# B12 VCF position in A and B, and alt nt is not the same : B VCF entry"
    print >>out,"# A10 VCF, only exists in A"
    print >>out,"# B01 VCF, only exists in B"
    print >>sys.stderr,"Initialize data: reading ",args.VCF_A
    VCF_A_DBI=DBI.init(args.VCF_A,"vcf")
    print >>sys.stderr,"Initialize data: reading ",args.VCF_B
    VCF_B_DBI=DBI.init(args.VCF_B,"vcf")
    A11=0
    A12=0
    A10=0
    B01=0
    print >>sys.stderr,"Query ",args.VCF_A
    for (x,i) in enumerate(VCF_A_DBI):
        if x%1000==0:
            print >>sys.stderr,x," entries\r",
        flag=0
        hit=None
        for j in VCF_B_DBI.query(i):
            if i==j:
                hit=j
                flag=1
                break  # an exact match is final; do not let a later mismatch downgrade it
            else:
                hit=j
                flag=2
        if flag==1:
            print >>out,"A11_%08d\t"%A11,i
            print >>out,"B11_%08d\t"%A11,hit
            print >>out,""
            A11+=1
        elif flag==2:
            print >>out,"A12_%08d\t"%A12,i
            print >>out,"B12_%08d\t"%A12,hit
            print >>out,""
            A12+=1
        else:
            print >>out,"A10_%08d\t"%A10,i
            print >>out,""
            print >>out,""
            A10+=1
    print >>sys.stderr,"Query ",args.VCF_B
    for (x,i) in enumerate(VCF_B_DBI):
        if x%1000==0:
            print >>sys.stderr,x," entries\r",
        flag=0
        for j in VCF_A_DBI.query(i):
            flag=1
        if flag==0:
            print >>out,"B01_%08d\t"%B01,i
            print >>out,""
            print >>out,""
            B01+=1
    print >>out,"# [AB]11 number:",A11
    print >>out,"# [AB]12 number:",A12
    print >>out,"# A10 number:",A10
    print >>out,"# B01 number:",B01
def Main(): global args, out args = ParseArg() dict = {} if args.output == "stdout": out = sys.stdout else: try: out = open(args.output, "w") except IOError: print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead" out = sys.stdout argv = sys.argv argv[0] = argv[0].split("/")[-1] print >> out, "# This data was generated by program ", argv[ 0], "(version %s)" % VERSION, print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )" print >> out, "# Date: ", time.asctime() print >> out, "# The command line is :\n#\t", " ".join(argv) init_dict = {} if args.dbformat == "guess": if Tools.suffix(args.db) == "gz": args.dbformat = "tabix" args.tabix_format = Tools.guess_format(args.db) else: args.dbformat = Tools.guess_format(args.db) if args.query_method: dict["method"] = args.query_method if args.tabix_format: init_dict["tabix"] = args.tabix_format dbi = DBI.init(args.db, args.dbformat, **init_dict) hits = 0 query = 0 if args.input == "stdin": input = sys.stdin else: input = args.input query_length = 0 hits_number = 0 if (args.input_format == "guess"): args.input_format = Tools.guess_format(args.input) for (i0, x) in enumerate(TableIO.parse(input, args.input_format)): if i0 % 100 == 0: print >> sys.stderr, "query ", i0, " entries\r", print >> out, "QR\t", x hit = 0 query += 1 query_length += len(x) #print dbi;#debug results = dbi.query(x, **dict) #results=dbi.query(x) #DEBUG #print >>sys.stderr,type(results) if isinstance(results, numpy.ndarray) or isinstance(results, list): print >> out, "HT\t", for value in results: print >> out, str(value) + ",", print >> out, "" hit = 1 hits_number += 1 elif isinstance(results, str): print >> out, "HT\t", print >> out, results hit = 1 hits_number += 1 else: for j in results: print >> out, "HT\t", j hit = 1 hits_number += 1 if args.dbformat == "tabix": x.chr = x.chr.replace("chr", "") for j in dbi.query(x, **dict): print >> out, "HT\t", j hit = 1 hits_number += 1 hits += hit print >> out, "# Query Number:", query, "\n# Query Have Hits:", hits print >> out, "# Query Length:", query_length print >> out, "# Hits Number:", hits_number
def Main(): t1 = time() global min_interaction, p_value args = ParseArg() inp = open(args.input, 'r') min_clusterS = args.min_clusterS min_interaction = args.min_interaction p_value = args.p_value output = open(args.output, 'w') ncpus = args.parallel #store genomic location of part1 and part2 part = [] k = 0 print >> sys.stderr, "# Inputing data..." chr_list = [] for line in inp.read().split('\n'): if line == '': continue line = line.strip().split('\t') p1 = annotated_bed(line[0:8], id=k, part=1) p2 = annotated_bed(line[9:], id=k, part=2) if SingleFragment(p1, p2): continue k += 1 part.append(p1) part.append(p2) if p1.chr not in chr_list: chr_list.append(p1.chr) if p2.chr not in chr_list: chr_list.append(p2.chr) if k % 20000 == 0: print >> sys.stderr, " Reading %d pairs of segments\r" % (k), print >> sys.stderr, "Get total %d pairs." % (k) # sort in genomic order, easy for clustering part = sorted(part, key=attrgetter('start')) part = sorted(part, key=attrgetter('chr')) # for parallel computing print >> sys.stderr, "# Generating clusters for two parts..." # tuple of all parallel python servers to connect with ppservers = () job_server = pp.Server(ncpus, ppservers=ppservers) jobs = [] for chro in chr_list: part_temp = filter(lambda p: p.chr == chro, part) if len(part_temp) > 0: jobs.append( job_server.submit(cluster_regions, (part_temp, min_clusterS), (annotated_bed, ), ( "UnionFind", "copy", ))) cluster_pool = {} part = [] for job in jobs: try: part = part + job()[1] cluster_pool.update(job()[0]) except: print >> sys.stderr, "Wrong in %s, part1" % (job()[2]) continue print >> sys.stderr, " cluster number is %d " % ( len(cluster_pool)) # sort back to pair two parts part = sorted(part, key=attrgetter('part')) part = sorted(part, key=attrgetter('id')) print >> sys.stderr, "size of part", len(part) c_interaction = {} i = 0 while i < len(part): P1 = part[i] P2 = part[i + 1] assert P1.id == P2.id i += 2 print >> sys.stderr, "%d\r" % (i), if P1.cluster == P2.cluster: continue if P1.cluster < P2.cluster: inter = P1.cluster + "--" + P2.cluster else: inter = P2.cluster + "--" + P1.cluster if c_interaction.has_key(inter): c_interaction[inter] += 1 else: c_interaction[inter] = 1 # annotation file print >> sys.stderr, "# Indexing annotation files" dbi_all = DBI.init(args.annotation, "bed") dbi_detail = DBI.init(args.db_detail, "bed") dbi_repeat = DBI.init( "/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt", "bed") print >> sys.stderr, "# finding strong interactions from clusters..." k = 0 # record for strong interactions n = 0 # annotation file for interaction in c_interaction: n = n + 1 count = c_interaction[interaction] if count < min_interaction: continue i = interaction.split("--")[0] j = interaction.split("--")[1] try: # we select clusters with size no less than 5, so some interactions cannot be found in clusters count1 = cluster_pool[i].cluster count2 = cluster_pool[j].cluster except: continue real_p = 1 - hypergeom.cdf(count, len(part) / 2, count1, count2) if real_p <= p_value: k = k + 1 cluster_pool[i].Annotate(dbi_all, dbi_detail, dbi_repeat) cluster_pool[j].Annotate(dbi_all, dbi_detail, dbi_repeat) try: log_p = math.log(real_p) except: log_p = -float("Inf") print >> output, str(cluster_pool[i]) + '\t' + str( cluster_pool[j]) + '\t%d\t%.4f' % (count, log_p) if n % 1000 == 0: print >> sys.stderr, " Progress ( %d / %d )\r" % ( n, len(c_interaction)), print >> sys.stderr, "# Find %d strong interactions. Cost time: %.2f s" % ( k, time() - t1)
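# cluster_regions() above (run per chromosome through parallel python) is
# assumed to merge overlapping segments, using the imported UnionFind, and
# to keep clusters with enough supporting segments. On start-sorted input a
# sweep line gives the same grouping; a minimal sketch on (start, end)
# tuples with illustrative names:
def cluster_sorted_intervals(intervals, min_size):
    clusters = []
    for start, end in intervals:                 # sorted by start
        if clusters and start <= clusters[-1][1]:
            last = clusters[-1]                  # overlaps: extend the cluster
            clusters[-1] = (last[0], max(last[1], end), last[2] + 1)
        else:
            clusters.append((start, end, 1))     # open a new cluster
    # keep clusters supported by at least min_size segments
    return [c for c in clusters if c[2] >= min_size]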
def Main(): global args, out args = ParseArg() dict = {} if args.output == "stdout": out = sys.stdout else: try: out = open(args.output, "w") except IOError: print >> sys.stderr, "can't open file ", args.output, "to write. Using stdout instead" out = sys.stdout argv = sys.argv argv[0] = argv[0].split("/")[-1] print >> out, "# This data was generated by program ", argv[ 0], "(version %s)" % VERSION, print >> out, "in bam2x ( https://github.com/nimezhu/bam2x )" print >> out, "# Date: ", time.asctime() print >> out, "# The command line is :\n#\t", " ".join(argv) if args.query_method: dict["method"] = args.query_method dbi = DBI.init(args.db, args.dbformat) hits = 0 query = 0 if args.input == "stdin": input = sys.stdin else: input = args.input query_length = 0 hits_number = 0 for (i0, x) in enumerate(TableIO.parse(input, args.input_format)): if i0 % 10 == 0: print >> sys.stderr, "query ", i0, " entries\r", print >> out, "QR\t", x hit = 0 query += 1 query_length += len(x) results = dbi.query(x, **dict) compatible = 0 #print >>sys.stderr,type(results) if isinstance(results, numpy.ndarray) or isinstance(results, list): if not args.silence: print >> out, "HT\t", for value in results: print >> out, str(value) + ",", print >> out, "" hit = 1 hits_number += 1 elif isinstance(results, str): if not args.silence: print >> out, "HT\t", print >> out, results hit = 1 hits_number += 1 else: this_query_hits = 0 for j in results: if not args.silence: print >> out, "HT\t", j, hit = 1 hits_number += 1 this_query_hits += 1 if isinstance(j, xplib.Annotation.Bed12) and isinstance( x, xplib.Annotation.Bed12): compatible_binary = Tools.compatible_with_transcript(j, x) if not args.silence: print >> out, "\tCompatible:", compatible_binary if compatible_binary: compatible += 1 else: if not args.silence: print >> out, "" print >> out, "HN\t", this_query_hits if compatible > 0: print >> out, "CP\t", compatible if args.dbformat == "tabix": x.chr = x.chr.replace("chr", "") for j in dbi.query(x, **dict): print >> out, "HT\t", j hit = 1 hits_number += 1 hits += hit print >> out, "# Query Number:", query, "\n# Query Have Hits:", hits print >> out, "# Query Length:", query_length print >> out, "# Hits Number:", hits_number
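# Tools.compatible_with_transcript() above decides whether an aligned read
# could originate from a transcript. A common definition is that the read's
# splice junctions must all appear in the transcript's junction chain; this
# sketch over exon-block lists is an assumption, not the bam2x definition:
def compatible_sketch(read_blocks, transcript_blocks):
    # blocks: [(start, end), ...] in genomic order; junctions are the gaps
    read_juncs = [(read_blocks[i][1], read_blocks[i + 1][0]) for i in range(len(read_blocks) - 1)]
    tx_juncs = set((transcript_blocks[i][1], transcript_blocks[i + 1][0]) for i in range(len(transcript_blocks) - 1))
    return all(j in tx_juncs for j in read_juncs)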