def summary(bwfile1, bwfile2, bwfile_add, bedfile, topnumber, out):
    """Rank BED regions by combined signal and write per-bin profiles.

    First pass: every record of ``bedfile`` has its 4th column replaced by
    "-" and, when its chromosome is listed in both ``bwfile1`` and
    ``bwfile2``, is scored with the sum of the two files' mean signal over
    the region (``sum_data / valid_count`` of a one-bin summary, or 0 when
    ``valid_count`` is 0).

    Second pass: the ``topnumber`` best-scoring records are written to
    ``out`` as tab-separated lines holding the record, its score and one
    comma-joined ``sum_data`` vector per BigWig file (``bwfile1``,
    ``bwfile2``, then every entry of ``bwfile_add`` in order).  These
    summaries pass ``end - start`` as the fourth ``summarize`` argument.

    The elapsed time of each pass is printed to stdout.

    Args:
        bwfile1: path of the first BigWig file.
        bwfile2: path of the second BigWig file.
        bwfile_add: iterable of additional BigWig paths (profiles only).
        bedfile: whitespace-separated region file with at least 4 columns
            (chrom, start, end, name).
        topnumber: maximum number of regions to write.
        out: path of the output file.

    Raises:
        IndexError: if a record has fewer than four columns.
    """
    chroms1 = set(node['key'] for node in BwIO(bwfile1).chromosomeTree['nodes'])
    chroms2 = set(node['key'] for node in BwIO(bwfile2).chromosomeTree['nodes'])

    opened = []  # raw file objects behind every BigWigFile, closed below

    def open_bigwig(path):
        handle = open(path, 'rb')
        opened.append(handle)
        return BigWigFile(handle)

    try:
        bw1 = open_bigwig(bwfile1)
        bw2 = open_bigwig(bwfile2)

        started = time.time()
        ranked = []
        with open(bedfile) as bed:
            for line in bed:
                fields = line.split()
                fields[3] = "-"
                chrom = fields[0]
                if chrom not in chroms1 or chrom not in chroms2:
                    continue
                start, end = int(fields[1]), int(fields[2])
                means = []
                for bw in (bw1, bw2):
                    stats = bw.summarize(chrom, start, end, 1)
                    if stats.valid_count == 0:
                        means.append(0)
                    else:
                        means.append((stats.sum_data / stats.valid_count)[0])
                ranked.append(fields + [means[0] + means[1]])
        ranked.sort(reverse=True, key=lambda rec: rec[-1])

        tracks = [bw1, bw2] + [open_bigwig(path) for path in bwfile_add]
        with open(out, 'w') as outf:
            print("scaning 1st  %s" % (time.time() - started))
            started = time.time()
            # max(..., 0) keeps a negative topnumber meaning "write nothing".
            for fields in ranked[:max(topnumber, 0)]:
                chrom = fields[0]
                start, end = int(fields[1]), int(fields[2])
                columns = [str(value) for value in fields]
                for bw in tracks:
                    stats = bw.summarize(chrom, start, end, end - start)
                    columns.append(",".join(map(str, stats.sum_data)))
                outf.write("\t".join(columns) + "\n")
    finally:
        for handle in opened:
            handle.close()
    print("scaning 2nd  %s" % (time.time() - started))
def summary(bwfile,bedfile,topnumber,out):
    """Rank BED regions by mean BigWig signal and write per-bin profiles.

    First pass: every record of ``bedfile`` has its 4th column replaced by
    "-" and, when its chromosome is listed in ``bwfile``, is scored with the
    mean signal over the region (``sum_data / valid_count`` of a one-bin
    summary, or 0 when ``valid_count`` is 0).

    Second pass: the best-scoring records are written to ``out`` as
    tab-separated lines holding the record, its score and the comma-joined
    ``sum_data`` vector obtained with ``end - start`` as the fourth
    ``summarize`` argument.  At most ``topnumber`` records are written; when
    fewer regions were scored, all of them are written instead of failing
    with an IndexError.

    The elapsed time of each pass is printed to stdout.

    Args:
        bwfile: path of the BigWig file.
        bedfile: whitespace-separated region file with at least 4 columns.
        topnumber: maximum number of regions to write.
        out: path of the output file.

    Raises:
        IndexError: if a record has fewer than four columns.
    """
    chroms = set(node['key'] for node in BwIO(bwfile).chromosomeTree['nodes'])

    with open(bwfile, 'rb') as bw_fh:
        bw = BigWigFile(bw_fh)

        started = time.time()
        ranked = []
        with open(bedfile) as bed:
            for line in bed:
                fields = line.split()
                fields[3] = "-"
                if fields[0] not in chroms:
                    continue
                stats = bw.summarize(fields[0], int(fields[1]), int(fields[2]), 1)
                if stats.valid_count == 0:
                    mean_value = 0
                else:
                    mean_value = (stats.sum_data / stats.valid_count)[0]
                ranked.append(fields + [mean_value])
        ranked.sort(reverse=True, key=lambda rec: rec[-1])

        with open(out, 'w') as outf:
            print("scaning 1st  %s" % (time.time() - started))
            started = time.time()
            # Never index past the end of the ranked list.
            for fields in ranked[:max(topnumber, 0)]:
                start, end = int(fields[1]), int(fields[2])
                stats = bw.summarize(fields[0], start, end, end - start)
                profile = ",".join(map(str, stats.sum_data))
                outf.write("\t".join([str(v) for v in fields] + [profile]) + "\n")
    print("scaning 2nd  %s" % (time.time() - started))
def get_signal(inputfile,output,signalbw,extend):
    """Append the mean signal around each region centre, one column per BigWig.

    For every line of ``inputfile`` whose chromosome is listed in the first
    BigWig file, the window ``[centre - extend, centre + 1 + extend)`` is
    summarized in one bin for every BigWig file, where
    ``centre = (start + end) // 2`` and the window start is clamped at 0.
    Each file contributes ``sum_data / valid_count`` (or '0' when
    ``valid_count`` is 0) as an extra column.

    A line is written to ``output`` only when every BigWig file could be
    summarized; if any ``summarize`` call raises, the whole line is dropped.

    Args:
        inputfile: whitespace-separated region file (chrom, start, end, ...).
        output: path of the tab-separated output file.
        signalbw: comma-separated list of BigWig paths.
        extend: number of bases added on each side of the centre base.
    """
    bw_paths = signalbw.strip().strip(',').split(',')
    chroms = set(node['key'] for node in BwIO(bw_paths[0]).chromosomeTree['nodes'])

    opened = []  # raw file objects behind the BigWigFile readers
    try:
        tracks = []
        for path in bw_paths:
            handle = open(path, 'rb')
            opened.append(handle)
            tracks.append(BigWigFile(handle))

        with open(inputfile) as inf, open(output, 'w') as outf:
            for line in inf:
                fields = line.split()
                if fields[0] not in chroms:
                    continue
                # Floor division keeps the centre an integer on Python 3 too.
                centre = (int(fields[1]) + int(fields[2])) // 2
                win_start = max(0, centre - extend)
                win_end = centre + 1 + extend

                values = []
                for bw in tracks:
                    try:
                        signal = bw.summarize(fields[0], win_start, win_end, 1)
                    except Exception:
                        # Best effort: a failing track drops the whole line.
                        break
                    if float(signal.valid_count) == 0:
                        values.append('0')
                    else:
                        values.append(str(float(signal.sum_data / signal.valid_count)))

                if len(values) == len(tracks):
                    outf.write("\t".join(fields + values) + "\n")
    finally:
        for handle in opened:
            handle.close()
def scan_fp(plusdnase, minusdnase, bed, out, upstream, downstream):
    """Write per-bin plus/minus signal around each region.

    For every line of ``bed`` whose chromosome is listed in ``plusdnase``
    and whose start is at least ``upstream``, both BigWig files are
    summarized over ``[start - upstream, end + downstream)`` with the window
    length passed as the fourth ``summarize`` argument.  The output line
    holds the first six input columns, then the plus ``sum_data`` values,
    then the minus ``sum_data`` values, tab-separated.

    Regions whose start is smaller than ``upstream`` are skipped.

    Args:
        plusdnase: path of the plus BigWig file (also supplies the
            chromosome list).
        minusdnase: path of the minus BigWig file.
        bed: whitespace-separated region file (chrom, start, end, ...).
        out: path of the output file.
        upstream: bases added before the region start.
        downstream: bases added after the region end.
    """
    chroms = set(node['key'] for node in BwIO(plusdnase).chromosomeTree['nodes'])

    with open(plusdnase, 'rb') as plus_fh, open(minusdnase, 'rb') as minus_fh, \
            open(bed) as inf, open(out, 'w') as outf:
        plus_bw = BigWigFile(plus_fh)
        minus_bw = BigWigFile(minus_fh)
        for line in inf:
            fields = line.split()
            if fields[0] not in chroms:
                continue
            start = int(fields[1])
            if start < upstream:
                continue
            win_start = start - upstream
            win_end = int(fields[2]) + downstream
            nbins = win_end - win_start

            plus = plus_bw.summarize(fields[0], win_start, win_end, nbins)
            minus = minus_bw.summarize(fields[0], win_start, win_end, nbins)

            row = fields[:6]
            row.extend(str(value) for value in plus.sum_data)
            row.extend(str(value) for value in minus.sum_data)
            outf.write("\t".join(row) + "\n")
def get_signal(inputfile, output, vp, vm, dp, dm):
    """Append four summed BigWig signals to each site and write a headed table.

    ``output`` starts with a fixed header line.  Every line of ``inputfile``
    that does not start with "chrom" and whose chromosome is listed in
    ``vp`` is extended with the one-bin ``sum_data`` of ``vp``, ``vm``,
    ``dp`` and ``dm`` (in that order) over ``[start - 50, end + 50)``.

    The window start is clamped at 0 so that sites closer than 50 bases to
    the chromosome start do not produce a negative coordinate.

    Args:
        inputfile: whitespace-separated site file (chrom, start, end, ...).
        output: path of the tab-separated output file.
        vp: BigWig path written to the "VehPlus" column (also supplies the
            chromosome list).
        vm: BigWig path written to the "VehMinus" column.
        dp: BigWig path written to the "DHTPlus" column.
        dm: BigWig path written to the "DHTMinus" column.
    """
    colnames = [
        "chrom", "start", "end", "seq", "motifscore", "strand",
        "LncapARsignal", "LncapDNaseCutsite", "LncapDNaseFrag",
        "K562DNaseFrag", "LncapFP", "K562FP", "overARpeak", "VehPlus",
        "VehMinus", "DHTPlus", "DHTMinus"
    ]
    chroms = set(node['key'] for node in BwIO(vp).chromosomeTree['nodes'])

    with open(vp, 'rb') as vp_fh, open(vm, 'rb') as vm_fh, \
            open(dp, 'rb') as dp_fh, open(dm, 'rb') as dm_fh, \
            open(inputfile) as inf, open(output, 'w') as outf:
        # Order matters: it fixes the order of the four appended columns.
        tracks = [BigWigFile(fh) for fh in (vp_fh, vm_fh, dp_fh, dm_fh)]
        outf.write("\t".join(colnames) + "\n")
        for line in inf:
            if line.startswith("chrom"):
                continue
            fields = line.split()
            if fields[0] not in chroms:
                continue
            win_start = max(int(fields[1]) - 50, 0)
            win_end = int(fields[2]) + 50
            for bw in tracks:
                signal = bw.summarize(fields[0], win_start, win_end, 1)
                fields.append(str(float(signal.sum_data)))
            outf.write("\t".join(fields) + "\n")
def get_signal(inputfile, output, Pbw, Nbw, score_range):
    """Append total signal and two strand-aware flank scores to each site.

    For every line of ``inputfile`` whose chromosome is listed in ``Pbw``,
    both BigWig files are summarized over ``[start - 100, start + 100)``
    (start clamped at 0) with 200 as the fourth ``summarize`` argument.
    Two slices of those vectors are summed per file:

    * left flank:  indices ``100 - score_range`` up to 100
    * right flank: indices ``100 + motif_len`` up to
      ``100 + motif_len + score_range``, with ``motif_len = end - start``

    On the "+" strand the left flank is "up", ``Pbw`` is the "same" strand
    and ``Nbw`` the "diff" strand; on the "-" strand all three are swapped.
    Three values are appended to the line and written to ``output``:

    * the sum of both full vectors
    * ``log2((up_same + 0.2) * (down_diff + 0.2) /
      ((up_diff + 0.2) * (down_same + 0.2)))``
    * ``sqrt(up_same) + sqrt(down_diff) - sqrt(up_diff) - sqrt(down_same)``

    A line whose 6th column is neither "+" nor "-" is printed and the
    process exits with status 1.

    Args:
        inputfile: whitespace-separated site file with the strand in
            column 6.
        output: path of the tab-separated output file.
        Pbw: first BigWig path (also supplies the chromosome list).
        Nbw: second BigWig path.
        score_range: number of vector positions summed in each flank.
    """
    pseudo = 0.2  # added to every flank sum before taking the log ratio
    chroms = set(node['key'] for node in BwIO(Pbw).chromosomeTree['nodes'])

    with open(Pbw, 'rb') as p_fh, open(Nbw, 'rb') as n_fh, \
            open(inputfile) as inf, open(output, 'w') as outf:
        PH = BigWigFile(p_fh)
        NH = BigWigFile(n_fh)
        for line in inf:
            fields = line.split()
            chrom = fields[0]
            if chrom not in chroms:
                continue
            site = int(fields[1])
            motif_len = int(fields[2]) - site
            win_start = max(site - 100, 0)
            win_end = site + 100
            Psignal = list(PH.summarize(chrom, win_start, win_end, 200).sum_data)
            Nsignal = list(NH.summarize(chrom, win_start, win_end, 200).sum_data)
            DNase = sum(Psignal) + sum(Nsignal)

            left = slice(100 - score_range, 100)
            right = slice(100 + motif_len, 100 + motif_len + score_range)
            if fields[5] == '+':
                same, diff, up, down = Psignal, Nsignal, left, right
            elif fields[5] == '-':
                same, diff, up, down = Nsignal, Psignal, right, left
            else:
                print(line)
                sys.exit(1)

            S_up_same = sum(same[up])
            S_up_diff = sum(diff[up])
            S_down_same = sum(same[down])
            S_down_diff = sum(diff[down])

            FPscore1 = math.log(
                (S_up_same + pseudo) * (S_down_diff + pseudo) /
                ((S_up_diff + pseudo) * (S_down_same + pseudo)), 2)
            FPscore2 = (math.sqrt(S_up_same) + math.sqrt(S_down_diff) -
                        math.sqrt(S_up_diff) - math.sqrt(S_down_same))
            fields.extend([DNase, FPscore1, FPscore2])
            outf.write("\t".join(map(str, fields)) + "\n")
def get_signal(inputfile, output, signalbw):
    """Append the summed BigWig signal around each region to its input line.

    For every line of ``inputfile`` whose chromosome is listed in
    ``signalbw``, the window ``[start - 50, end + 50)`` (start clamped at 0)
    is summarized in one bin and its ``sum_data`` is appended as a new
    column.  Lines on other chromosomes are skipped.

    Args:
        inputfile: whitespace-separated region file (chrom, start, end, ...).
        output: path of the tab-separated output file.
        signalbw: path of the BigWig file.
    """
    chroms = set(node['key'] for node in BwIO(signalbw).chromosomeTree['nodes'])

    with open(signalbw, 'rb') as bw_fh, open(inputfile) as inf, \
            open(output, 'w') as outf:
        bw = BigWigFile(bw_fh)
        for line in inf:
            fields = line.split()
            if fields[0] not in chroms:
                continue
            signal = bw.summarize(fields[0], max(int(fields[1]) - 50, 0),
                                  int(fields[2]) + 50, 1)
            fields.append(str(float(signal.sum_data)))
            outf.write("\t".join(fields) + "\n")
def summary(bwfile, bedfile, out, central_max, central_min, flanking_max,
            flanking_min, cutoff):
    """Scan each region's signal vector and write the hits as BED lines.

    For every line of ``bedfile`` whose chromosome is listed in ``bwfile``,
    the region is summarized with ``end - start`` as the fourth
    ``summarize`` argument and the resulting ``sum_data`` list is handed to
    ``caculate_footprint`` together with the five tuning parameters.  Each
    returned item ``ft`` becomes one output line:
    ``chrom, start + ft[0], start + ft[1], <input column 4>, ft[2]``.

    The total elapsed time is printed to stdout.

    Args:
        bwfile: path of the BigWig file.
        bedfile: whitespace-separated region file with at least 4 columns.
        out: path of the tab-separated output file.
        central_max, central_min, flanking_max, flanking_min, cutoff:
            forwarded unchanged to ``caculate_footprint``.
    """
    chroms = set(node['key'] for node in BwIO(bwfile).chromosomeTree['nodes'])

    started = time.time()
    with open(bwfile, 'rb') as bw_fh, open(bedfile) as inf, \
            open(out, 'w') as outf:
        bw = BigWigFile(bw_fh)
        for line in inf:
            fields = line.split()
            if fields[0] not in chroms:
                continue
            start, end = int(fields[1]), int(fields[2])
            digital = list(bw.summarize(fields[0], start, end, end - start).sum_data)
            hits = caculate_footprint(digital, central_max, central_min,
                                      flanking_max, flanking_min, cutoff)
            for ft in hits:
                record = [fields[0], start + ft[0], start + ft[1], fields[3], ft[2]]
                outf.write("\t".join(map(str, record)) + "\n")
    print("scaning 1st  %s" % (time.time() - started))
def sitepro_scan(pattern,peak,out,w_plus,w_minus,trunk):
    """Slide a two-strand template across each peak and report the best hits.

    ``pattern`` holds two comma-separated lines of numbers (plus template,
    minus template); both are divided by their combined total.  For every
    peak whose chromosome is listed in both BigWig files, the plus and
    minus signals are summarized with ``end - start`` as the fourth
    ``summarize`` argument, and every window of template length ``l`` is
    scored with ``match_pattern`` after capping each value at ``trunk``.
    Windows scored "NA" are ignored.

    Within a run of overlapping windows only the highest-scoring one is
    kept; a window starting at or after the end of the kept one flushes it
    and starts a new run.  Each kept window is written to ``out`` as
    ``chrom, peak_start + window_start + 3, peak_start + window_end - 3,
    score``.

    A peak in which no window received a score contributes nothing
    (previously this raised TypeError on ``int + "NA"``).  A timing value
    is printed for every 100th scanned peak.

    Args:
        pattern: path of the two-line template file.
        peak: whitespace-separated peak file (chrom, start, end, ...).
        out: path of the tab-separated output file.
        w_plus: path of the plus BigWig file.
        w_minus: path of the minus BigWig file.
        trunk: upper cap applied to every observed value.
    """
    with open(pattern) as pattern_fh:
        pattern_plus = [float(v) for v in pattern_fh.readline().strip().split(",")]
        pattern_minus = [float(v) for v in pattern_fh.readline().strip().split(",")]
    total = sum(pattern_plus) + sum(pattern_minus)
    p_plus = [v / total for v in pattern_plus]
    p_minus = [v / total for v in pattern_minus]
    l = len(pattern_plus)
    p0 = [1.0 / (2 * l)] * l  # flat reference profile passed to match_pattern

    chroms_plus = set(node['key'] for node in BwIO(w_plus).chromosomeTree['nodes'])
    chroms_minus = set(node['key'] for node in BwIO(w_minus).chromosomeTree['nodes'])

    footprint = []
    count = 0
    t = time.time()
    with open(w_plus, 'rb') as plus_fh, open(w_minus, 'rb') as minus_fh, \
            open(peak) as inf:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            chrom = ll[0]
            if chrom not in chroms_plus or chrom not in chroms_minus:
                continue
            start, end = int(ll[1]), int(ll[2])
            p_sum = list(w_plus_H.summarize(chrom, start, end, end - start).sum_data)
            m_sum = list(w_minus_H.summarize(chrom, start, end, end - start).sum_data)

            best_start = best_end = best_value = None
            for i in range(len(p_sum) - l):
                o_plus = [min(float(v), trunk) for v in p_sum[i:i + l]]
                o_minus = [min(float(v), trunk) for v in m_sum[i:i + l]]
                score = match_pattern(p_plus, p_minus, p0, p0, o_plus, o_minus, l)
                if score == "NA":
                    continue
                if best_start is None:
                    best_start, best_end, best_value = i, i + l, score
                elif i >= best_end:
                    # The current window no longer overlaps the kept one.
                    footprint.append([chrom, start + best_start + 3,
                                      start + best_end - 3, best_value])
                    best_start, best_end, best_value = i, i + l, score
                elif score > best_value:
                    best_start, best_end, best_value = i, i + l, score

            # Flush the last kept window, if any window was scored at all.
            if best_start is not None:
                footprint.append([chrom, start + best_start + 3,
                                  start + best_end - 3, best_value])

            if count % 100 == 0:
                print(time.time() - t)
                t = time.time()
            count += 1

    with open(out, 'w') as outf:
        for fp in footprint:
            outf.write("\t".join(map(str, fp)) + "\n")
def sitepro_scan(pattern,peak,out,w_plus,w_minus,trunk,text,w_chip):
    """Score every site against a strand-specific template and tally offsets.

    ``pattern`` holds four comma-separated lines of numbers: plus and minus
    templates used for "+" sites, then plus and minus templates used for
    "-" sites.  Each pair is divided by its combined total; only the first
    ``l`` values of every line are used, ``l`` being the length of the
    first line.

    For every site whose chromosome is listed in both ``w_plus`` and
    ``w_minus``:

    * ``DNase`` is the one-bin ``sum_data`` of plus + minus over
      ``centre +/- 50`` and ``Chip`` the one of ``w_chip`` over
      ``centre +/- 10``, with ``centre = (start + end) // 2``.
    * the plus/minus signals over ``[start - 3 - l, end + 3 + l)`` are
      scanned with windows of length ``l``; each window is scored with
      ``match_pattern(..., l, 1)`` using the templates of the site's strand.
    * the score of the window at offset ``l`` is written to ``out`` together
      with the input columns, ``DNase`` and ``Chip``.
    * the offset of the best-scoring window is counted in a histogram of
      ``2 * l`` slots, unless that offset and its score are both 0.  The
      histogram is written to ``text`` as one tab-separated line.

    Sites without a single window are skipped for the histogram (previously
    a TypeError).  NOTE(review): an offset of ``2 * l`` or more would still
    raise IndexError on the histogram; confirm the intended site lengths.

    Args:
        pattern: path of the four-line template file.
        peak: whitespace-separated site file with the strand in column 6.
        out: path of the per-site output file.
        w_plus: path of the plus BigWig file.
        w_minus: path of the minus BigWig file.
        trunk: unused; kept for interface compatibility.
        text: path of the histogram output file.
        w_chip: path of the BigWig file used for the ``Chip`` column.

    Raises:
        ValueError: if a scanned site's strand is neither "+" nor "-"
            (previously an unbound or stale score was used).
    """
    with open(pattern) as pattern_fh:
        raw = [[float(v) for v in pattern_fh.readline().strip().split(",")]
               for _ in range(4)]
    plus_fwd, minus_fwd, plus_rev, minus_rev = raw
    total_fwd = sum(plus_fwd) + sum(minus_fwd)
    total_rev = sum(plus_rev) + sum(minus_rev)
    l = len(plus_fwd)
    p_plus = [plus_fwd[i] / total_fwd for i in range(l)]
    p_minus = [minus_fwd[i] / total_fwd for i in range(l)]
    m_plus = [plus_rev[i] / total_rev for i in range(l)]
    m_minus = [minus_rev[i] / total_rev for i in range(l)]
    templates = {"+": (p_plus, p_minus), "-": (m_plus, m_minus)}

    chroms_plus = set(node['key'] for node in BwIO(w_plus).chromosomeTree['nodes'])
    chroms_minus = set(node['key'] for node in BwIO(w_minus).chromosomeTree['nodes'])

    footprint = []
    ls = [0] * 2 * l  # histogram of best-window offsets
    flank = 3 + l
    with open(w_plus, 'rb') as plus_fh, open(w_minus, 'rb') as minus_fh, \
            open(w_chip, 'rb') as chip_fh, open(peak) as inf:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        w_chip_H = BigWigFile(chip_fh)
        # Expected columns: chr start end name motifscore strand ...
        for line in inf:
            ll = line.split()
            chrom = ll[0]
            if chrom not in chroms_plus or chrom not in chroms_minus:
                continue
            start, end = int(ll[1]), int(ll[2])
            centre = (start + end) // 2
            DNase = (float(w_plus_H.summarize(chrom, centre - 50, centre + 50, 1).sum_data) +
                     float(w_minus_H.summarize(chrom, centre - 50, centre + 50, 1).sum_data))
            Chip = float(w_chip_H.summarize(chrom, centre - 10, centre + 10, 1).sum_data)

            nbins = end - start + 2 * flank
            p_sum = list(w_plus_H.summarize(chrom, start - flank, end + flank, nbins).sum_data)
            m_sum = list(w_minus_H.summarize(chrom, start - flank, end + flank, nbins).sum_data)

            if ll[5] not in templates:
                raise ValueError("strand must be '+' or '-': %r" % line)
            tmpl_plus, tmpl_minus = templates[ll[5]]

            best_start = best_value = None
            for i in range(len(p_sum) - l):
                o_plus = [int(v) for v in p_sum[i:i + l]]
                o_minus = [int(v) for v in m_sum[i:i + l]]
                score = match_pattern(tmpl_plus, tmpl_minus, o_plus, o_minus, l, 1)
                if i == l:
                    footprint.append(ll + [score, DNase, Chip])
                if best_start is None or score > best_value:
                    best_start, best_value = i, score

            if best_start is None:
                continue
            if not (best_start == 0 and best_value == 0):
                ls[best_start] += 1

    with open(out, 'w') as outf:
        for fp in footprint:
            outf.write("\t".join(map(str, fp)) + "\n")
    with open(text, 'w') as outf:
        outf.write("\t".join(map(str, ls)) + "\n")
def sitepro_scan(peak, out, w_plus, w_minus):
    """Append window sums and a 400-value profile per strand to each site.

    For every line of ``peak`` whose chromosome is listed in both BigWig
    files, the following columns are appended and the line is written to
    ``out`` (tab-separated):

    * the plus and minus sums over ``centre +/- 50``, ``+/- 100``,
      ``+/- 150`` and ``+/- 200`` (plus before minus for each width), each
      obtained by adding up the ``sum_data`` of a two-bin summary, with
      ``centre = (start + end) // 2``;
    * the ``sum_data`` of the plus file over ``[start - 200, start + 200)``
      summarized with 400 as the fourth argument, then the same for the
      minus file.

    All result lines are collected in memory and written after the scan.

    Args:
        peak: whitespace-separated site file (chrom, start, end, ...).
        out: path of the output file.
        w_plus: path of the plus BigWig file.
        w_minus: path of the minus BigWig file.
    """
    chroms_plus = set(node['key'] for node in BwIO(w_plus).chromosomeTree['nodes'])
    chroms_minus = set(node['key'] for node in BwIO(w_minus).chromosomeTree['nodes'])

    footprint = []
    with open(w_plus, 'rb') as plus_fh, open(w_minus, 'rb') as minus_fh, \
            open(peak) as inf:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            chrom = ll[0]
            if chrom not in chroms_plus or chrom not in chroms_minus:
                continue
            start = int(ll[1])
            centre = (start + int(ll[2])) // 2

            window_sums = []
            for half in (50, 100, 150, 200):
                for bw in (w_plus_H, w_minus_H):
                    stats = bw.summarize(chrom, centre - half, centre + half, 2)
                    window_sums.append(sum(stats.sum_data))

            p_sum = list(w_plus_H.summarize(chrom, start - 200, start + 200, 400).sum_data)
            m_sum = list(w_minus_H.summarize(chrom, start - 200, start + 200, 400).sum_data)
            footprint.append(ll + window_sums + p_sum + m_sum)

    with open(out, 'w') as outf:
        for fp in footprint:
            outf.write("\t".join(map(str, fp)) + "\n")
def main():
    """Command-line entry point: write and run an R correlation script.

    Parses the options, validates them, samples every BigWig file given on
    the command line on the chromosomes they all share, and writes an R
    script to the ``-r`` path.  The script filters the sampled values by
    ``--min-score`` / ``--max-score`` and draws either a correlation
    heatmap (``-H``) or a pairwise scatterplot matrix into
    ``<rfile>.pdf`` or ``<rfile>.png``.  ``Rscript`` is then invoked on the
    script.

    Each chromosome is summarized with
    ``chromSize // step // 1000`` as the fourth ``summarize`` argument;
    chromosomes for which that is 0 are skipped with a warning.  Chromosome
    sizes are taken from the first BigWig file.

    Exits with status 1 on any validation error.
    """
    usage = "usage: %prog <-r rfile> [options] <bigwig files> ..."
    description = "Draw correlation plot for many bigwig files. Based on qc_chIP_whole.py"

    optparser = OptionParser(version="%prog 0.1", description=description,
                             usage=usage, add_help_option=False)
    optparser.add_option("-h", "--help", action="help",
                         help="Show this help message and exit.")
    optparser.add_option("-r", "--rfile", dest="rfile",
                         help="R output file (required).")
    optparser.add_option("-s", "--step", dest="step", type="int",
                         help="sampling step in kbps. default: 100, minimal: 1",
                         default=100)
    optparser.add_option("-z", "--imgsize", dest="imgsize", type="int",
                         help="image size in inches, note the PNG dpi is 72. default: 10, minimal: 10",
                         default=10)
    optparser.add_option("-f", "--format", dest="imgformat", type="string",
                         help="image format. PDF or PNG", default='PDF')
    optparser.add_option("-l", "--wig-label", dest="wiglabel", type="string",
                         action="append",
                         help="the wiggle file labels in the figure. No space is allowed. "
                              "This option should be used same times as wiggle files, and "
                              "please input them in the same order as -w option. default: "
                              "will use the wiggle file filename as labels.")
    optparser.add_option("--min-score", dest="minscore", type="float",
                         default=-10000,
                         help="minimum score included in calculation. Points w/ score "
                              "lower than this will be discarded.")
    optparser.add_option("--max-score", dest="maxscore", type="float",
                         default=10000,
                         help="maximum score included in calculation. Points w/ score "
                              "larger than this will be discarded.")
    optparser.add_option("-H", "--heatmap", dest="heatmap", action="store_true",
                         default=False,
                         help="If True, a heatmap image will be generated instead of "
                              "paired scatterplot image.")

    (options, wigfiles) = optparser.parse_args()

    imgfmt = options.imgformat.upper()
    if imgfmt not in ('PDF', 'PNG'):
        print("unrecognized format: %s" % imgfmt)
        sys.exit(1)

    wigfilenum = len(wigfiles)
    if wigfilenum < 2:
        error("must provide >=2 wiggle files")
        optparser.print_help()
        sys.exit(1)
    if not options.rfile:
        # Reported separately so the message names the actual problem.
        error("must provide an R output file with -r")
        optparser.print_help()
        sys.exit(1)

    # Labels: the user's, when there is exactly one per file; else basenames.
    if options.wiglabel and len(options.wiglabel) == wigfilenum:
        wiglabel = options.wiglabel
    else:
        wiglabel = [os.path.basename(path) for path in wigfiles]

    if options.step < 1:
        error("Step can not be lower than 1!")
        sys.exit(1)
    if options.imgsize < 10:
        error("Image size can not be lower than 10!")
        sys.exit(1)

    for path in wigfiles:
        if not os.path.isfile(path):
            error("%s is not valid!" % path)
            sys.exit(1)

    info("number of bigwig files: %d" % wigfilenum)

    # Chromosome lengths come from the first file only.
    first_nodes = BwIO(wigfiles[0]).chromosomeTree['nodes']
    chrom_len = dict((node['key'], node['chromSize']) for node in first_nodes)

    # Keep only chromosomes present in every file.
    chrset = set(chrom_len)
    for path in wigfiles[1:]:
        chrset &= set(node['key'] for node in BwIO(path).chromosomeTree['nodes'])
    chroms = sorted(chrset)  # sorted so that every run samples in the same order
    if not chroms:
        error('No common chrom found')
        sys.exit(1)
    info("common chromosomes are %s." % ",".join(chroms))

    labels = ",".join('"' + label + '"' for label in wiglabel)

    with open(options.rfile, "w") as rfhd:
        rfhd.write('''require("RColorBrewer") ## from CRAN\n''')

        # One R vector (p0, p1, ...) of sampled values per BigWig file.
        for i, path in enumerate(wigfiles):
            profile = []
            with open(path, 'rb') as bw_fh:
                bw = BigWigFile(bw_fh)
                info("read wiggle track from bigwig file #%d" % (i + 1))
                for chrom in chroms:
                    # A zero bin count makes bw.summarize fail, so such
                    # chromosomes are skipped.
                    nbins = chrom_len[chrom] // options.step // 1000
                    if nbins == 0:
                        warn("A very-short chromosome (%s) found and skipped" % chrom)
                        continue
                    summary = bw.summarize(chrom, 0, chrom_len[chrom], nbins)
                    if not summary:
                        continue
                    profile_chr = summary.sum_data / summary.valid_count
                    profile.extend(str(t).replace('nan', 'NA') for t in profile_chr)
            info("write values to r file")
            rfhd.write("p%d <- c(%s)\n" % (i, ','.join(profile)))

        rfhd.write("c <- cbind(%s)\n"
                   % ",".join("p%d" % i for i in range(wigfilenum)))

        # Keep only rows whose value lies in [minscore, maxscore] in every column.
        conditions = ["c[,%d]<=%f & c[,%d]>=%f "
                      % (col, options.maxscore, col, options.minscore)
                      for col in range(1, wigfilenum + 1)]
        rfhd.write("c <- c[ " + "& ".join(conditions) + ",]\n")

        if imgfmt == 'PDF':
            rfhd.write("pdf(\"%s.pdf\",width=%d,height=%d)\n"
                       % (options.rfile, options.imgsize, options.imgsize))
        elif imgfmt == 'PNG':
            rfhd.write("png(\"%s.png\",units=\"in\",res=150,width=%d,height=%d)\n"
                       % (options.rfile, options.imgsize, options.imgsize))

        if options.heatmap:
            rfhd.write('library(gplots)\n')
            rfhd.write('''
m <- cor(c, method="pearson", use="pairwise.complete.obs")
''')
            rfhd.write("rownames(m) <- c(%s)\n" % labels)
            rfhd.write("colnames(m) <- c(%s)\n" % labels)
            rfhd.write('# draw the heatmap using gplots heatmap.2\n')
            rfhd.write('mn <- -1\n')
            rfhd.write('mx <- 1\n')
            rfhd.write('n <- 98\n')
            rfhd.write('bias <- 1\n')
            rfhd.write('mc <- matrix(as.character(round(m, 2)), ncol=dim(m)[2])\n')
            rfhd.write('breaks <- seq(mn, mx, (mx-mn)/(n))\n')
            rfhd.write('cr <- colorRampPalette(colors = c("#2927FF","#FFFFFF","#DF5C5C"), bias=bias)\n')
            rfhd.write('heatmap.2(m, col = cr(n), breaks=breaks, trace="none", cellnote=mc, notecol="black", notecex=1.8, keysize=0.5, density.info="histogram", margins=c(27.0,27.0), cexRow=2.20, cexCol=2.20, revC=T, symm=T)\n')
        else:
            rfhd.write('''
panel.plot <- function( x,y, ... )
{
  par(new=TRUE)
  m <- cbind(x,y)
  plot(m,col=densCols(m),pch=20)
  lines(lowess(m[!is.na(m[,1])&!is.na(m[,2]),]),col="red")
}

panel.cor <- function(x, y, digits=2, prefix="", cex.cor, ...)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- cor(x, y,use="complete.obs")
  txt <- format(round(r,2),width=5,nsmall=2)
  #format(c(r, 0.123456789), digits=digits)[1]
  txt <- paste(prefix, txt, sep="")
  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
  #text(0.5, 0.5, txt, cex = cex.cor * abs(r))
  text(0.5, 0.5, txt, cex = cex.cor)
}
''')
            rfhd.write('''
pairs(c, lower.panel=panel.plot, upper.panel=panel.cor, labels=c(%s))
''' % (labels))

        rfhd.write("dev.off()\n")

    # Try to run the script; fall back to pointing the user at it.
    try:
        status = subprocess.call(['Rscript', options.rfile])
    except OSError:
        # Rscript is missing or cannot be executed.
        info("Please check %s" % options.rfile)
    else:
        if status != 0:
            info("Please check %s" % options.rfile)
        else:
            # The image is written with a lower-case extension.
            info("Please check %s" % (options.rfile + '.' + imgfmt.lower()))
def sitepro_scan(peak, out, w_plus, w_minus):
    """Stream window sums and a 400-value profile per strand for each site.

    Lines of ``peak`` whose start or end column is not an integer are
    reported on stdout and skipped.  For every remaining line whose
    chromosome is listed in both BigWig files, the following columns are
    appended and the line is written to ``out`` immediately:

    * the plus and minus sums over ``centre +/- 50``, ``+/- 100``,
      ``+/- 150`` and ``+/- 200`` (plus before minus for each width), each
      obtained by adding up the ``sum_data`` of a two-bin summary, with
      ``centre = (start + end) // 2``.  For the ``+/- 50`` window only, a
      failing two-bin summary is retried with one bin;
    * the ``sum_data`` of the plus file over ``[start - 200, start + 200)``
      summarized with 400 as the fourth argument, then the same for the
      minus file.

    Args:
        peak: whitespace-separated site file (chrom, start, end, ...).
        out: path of the tab-separated output file.
        w_plus: path of the plus BigWig file.
        w_minus: path of the minus BigWig file.
    """
    chroms_plus = set(node['key'] for node in BwIO(w_plus).chromosomeTree['nodes'])
    chroms_minus = set(node['key'] for node in BwIO(w_minus).chromosomeTree['nodes'])

    def window_sum(bw, chrom, centre, half, bins):
        stats = bw.summarize(chrom, centre - half, centre + half, bins)
        return sum(stats.sum_data)

    with open(w_plus, 'rb') as plus_fh, open(w_minus, 'rb') as minus_fh, \
            open(peak) as inf, open(out, 'w') as outf:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        tracks = (w_plus_H, w_minus_H)
        for line in inf:
            ll = line.split()
            try:
                START = int(ll[1])
            except ValueError:
                print('start: %s %s' % (ll[1], line))
                continue
            try:
                END = int(ll[2])
            except ValueError:
                print('end: %s %s' % (ll[2], line))
                continue

            chrom = ll[0]
            if chrom not in chroms_plus or chrom not in chroms_minus:
                continue
            centre = (START + END) // 2

            window_sums = []
            for bw in tracks:
                try:
                    value = window_sum(bw, chrom, centre, 50, 2)
                except Exception:
                    # Same fallback as before: retry the window as one bin.
                    value = window_sum(bw, chrom, centre, 50, 1)
                window_sums.append(value)
            for half in (100, 150, 200):
                for bw in tracks:
                    window_sums.append(window_sum(bw, chrom, centre, half, 2))

            p_sum = list(w_plus_H.summarize(chrom, START - 200, START + 200, 400).sum_data)
            m_sum = list(w_minus_H.summarize(chrom, START - 200, START + 200, 400).sum_data)
            fpline = ll + window_sums + p_sum + m_sum
            outf.write("\t".join(map(str, fpline)) + "\n")
def sitepro_scan(pattern, peak, out, w_plus, w_minus, trunk, text):
    """Score every site against a two-strand template and tally best offsets.

    ``pattern`` holds two comma-separated lines of numbers (plus template,
    minus template); both are divided by their combined total.  For every
    site whose chromosome is listed in both BigWig files, the plus and
    minus signals over ``[start - 25, end + 25)`` are scanned with windows
    of template length ``l``.  Window values are converted to int, capped
    at ``trunk`` and scored with ``match_pattern``.

    * The score of the window at offset 22 is written to ``out`` together
      with the input columns.
    * The offset of the best-scoring window is counted in a 44-slot
      histogram, unless that offset and its score are both 0.  The
      histogram is written to ``text`` as one tab-separated line.

    Sites without a single window are skipped for the histogram (previously
    a TypeError).  NOTE(review): a best offset of 44 or more would still
    raise IndexError on the histogram; confirm the intended site lengths.

    Args:
        pattern: path of the two-line template file.
        peak: whitespace-separated site file (chrom, start, end, ...).
        out: path of the per-site output file.
        w_plus: path of the plus BigWig file.
        w_minus: path of the minus BigWig file.
        trunk: upper cap applied to every observed value.
        text: path of the histogram output file.
    """
    with open(pattern) as pattern_fh:
        pattern_plus = [float(v) for v in pattern_fh.readline().strip().split(",")]
        pattern_minus = [float(v) for v in pattern_fh.readline().strip().split(",")]
    all_sum = sum(pattern_plus) + sum(pattern_minus)
    p_plus = [v / all_sum for v in pattern_plus]
    p_minus = [v / all_sum for v in pattern_minus]
    l = len(pattern_plus)
    p0 = [1.0 / (2 * l)] * l  # flat reference profile passed to match_pattern

    chroms_plus = set(node['key'] for node in BwIO(w_plus).chromosomeTree['nodes'])
    chroms_minus = set(node['key'] for node in BwIO(w_minus).chromosomeTree['nodes'])

    footprint = []
    ls = [0] * 44  # histogram of best-window offsets
    with open(w_plus, 'rb') as plus_fh, open(w_minus, 'rb') as minus_fh, \
            open(peak) as inf:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            chrom = ll[0]
            if chrom not in chroms_plus or chrom not in chroms_minus:
                continue
            start, end = int(ll[1]), int(ll[2])
            win_start = start - 3 - 22
            win_end = end + 3 + 22
            nbins = end - start + 6 + 44
            p_sum = list(w_plus_H.summarize(chrom, win_start, win_end, nbins).sum_data)
            m_sum = list(w_minus_H.summarize(chrom, win_start, win_end, nbins).sum_data)

            best_start = best_value = None
            for i in range(len(p_sum) - l):
                o_plus = [min(int(v), trunk) for v in p_sum[i:i + l]]
                o_minus = [min(int(v), trunk) for v in m_sum[i:i + l]]
                score = match_pattern(p_plus, p_minus, p0, p0, o_plus, o_minus, l)
                if i == 22:
                    footprint.append(ll + [score])
                if best_start is None or score > best_value:
                    best_start, best_value = i, score

            if best_start is None:
                continue
            if not (best_start == 0 and best_value == 0):
                ls[best_start] += 1

    with open(out, 'w') as outf:
        for fp in footprint:
            outf.write("\t".join(map(str, fp)) + "\n")
    with open(text, 'w') as outf:
        outf.write("\t".join(map(str, ls)) + "\n")
def getsignal(inputfile,outputfile,BGmatrix,pcut,ncut,pspan,fetch_length=100,gen='hg19'):
    """Average observed and sequence-based cut profiles and plot them.

    For every line of ``inputfile`` whose chromosome is listed in ``pcut``,
    ``make_cut`` is called on both BigWig files; on the "-" strand the two
    results are swapped.  Lines whose (possibly swapped) first result
    equals 'NA' are skipped.  The remaining results are averaged with
    ``apply_mean``.

    The same sites, extended by ``pspan + 3`` on both sides and shifted by
    one base, are stored in a ``c.interval`` object whose ``getSequence``
    supplies the sequences.  Sequences containing "N" are dropped.  Every
    6-mer of a sequence without its last base is looked up in the first
    table returned by ``readBG(BGmatrix)``, and every 6-mer of the sequence
    without its first base in the second table; on the "-" strand the two
    lists are swapped and reversed.  These lists are averaged as well.

    The four averaged profiles are passed to ``plot_template`` together
    with ``outputfile``.

    Args:
        inputfile: whitespace-separated site file with the strand in
            column 6.
        outputfile: forwarded to ``plot_template``.
        BGmatrix: forwarded to ``readBG``.
        pcut: first BigWig path (also supplies the chromosome list).
        ncut: second BigWig path.
        pspan: flank size, forwarded to ``make_cut`` and used for the
            sequence interval.
        fetch_length: forwarded to ``make_cut``.
        gen: genome name passed to ``c.interval``.
    """
    chroms = set(node['key'] for node in BwIO(pcut).chromosomeTree['nodes'])

    X = c.interval(genome=gen)
    X.chrom, X.start, X.end, X.val = [], [], [], []
    pBG, nBG = readBG(BGmatrix)

    pp = []
    pm = []
    with open(pcut, 'rb') as pcut_fh, open(ncut, 'rb') as ncut_fh, \
            open(inputfile) as inf:
        pcutbw = BigWigFile(pcut_fh)
        ncutbw = BigWigFile(ncut_fh)
        for line in inf:
            ll = line.split()
            if ll[0] not in chroms:
                continue
            pout = make_cut(pcutbw, ll, pspan, fetch_length)
            nout = make_cut(ncutbw, ll, pspan, fetch_length)
            if ll[5] == "-":
                pout, nout = nout, pout
            if pout == 'NA':
                continue
            pp.append(pout)
            pm.append(nout)
            X.chrom.append(ll[0])
            X.start.append(int(ll[1]) - pspan - 3 + 1)
            X.end.append(int(ll[2]) + pspan + 3 + 1)
            X.val.append(ll[5])

    meanp = apply_mean(pp)
    meanm = apply_mean(pm)

    X.getSequence()
    pbglist = []
    nbglist = []
    for seq, strand in zip(X.seq, X.val):
        if 'N' in seq.upper():
            continue
        pseq = seq[:-1]
        nseq = seq[1:]
        offsets = range(len(pseq) + 1 - 6)
        fwd = [pBG[pseq[k:k + 6].upper()] for k in offsets]
        rev = [nBG[nseq[k:k + 6].upper()] for k in offsets]
        if strand != '-':
            pbglist.append(fwd)
            nbglist.append(rev)
        else:
            # Minus-strand sites: swap the two lists and reverse them.
            pbglist.append(rev[::-1])
            nbglist.append(fwd[::-1])

    meanpbglist = apply_mean(pbglist)
    meanmbglist = apply_mean(nbglist)
    plot_template(meanp, meanm, meanpbglist, meanmbglist, outputfile)