def summarize(self, interval, bins=None, method='summarize', function='mean'):
    """Return the bigWig signal of ``self.fn`` over ``interval``.

    Parameters
    ----------
    interval : object exposing ``chrom``, ``start`` and ``stop``.
    bins : int or None
        Number of bins. When None (or when ``method == 'get_as_array'``)
        the per-base array from ``BigWigFile.get_as_array`` is returned.
    method : str
        ``'get_as_array'``, ``'ucsc_summarize'`` (delegates to
        ``self.ucsc_summarize``) or anything else for
        ``BigWigFile.summarize``.
    function : str
        One of ``'sum'``, ``'mean'``, ``'min'``, ``'max'``, ``'std'`` for the
        ``summarize`` path; ``'mean'``, ``'min'``, ``'max'``, ``'std'``,
        ``'coverage'`` for the UCSC path.

    Returns
    -------
    Array of values with NaN/inf replaced by 0, or zeros when the file has
    no data for the interval. An unrecognised ``function`` on the
    ``summarize`` path returns the raw summary object unchanged.

    Raises
    ------
    ValueError
        If ``function`` is not supported by the UCSC path.
    """
    # Dividing by a zero valid_count is expected for empty bins, so NumPy's
    # "invalid" warning is silenced here.  np.errstate restores the caller's
    # settings on every exit path (return or raise); the previous manual
    # seterr() call restored the wrong key and was skipped by early exits.
    with np.errstate(invalid='ignore'):
        if (bins is None) or (method == 'get_as_array'):
            with open(self.fn, 'rb') as handle:
                s = BigWigFile(handle).get_as_array(
                    interval.chrom, interval.start, interval.stop)
            if s is None:
                return np.zeros((interval.stop - interval.start, ))
            s[np.isnan(s)] = 0
            return s

        if method == 'ucsc_summarize':
            if function in ['mean', 'min', 'max', 'std', 'coverage']:
                return self.ucsc_summarize(interval, bins, function=function)
            raise ValueError(
                'function "%s" not supported by UCSC\'s bigWigSummary'
                % function)

        with open(self.fn, 'rb') as handle:
            summary = BigWigFile(handle).summarize(
                interval.chrom, interval.start, interval.stop, bins)
        if summary is None:
            return np.zeros((bins, ))

        if function == 'sum':
            return summary.sum_data
        if function == 'mean':
            s = summary.sum_data / summary.valid_count
            s[np.isnan(s)] = 0
            return s
        if function == 'min':
            s = summary.min_val
            s[np.isinf(s)] = 0
            return s
        if function == 'max':
            s = summary.max_val
            s[np.isinf(s)] = 0
            return s
        if function == 'std':
            # NOTE(review): this is sum_squares / valid_count (mean of
            # squares), not a standard deviation -- kept as-is, confirm intent.
            s = summary.sum_squares / summary.valid_count
            s[np.isnan(s)] = 0
            return s
        return summary
def scan_fp(plusdnase, minusdnase, bed, out, upstream, downstream):
    """Append per-base plus/minus strand signal to each BED row.

    For every row of ``bed`` whose chromosome is present in ``plusdnase`` and
    whose start is at least ``upstream``, the window
    ``[start - upstream, end + downstream)`` is summarised at one bin per
    base in both bigWig files.  The first six BED columns followed by the
    plus values and then the minus values are written tab-separated to
    ``out``.
    """
    known_chroms = set(
        node['key'] for node in BwIO(plusdnase).chromosomeTree['nodes'])

    # Context managers guarantee all four files are closed even on error.
    with open(plusdnase, 'rb') as plus_fh, \
            open(minusdnase, 'rb') as minus_fh, \
            open(bed) as inf, \
            open(out, 'w') as outf:
        plus_bw = BigWigFile(plus_fh)
        minus_bw = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            if ll[0] not in known_chroms:
                continue
            start = int(ll[1])
            if start < upstream:
                # window would start before position 0
                continue
            left = start - upstream
            right = int(ll[2]) + downstream
            nbins = right - left  # one bin per base
            signal1 = plus_bw.summarize(ll[0], left, right, nbins)
            signal2 = minus_bw.summarize(ll[0], left, right, nbins)
            newll = (ll[:6]
                     + [str(v) for v in signal1.sum_data]
                     + [str(v) for v in signal2.sum_data])
            outf.write("\t".join(newll) + "\n")
def get_signal(inputfile, output, vp, vm, dp, dm):
    """Append four summed bigWig signals to each row of ``inputfile``.

    Each data row (header rows starting with ``"chrom"`` are skipped, as are
    chromosomes missing from ``vp``) gets the summed signal over
    ``[start - 50, end + 50)`` from ``vp``, ``vm``, ``dp`` and ``dm``, in
    that order.  A fixed header line is written first.
    """
    known_chroms = set(
        node['key'] for node in BwIO(vp).chromosomeTree['nodes'])
    colnames = [
        "chrom", "start", "end", "seq", "motifscore", "strand",
        "LncapARsignal", "LncapDNaseCutsite", "LncapDNaseFrag",
        "K562DNaseFrag", "LncapFP", "K562FP", "overARpeak", "VehPlus",
        "VehMinus", "DHTPlus", "DHTMinus"
    ]

    with open(vp, 'rb') as vp_fh, open(vm, 'rb') as vm_fh, \
            open(dp, 'rb') as dp_fh, open(dm, 'rb') as dm_fh, \
            open(inputfile) as inf, open(output, 'w') as outf:
        tracks = [BigWigFile(fh) for fh in (vp_fh, vm_fh, dp_fh, dm_fh)]
        outf.write("\t".join(colnames) + "\n")
        for line in inf:
            if line.startswith("chrom"):
                continue
            ll = line.split()
            if ll[0] not in known_chroms:
                continue
            left = int(ll[1]) - 50
            right = int(ll[2]) + 50
            for bw in tracks:
                # a single bin -> sum_data holds exactly one value
                signal = bw.summarize(ll[0], left, right, 1)
                ll.append(str(float(signal.sum_data[0])))
            outf.write("\t".join(ll) + "\n")
def getsignal(inputfile, outputfile, pcut, DHT, Veh, pspan):
    """Append total cuts, a footprint score and two signal sums per row.

    The motif length ``ml`` is taken from the first row of ``inputfile`` and
    applied to every row.  For each row a window of ``2 * pspan`` bases
    centred on ``start + ml // 2`` is read from ``pcut`` at one bin per base:

    * ``TC``  - total cuts in the window
    * ``FOS`` - ``-((C+1)/(R+1) + (C+1)/(L+1))`` where C/L/R are the cut sums
      of the centre block and of the ``ml``-wide blocks left/right of it
    * ``dhtnum`` / ``vehnum`` - summed ``DHT`` / ``Veh`` signal over the same
      window, plus a pseudocount of 1

    The original columns plus these four values are written to
    ``outputfile``.
    """
    with open(pcut, 'rb') as pcut_fh, open(DHT, 'rb') as dht_fh, \
            open(Veh, 'rb') as veh_fh, open(inputfile) as inf:
        pcutbw = BigWigFile(pcut_fh)
        dht = BigWigFile(dht_fh)
        veh = BigWigFile(veh_fh)

        first = inf.readline().split()
        ml = int(first[2]) - int(first[1])
        half = ml // 2  # explicit floor division: identical on Python 2 and 3
        inf.seek(0)

        with open(outputfile, 'w') as outf:
            for line in inf:
                ll = line.split()
                left = int(ll[1]) + half - pspan
                right = int(ll[1]) + half + pspan
                cut = list(
                    pcutbw.summarize(ll[0], left, right, 2 * pspan).sum_data)
                TC = sum(cut)
                core = pspan - half  # index of the motif start inside `cut`
                C = sum(cut[core:core + ml])
                L = sum(cut[core - ml:core])
                R = sum(cut[core + ml:core + 2 * ml])
                # float() keeps true division even when a slice is empty and
                # sum() returns the int 0.
                FOS = -1 * (float(C + 1) / (R + 1) + float(C + 1) / (L + 1))
                dhtnum = sum(
                    list(dht.summarize(ll[0], left, right, 2).sum_data)) + 1
                vehnum = sum(
                    list(veh.summarize(ll[0], left, right, 2).sum_data)) + 1
                newll = ll + [TC, FOS, dhtnum, vehnum]
                outf.write("\t".join(map(str, newll)) + "\n")
def get_regionLevel_simplex_parameters(inputbed, outputbed, plusbw, minusbw,
                                       biasmat, ext, genome2bit):
    """Accumulate cut-weighted bias parameters per region and strand.

    For each row of ``inputbed`` a window of ``2 * ext`` bases around the
    region centre is read at single-base resolution from ``plusbw`` and
    ``minusbw``.  Every position with a positive cut count contributes
    ``cuts * seq2biasParm(...)`` for the sequence context around it (minus
    strand contexts are reverse-complemented).  The row followed by the plus
    and minus parameter vectors is written to ``outputbed``.

    Regions for which either bigWig returns no summary are skipped.
    Reads the lookup table ``permuteSeq8mer.txt`` from the working directory.
    """
    simplex_code = encoding()
    biasdict, flank = readBG(biasmat)
    B, _B0, _B1, _B2 = paramest(biasdict)

    permuteSeq = {}
    with open("permuteSeq8mer.txt") as perm_fh:
        for line in perm_fh:
            ll = line.split()
            permuteSeq[ll[0]] = ll[1]

    genome = twobitreader.TwoBitFile(genome2bit)
    context = 2 * flank  # length of the sequence context per position

    with open(plusbw, 'rb') as plus_fh, open(minusbw, 'rb') as minus_fh, \
            open(inputbed) as inf, open(outputbed, 'w') as outf:
        plusBWH = BigWigFile(plus_fh)
        minusBWH = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            chrm = ll[0]
            center = (int(ll[1]) + int(ll[2])) // 2
            start = max(0, center - ext)
            end = center + ext

            plus_summary = plusBWH.summarize(chrm, start, end, end - start)
            minus_summary = minusBWH.summarize(chrm, start, end, end - start)
            # The old `type(x) == None` test could never be true and came
            # after `.sum_data` had already been dereferenced.
            if plus_summary is None or minus_summary is None:
                continue
            plusSig = plus_summary.sum_data
            minusSig = minus_summary.sum_data

            plusSequence = genome[chrm][(start - flank):(end + flank)].upper()
            # minus-strand contexts are shifted one base to the right
            minusSequence = genome[chrm][(start - flank + 1):
                                         (end + flank + 1)].upper()

            plus_data = numpy.zeros(len(B))
            minus_data = numpy.zeros(len(B))

            for i, pcuts in enumerate(plusSig):
                if pcuts > 0:
                    pseq = plusSequence[i:(i + context)].upper()
                    if "N" not in pseq:
                        p_out = seq2biasParm(permuteSeq[pseq], B, simplex_code)
                        plus_data += pcuts * p_out

            for i, mcuts in enumerate(minusSig):
                if mcuts > 0:
                    tmpseq = minusSequence[i:(i + context)]
                    if "N" not in tmpseq:
                        mseq = revcomp(tmpseq).upper()
                        m_out = seq2biasParm(permuteSeq[mseq], B, simplex_code)
                        minus_data += mcuts * m_out

            newll = ll + list(plus_data) + list(minus_data)
            outf.write("\t".join(map(str, newll)) + "\n")
def getsignal(inputfile, outputfile, BGmatrix, pcut, ncut, Ipcut, Incut,
              pspan, tspan, gen, left, right, fetch_length=100):
    """Write cut profiles, background bias and TC/FOS for each motif row.

    The motif length is taken from the first row of ``inputfile``; ``pspan``
    is reduced by half of it.  For every row the four cut tracks are read
    through ``make_cut``, the per-position plus/minus background values are
    looked up in the tables returned by ``readBG(BGmatrix)`` using
    ``left + right``-mers of the genome sequence, and ``makeTCFOS`` supplies
    the total-cut and footprint scores.  For ``-`` strand rows the plus/minus
    tracks are swapped and the background vectors reversed.  Rows whose
    profile is ``'NA'`` or whose sequence contains ``N`` are skipped.
    """
    genome = twobitreader.TwoBitFile(gen)

    with open(pcut, 'rb') as p_fh, open(ncut, 'rb') as n_fh, \
            open(Ipcut, 'rb') as ip_fh, open(Incut, 'rb') as in_fh, \
            open(inputfile) as inf:
        pcutbw = BigWigFile(p_fh)
        ncutbw = BigWigFile(n_fh)
        Ipcutbw = BigWigFile(ip_fh)
        Incutbw = BigWigFile(in_fh)

        testll = inf.readline().split()
        ml = int(testll[2]) - int(testll[1])
        pspan = pspan - ml // 2  # explicit floor division (Python 2 and 3)
        inf.seek(0)
        pBG, nBG = readBG(BGmatrix)
        nmer = left + right

        with open(outputfile, 'w') as outf:
            for line in inf:
                ll = line.split()
                chrom = ll[0]
                start = int(ll[1])
                end = int(ll[2])
                strand = ll[5]
                seq = genome[chrom][(start - pspan - left):
                                    (end + pspan + right)]
                pout = make_cut(pcutbw, ll, pspan, fetch_length)
                nout = make_cut(ncutbw, ll, pspan, fetch_length)
                Ipout = make_cut(Ipcutbw, ll, pspan, fetch_length)
                Inout = make_cut(Incutbw, ll, pspan, fetch_length)
                if strand == "-":
                    pout, nout = nout, pout
                    Ipout, Inout = Inout, Ipout
                if pout == 'NA':
                    continue
                if 'N' in seq.upper():
                    continue

                # plus contexts start one base before the minus contexts
                pseq = seq[:-1]
                nseq = seq[1:]
                p = []
                n = []
                for k in range(len(pseq) + 1 - nmer):
                    p.append(pBG[pseq[k:k + nmer].upper()])
                    n.append(nBG[nseq[k:k + nmer].upper()])
                if strand != '-':
                    pbglist = p
                    nbglist = n
                else:
                    pbglist = n[::-1]
                    nbglist = p[::-1]

                TC, FOS = makeTCFOS(pcutbw, ncutbw, ll, tspan, ml)
                newll = (ll + [TC, FOS] + pout + nout + Ipout + Inout
                         + pbglist + nbglist)
                outf.write("\t".join(map(str, newll)) + "\n")
def get_signal(inputfile, output, bwfiles, bwfolder, extend):
    """Append a 60-bin upstream/body/downstream profile per bigWig track.

    ``bwfiles`` is a comma-separated list of bigWig names; names that do not
    start with ``/`` are resolved against ``bwfolder`` (default ``./``).
    For each row of ``inputfile`` (rows whose chromosome contains ``_`` are
    skipped) and each track, 20 bins are computed for
    ``[start - extend, start)``, ``[start, end)`` and
    ``[end, end + extend)``; each bin is the summed signal divided by the bin
    length.  The 60 values are reversed for ``-`` strand rows.  When a track
    cannot be summarised, 60 zeros are appended instead.
    """
    signalbw = bwfiles.strip().strip(',').split(',')
    if not bwfolder:
        bwfolder = "./"
    if not bwfolder.endswith('/'):
        bwfolder += '/'

    nbins = 20
    handles = []
    try:
        for sb in signalbw:
            path = sb if sb.startswith('/') else bwfolder + sb
            handles.append(open(path, 'rb'))
        bwHs = [BigWigFile(fh) for fh in handles]

        with open(inputfile) as inf, open(output, 'w') as outf:
            for line in inf:
                ll = line.split()
                if "_" in ll[0]:
                    continue
                S = int(ll[1])
                E = int(ll[2])
                # Decide the orientation once, before signal columns are
                # appended to `ll`.
                reverse = len(ll) >= 6 and ll[5] == "-"
                flank_binlen = extend * 1.0 / nbins
                body_binlen = (E - S) * 1.0 / nbins
                for bwHandle in bwHs:
                    try:
                        signal1 = bwHandle.summarize(
                            ll[0], max(0, S - extend), S, nbins)
                        signal2 = bwHandle.summarize(ll[0], S, E, nbins)
                        signal3 = bwHandle.summarize(
                            ll[0], E, E + extend, nbins)
                        if signal1 is None or signal2 is None \
                                or signal3 is None:
                            addsig = [0] * (3 * nbins)
                        else:
                            addsig = (list(signal1.sum_data / flank_binlen)
                                      + list(signal2.sum_data / body_binlen)
                                      + list(signal3.sum_data / flank_binlen))
                    except Exception:
                        # best effort: an unreadable region yields zeros
                        addsig = [0] * (3 * nbins)
                    ll.extend(addsig[::-1] if reverse else addsig)
                outf.write("\t".join(map(str, ll)) + "\n")
    finally:
        for fh in handles:
            fh.close()
def count_cut_nmers(fp, w_plus, w_minus, lflank, rflank, single_nmer_cutoff,
                    sequence):
    """Count n-mer occurrences and the cuts associated with each n-mer.

    ``fp`` is an open BED-like file.  For every region the per-base cut
    counts are read from ``w_plus`` and ``w_minus`` and each position is
    associated with the ``lflank + rflank``-mer around it (the minus-strand
    n-mer is shifted by one base and passed through ``rev``).  Positions whose
    n-mer contains ``N`` or whose cut count exceeds ``single_nmer_cutoff``
    are ignored.

    Returns
    -------
    (seq_nmer_dict, cut_nmer_dict) : n-mer -> number of occurrences, and
    n-mer -> summed cut count.
    """
    genome = twobitreader.TwoBitFile(sequence)
    nmer = lflank + rflank
    seq_nmer_dict = {}
    cut_nmer_dict = {}

    with open(w_plus, 'rb') as plus_fh, open(w_minus, 'rb') as minus_fh:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        for line in fp:
            ll = line.split()
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            seq = genome[chrm][(start - lflank):(end + rflank)].upper()
            cp = list(
                w_plus_H.summarize(chrm, start, end, end - start).sum_data)
            cn = list(
                w_minus_H.summarize(chrm, start, end, end - start).sum_data)
            for k, (p_cut, n_cut) in enumerate(zip(cp, cn)):
                p_seq = seq[k:(k + nmer)]
                n_seq = seq[(k + 1):(k + nmer + 1)]
                if 'N' not in p_seq and p_cut <= single_nmer_cutoff:
                    cut_nmer_dict[p_seq] = cut_nmer_dict.get(p_seq, 0) + p_cut
                    seq_nmer_dict[p_seq] = seq_nmer_dict.get(p_seq, 0) + 1
                if 'N' not in n_seq and n_cut <= single_nmer_cutoff:
                    rev_n_seq = rev(n_seq)
                    cut_nmer_dict[rev_n_seq] = (
                        cut_nmer_dict.get(rev_n_seq, 0) + n_cut)
                    seq_nmer_dict[rev_n_seq] = (
                        seq_nmer_dict.get(rev_n_seq, 0) + 1)
    return seq_nmer_dict, cut_nmer_dict
def get_signal(inputfile, output, bwfiles, extend, N, bwfolder):
    """Append an ``N``-bin profile around the strand-aware start per track.

    ``bwfiles`` is a comma-separated list of bigWig names.  Names starting
    with ``/``, ``./`` or ``../`` are used as given; all others are resolved
    against ``bwfolder`` (default ``./``).  For each row of ``inputfile``
    (chromosomes containing ``_`` are skipped) the anchor is column 3 for
    ``-`` strand rows and column 2 otherwise, and the window
    ``[anchor - extend, anchor + extend)`` is summarised into ``N`` bins of
    summed signal divided by bin length.  Profiles are reversed for ``-``
    strand rows.  Unavailable data is written as ``N`` ``"na"`` entries and
    reported on stdout.
    """
    signalbw = bwfiles.strip().strip(',').split(',')
    if not bwfolder:
        bwfolder = "./"
    # The previous condition (`not bwfolder != ""`) could never be true here,
    # so a folder given without a trailing slash was glued onto the file name.
    if not bwfolder.endswith('/'):
        bwfolder += '/'

    handles = []
    try:
        for sb in signalbw:
            # `startswith` was previously called as a bare name (NameError).
            if sb.startswith(('/', './', '../')):
                handles.append(open(sb, 'rb'))
            else:
                handles.append(open(bwfolder + sb, 'rb'))
        bwHs = [BigWigFile(fh) for fh in handles]

        with open(inputfile) as inf, open(output, 'w') as outf:
            for line in inf:
                ll = line.split()
                if "_" in ll[0]:
                    continue
                if len(ll) >= 6 and ll[5] == "-":
                    start = int(ll[2])
                    strand_flap = 1
                else:
                    start = int(ll[1])
                    strand_flap = 0
                S = max(0, start - extend)
                E = start + extend
                binlen = (E - S) * 1.0 / N
                for bwHandle in bwHs:
                    try:
                        signal = bwHandle.summarize(ll[0], S, E, N)
                        if signal is None:
                            print('c1 ' + line)
                            addsig = ["na"] * N
                        else:
                            addsig = list(signal.sum_data * 1.0 / binlen)
                    except Exception:
                        # best effort: report and fill with placeholders
                        print('c2 ' + line)
                        addsig = ["na"] * N
                    if strand_flap == 1:
                        ll.extend(addsig[::-1])
                    else:
                        ll.extend(addsig)
                outf.write("\t".join(map(str, ll)) + "\n")
    finally:
        for fh in handles:
            fh.close()
def sitepro_scan(peak, outname, w_plus, w_minus, Cspan):
    """Write per-base cut proportions for both strands as bedGraph.

    For every base of every region in ``peak`` the proportion
    ``(cuts + 1) / sum(cuts + 1 over the surrounding 2 * Cspan bases)`` is
    rounded to 4 decimals and written to ``<outname>_propcutPlus.bdg`` and
    ``<outname>_propcutMinus.bdg``.  Regions closer than ``Cspan`` to the
    chromosome start are printed to stdout and skipped.
    """
    roundN = 4
    with open(peak) as inf, \
            open(w_plus, 'rb') as plus_fh, \
            open(w_minus, 'rb') as minus_fh, \
            open(outname + "_propcutPlus.bdg", 'w') as outf_propPlus, \
            open(outname + "_propcutMinus.bdg", 'w') as outf_propMinus:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            if start - Cspan < 0:
                print(ll)
                continue
            width = end - start + 2 * Cspan
            plus_obj = w_plus_H.summarize(
                chrm, start - Cspan, end + Cspan, width)
            minus_obj = w_minus_H.summarize(
                chrm, start - Cspan, end + Cspan, width)
            # Fallback vectors are floats so the proportion below is a true
            # division; an int array floor-divides to 0 on Python 2.
            if not plus_obj:
                plus_vector = numpy.ones(width, dtype=float)
            else:
                plus_vector = plus_obj.sum_data + 1
            if not minus_obj:
                minus_vector = numpy.ones(width, dtype=float)
            else:
                minus_vector = minus_obj.sum_data + 1

            for outpos in range(Cspan, (end - start + Cspan)):
                window = slice(outpos - Cspan, outpos + Cspan)
                this_plus_cuts_prop = round(
                    plus_vector[outpos] / sum(plus_vector[window]), roundN)
                this_minus_cuts_prop = round(
                    minus_vector[outpos] / sum(minus_vector[window]), roundN)
                out_start = start + outpos - Cspan
                out_end = out_start + 1
                outf_propPlus.write("\t".join(map(
                    str, [chrm, out_start, out_end, this_plus_cuts_prop]))
                    + "\n")
                outf_propMinus.write("\t".join(map(
                    str, [chrm, out_start, out_end, this_minus_cuts_prop]))
                    + "\n")
def sitepro_scan(peak, outp, outn, w_plus, w_minus, bgmatrix, span, gen,
                 lflank, rflank):
    """Write observed cuts, window totals, bias and bias-expected cuts.

    For each base of each region in ``peak`` one line is written per strand
    (``outp`` for plus, ``outn`` for minus) containing: the observed cut
    count, the total cuts in the surrounding ``2 * span`` window, the
    sequence bias at the base, and ``bias / window_bias_total * total``.
    Bias values come from the tables returned by ``readBG(bgmatrix)`` keyed
    by ``lflank + rflank``-mers.  Regions too close to the chromosome start
    or containing ``N`` are skipped.
    """
    nmer = lflank + rflank
    genome = twobitreader.TwoBitFile(gen)
    pBG, nBG = readBG(bgmatrix)

    with open(peak) as inf, \
            open(w_plus, 'rb') as plus_fh, \
            open(w_minus, 'rb') as minus_fh, \
            open(outp, 'w') as outfp, \
            open(outn, 'w') as outfn:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            # window plus flank would run off the chromosome start
            if start - span - lflank <= 0:
                continue
            width = end - start + 2 * span
            p_sum = list(w_plus_H.summarize(
                chrm, start - span, end + span, width).sum_data)
            n_sum = list(w_minus_H.summarize(
                chrm, start - span, end + span, width).sum_data)

            seq = genome[chrm][(start - span - lflank):(end + span + rflank)]
            if 'N' in seq.upper():
                continue
            # minus-strand contexts are shifted one base to the right
            pseq = seq[:-1]
            nseq = seq[1:]
            p = []
            n = []
            for k in range(len(pseq) + 1 - nmer):
                p.append(pBG[pseq[k:(k + nmer)].upper()])
                n.append(nBG[nseq[k:(k + nmer)].upper()])

            for bp in range(len(p_sum) - 2 * span):
                ptotal = sum(p_sum[bp:(bp + 2 * span)])
                ntotal = sum(n_sum[bp:(bp + 2 * span)])
                pc = int(p_sum[bp + span])
                nc = int(n_sum[bp + span])
                pbias = p[bp + span]
                nbias = n[bp + span]
                pbgtotal = sum(p[bp:(bp + span * 2)])
                nbgtotal = sum(n[bp:(bp + span * 2)])
                paraw = (pbias / pbgtotal) * ptotal
                naraw = (nbias / nbgtotal) * ntotal
                outfp.write(
                    "\t".join(map(str, [pc, ptotal, pbias, paraw])) + "\n")
                # The minus file previously repeated the plus-strand
                # expectation (paraw) here.
                outfn.write(
                    "\t".join(map(str, [nc, ntotal, nbias, naraw])) + "\n")
def summary(bwfile1, bwfile2, bwfile_add, bedfile, topnumber, out):
    """Rank BED regions by combined mean signal and dump per-base profiles.

    Pass 1 computes, for every region whose chromosome exists in both
    ``bwfile1`` and ``bwfile2``, the sum of the two mean signals (column 4 of
    each row is overwritten with ``"-"``).  Pass 2 writes the ``topnumber``
    highest-ranked regions to ``out`` with comma-joined per-base profiles
    from ``bwfile1``, ``bwfile2`` and every file in ``bwfile_add``.
    Elapsed times of both passes are printed.
    """
    chroms1 = set(
        node['key'] for node in BwIO(bwfile1).chromosomeTree['nodes'])
    chroms2 = set(
        node['key'] for node in BwIO(bwfile2).chromosomeTree['nodes'])

    def region_mean(bw, chrom, start, end):
        # mean over the region; 0 when no base is covered
        stats = bw.summarize(chrom, start, end, 1)
        if stats.valid_count == 0:
            return 0
        return (stats.sum_data / stats.valid_count)[0]

    def profile(bw, chrom, start, end):
        # comma-joined per-base summed signal
        stats = bw.summarize(chrom, start, end, end - start)
        return ",".join(map(str, list(stats.sum_data)))

    extra_handles = []
    try:
        with open(bwfile1, 'rb') as fh1, open(bwfile2, 'rb') as fh2:
            bwHandle1 = BigWigFile(fh1)
            bwHandle2 = BigWigFile(fh2)

            total_result = []
            t = time.time()
            with open(bedfile) as inf:
                for line in inf:
                    ll = line.split()
                    ll[3] = "-"
                    if ll[0] in chroms1 and ll[0] in chroms2:
                        start, end = int(ll[1]), int(ll[2])
                        mean_value1 = region_mean(bwHandle1, ll[0], start, end)
                        mean_value2 = region_mean(bwHandle2, ll[0], start, end)
                        total_result.append(ll + [mean_value1 + mean_value2])
            total_result.sort(reverse=True, key=lambda x: x[-1])

            for path in bwfile_add:
                extra_handles.append(open(path, 'rb'))
            bwHs = [BigWigFile(fh) for fh in extra_handles]

            with open(out, 'w') as outf:
                print("scaning 1st  %s" % (time.time() - t))
                t = time.time()
                for i in range(min(len(total_result), topnumber)):
                    ll = total_result[i]
                    start, end = int(ll[1]), int(ll[2])
                    additional_value1 = profile(bwHandle1, ll[0], start, end)
                    additional_value2 = profile(bwHandle2, ll[0], start, end)
                    result = [str(v) for v in
                              ll + [additional_value1, additional_value2]]
                    for bwH in bwHs:
                        result.append(profile(bwH, ll[0], start, end))
                    outf.write("\t".join(result) + "\n")
    finally:
        for fh in extra_handles:
            fh.close()
    print("scaning 2nd  %s" % (time.time() - t))
def Main():
    """Compare mean percentile signal of two bigWigs per gene and plot it.

    Reads genes (name, chrom, start, end) from ``args.geneList``, computes
    ``mean(signal) + 50`` over each gene in ``args.percentile1`` and
    ``args.percentile2``, writes one line per gene to
    ``args.output + ".list"`` and saves a scatter plot with two dashed
    threshold lines to ``args.output + "_dot.eps"``.  Genes without data in
    either file are skipped.
    """
    global args
    args = ParseArg()
    perc_array1 = []
    perc_array2 = []

    # bigWig is a binary format: open in 'rb' and close deterministically.
    with open(args.percentile1, 'rb') as fh1, \
            open(args.percentile2, 'rb') as fh2:
        bw1 = BigWigFile(fh1)
        bw2 = BigWigFile(fh2)
        gout = WriteToFile(args.output + ".list")
        for line in ReadFromFile(args.geneList):
            row = line.strip().split()
            gene = row[0]
            chrom = row[1]
            start = int(row[2])
            end = int(row[3])
            array1 = bw1.get_as_array(chrom, start, end)
            array2 = bw2.get_as_array(chrom, start, end)
            if array1 is not None and array2 is not None:
                perc1 = np.mean(array1) + 50
                perc2 = np.mean(array2) + 50
                gout.write('%s\t%s\t%d\t%d\t%f\t%f\n'
                           % (gene, chrom, start, end, perc1, perc2))
                perc_array1.append(perc1)
                perc_array2.append(perc2)

    # scatter plot
    sns.set()
    plt.scatter(perc_array1, perc_array2, marker=',', color='black', s=1,
                alpha=0.1)
    # gca() always returns the axes just drawn on; a bare plt.axes() may
    # create a fresh Axes depending on the matplotlib version.
    ax = plt.gca()
    ax.set_aspect('equal')
    plt.xlabel(args.x, fontsize=20)
    plt.ylabel(args.y, fontsize=20)
    plt.ylim(0, 100)
    plt.xlim(0, 100)
    plt.tick_params(axis='both', which='major', labelsize=20, width=2)
    ax.set_yticks([0, 20, 40, 60, 80, 100])
    ax.set_xticks([0, 20, 40, 60, 80, 100])
    # lines showing the threshold to call changed domains
    x1, y1 = [0, 89.5], [10.5, 100]
    x2, y2 = [10.5, 100], [0, 89.5]
    for side in ('left', 'bottom', 'right', 'top'):
        ax.spines[side].set_linewidth(2)
    plt.subplots_adjust(bottom=.2, left=.2)
    plt.plot(x1, y1, linewidth=0.5, linestyle='--', color='red')
    plt.plot(x2, y2, linewidth=0.5, linestyle='--', color='red')
    plt.savefig(args.output + '_dot.eps', format='eps')
    plt.close()
    logging("DONE!!!")
def get_signal(inputfile, output, bwfiles, bwfolder, extend, N):
    """Append an ``N``-bin profile centred on each region per bigWig track.

    ``bwfiles`` is a comma-separated list of bigWig names; names that do not
    start with ``/`` are resolved against ``bwfolder`` (default ``./``).
    For each row of ``inputfile`` (chromosomes containing ``_`` are skipped)
    the window ``[center - extend, center + extend)`` is summarised into
    ``N`` bins of summed signal divided by ``2 * extend / N``.  Profiles are
    reversed for ``-`` strand rows; unavailable data becomes ``N`` zeros.
    """
    signalbw = bwfiles.strip().strip(',').split(',')
    if not bwfolder:
        bwfolder = "./"
    if not bwfolder.endswith('/'):
        bwfolder += '/'

    handles = []
    try:
        for sb in signalbw:
            path = sb if sb.startswith('/') else bwfolder + sb
            handles.append(open(path, 'rb'))
        bwHs = [BigWigFile(fh) for fh in handles]

        binlen = extend * 2.0 / N
        with open(inputfile) as inf, open(output, 'w') as outf:
            for line in inf:
                ll = line.split()
                if "_" in ll[0]:
                    continue
                C = (int(ll[1]) + int(ll[2])) // 2
                S = max(0, C - extend)
                E = C + extend
                # Decide the orientation once, before signal columns are
                # appended to `ll`.
                reverse = len(ll) >= 6 and ll[5] == "-"
                for bwHandle in bwHs:
                    try:
                        signal = bwHandle.summarize(ll[0], S, E, N)
                        if signal is None:
                            addsig = [0] * N
                        else:
                            addsig = list(signal.sum_data / binlen)
                    except Exception:
                        # best effort: an unreadable region yields zeros
                        addsig = [0] * N
                    ll.extend(addsig[::-1] if reverse else addsig)
                outf.write("\t".join(map(str, ll)) + "\n")
    finally:
        for fh in handles:
            fh.close()
def get_signal(inputfile, output, Pbw, Nbw, score_range):
    """Append total cuts and two strand-imbalance footprint scores per row.

    For each row of ``inputfile`` whose chromosome exists in ``Pbw``, 200
    single-base bins starting at ``max(start - 100, 0)`` are read from the
    plus (``Pbw``) and minus (``Nbw``) tracks.  Cuts within ``score_range``
    bases upstream and downstream of the motif are summed per strand
    (oriented by column 6) and combined into:

    * ``FPscore1`` - log2 ratio with a pseudocount of 0.2
    * ``FPscore2`` - the same contrast on square roots

    ``DNase`` (all cuts in the window), ``FPscore1`` and ``FPscore2`` are
    appended.  A row whose strand is neither ``+`` nor ``-`` is printed and
    the process exits with status 1.
    """
    persudo = 0.2
    known_chroms = set(
        node['key'] for node in BwIO(Pbw).chromosomeTree['nodes'])

    with open(Pbw, 'rb') as p_fh, open(Nbw, 'rb') as n_fh, \
            open(inputfile) as inf, open(output, 'w') as outf:
        PH = BigWigFile(p_fh)
        NH = BigWigFile(n_fh)
        for line in inf:
            ll = line.split()
            if ll[0] not in known_chroms:
                continue
            motif_len = int(ll[2]) - int(ll[1])
            left = max(int(ll[1]) - 100, 0)
            right = int(ll[1]) + 100
            Psignal = list(PH.summarize(ll[0], left, right, 200).sum_data)
            Nsignal = list(NH.summarize(ll[0], left, right, 200).sum_data)
            DNase = sum(Psignal) + sum(Nsignal)

            # index 100 is the motif start inside the 200-bin window
            before = slice(100 - score_range, 100)
            after = slice(100 + motif_len, 100 + motif_len + score_range)
            if ll[5] == '+':
                S_up_same = sum(Psignal[before])
                S_up_diff = sum(Nsignal[before])
                S_down_same = sum(Psignal[after])
                S_down_diff = sum(Nsignal[after])
            elif ll[5] == '-':
                S_up_same = sum(Nsignal[after])
                S_up_diff = sum(Psignal[after])
                S_down_same = sum(Nsignal[before])
                S_down_diff = sum(Psignal[before])
            else:
                print(line)
                sys.exit(1)

            FPscore1 = math.log(
                (S_up_same + persudo) * (S_down_diff + persudo)
                / ((S_up_diff + persudo) * (S_down_same + persudo)), 2)
            FPscore2 = (math.sqrt(S_up_same) + math.sqrt(S_down_diff)
                        - math.sqrt(S_up_diff) - math.sqrt(S_down_same))
            ll.extend([DNase, FPscore1, FPscore2])
            outf.write("\t".join(map(str, ll)) + "\n")
def get_signal(inputfile, output, bwfiles, bwfolder, extend):
    """Write one line per base with the rounded signal of every track.

    ``bwfiles`` is a comma-separated list of bigWig names; names that do not
    start with ``/`` are resolved against ``bwfolder`` (default ``./``).
    For each row of ``inputfile`` (chromosomes containing ``_`` are skipped)
    every base of ``[start, end)`` produces a line
    ``chrom, pos, pos + 1, value_track1, value_track2, ...`` with values
    rounded to 4 decimals, or ``"NA"`` when a track has no data.
    ``extend`` is accepted for interface compatibility and is not used.
    """
    signalbw = bwfiles.strip().strip(',').split(',')
    # An empty folder used to become "/", which resolved relative track
    # names against the filesystem root.
    if not bwfolder:
        bwfolder = "./"
    if not bwfolder.endswith('/'):
        bwfolder += '/'

    handles = []
    try:
        for sb in signalbw:
            path = sb if sb.startswith('/') else bwfolder + sb
            handles.append(open(path, 'rb'))
        bwHs = [BigWigFile(fh) for fh in handles]

        with open(inputfile) as inf, open(output, 'w') as outf:
            for line in inf:
                ll = line.split()
                if "_" in ll[0]:
                    continue
                S = int(ll[1])
                E = int(ll[2])
                width = E - S
                outdata = []
                for bwHandle in bwHs:
                    try:
                        signal = bwHandle.summarize(ll[0], S, E, width)
                        if signal:
                            # a real list: it is indexed per position below
                            thisdata = [round(v, 4) for v in signal.sum_data]
                        else:
                            thisdata = ["NA"] * width
                    except Exception:
                        # best effort: an unreadable region yields NA
                        thisdata = ["NA"] * width
                    outdata.append(thisdata)
                for pos in range(len(outdata[0])):
                    newll = [ll[0], S + pos, S + pos + 1]
                    newll.extend(track[pos] for track in outdata)
                    outf.write("\t".join(map(str, newll)) + "\n")
    finally:
        for fh in handles:
            fh.close()
def sitepro_scan(peak, outname, w_plus, w_minus, Cspan):
    """Write per-base cuts and surrounding-window cut totals for both strands.

    For every base of every region in ``peak`` a line
    ``chrom:start-end, plus_cuts, plus_window_sum, minus_cuts,
    minus_window_sum`` is written to ``<outname>_Cuts.txt``; the window spans
    ``Cspan`` bases on each side.  Regions closer than ``Cspan`` to the
    chromosome start are printed to stdout and skipped; a track without data
    contributes zeros.
    """
    with open(peak) as inf, \
            open(w_plus, 'rb') as plus_fh, \
            open(w_minus, 'rb') as minus_fh, \
            open(outname + "_Cuts.txt", 'w') as outf:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            if start - Cspan < 0:
                print(ll)
                continue
            width = end - start + 2 * Cspan
            plus_obj = w_plus_H.summarize(
                chrm, start - Cspan, end + Cspan, width)
            minus_obj = w_minus_H.summarize(
                chrm, start - Cspan, end + Cspan, width)
            if not plus_obj:
                plus_vector = numpy.array([0] * width)
            else:
                plus_vector = plus_obj.sum_data
            if not minus_obj:
                minus_vector = numpy.array([0] * width)
            else:
                minus_vector = minus_obj.sum_data

            for outpos in range(Cspan, (end - start + Cspan)):
                window = slice(outpos - Cspan, outpos + Cspan)
                out_start = start + outpos - Cspan
                out_end = out_start + 1
                outf.write("\t".join(map(str, [
                    chrm + ":" + str(out_start) + "-" + str(out_end),
                    plus_vector[outpos],
                    sum(plus_vector[window]),
                    minus_vector[outpos],
                    sum(minus_vector[window]),
                ])) + "\n")
def make_template(data, flank, pflank, topmotif, out, pbw, mbw):
    """Build and plot the mean strand-specific cut profile around motifs.

    Rows of ``data`` are sorted by column 5 (descending).  For each row the
    per-base cuts in ``[start - flank, start + flank)`` are read from ``pbw``
    and ``mbw``; a slice of ``2 * pflank`` bases around the motif centre is
    collected into the "same strand" and "opposite strand" sets (tracks are
    reversed and swapped for ``-`` strand motifs).  The per-position means
    from ``apply_mean`` are passed to ``plot_template(P, M, out)``.

    The motif length is taken from the first row.  ``topmotif`` is accepted
    for interface compatibility and is not used.
    """
    with open(data) as inf:
        templatelist = [line.split() for line in inf]
    ml = int(templatelist[0][2]) - int(templatelist[0][1])
    templatelist.sort(key=lambda x: float(x[4]), reverse=True)

    # explicit floor division keeps the slice bounds integral on Python 3
    center = flank + 1 + ml // 2
    fwd = slice(center - pflank, center + pflank)
    rev_window = slice(center - ml - pflank, center - ml + pflank)

    pp = []
    pm = []
    with open(pbw, 'rb') as plus_fh, open(mbw, 'rb') as minus_fh:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        for ll in templatelist:
            left = int(ll[1]) - flank
            right = int(ll[1]) + flank
            p_sum = list(
                w_plus_H.summarize(ll[0], left, right, 2 * flank).sum_data)
            m_sum = list(
                w_minus_H.summarize(ll[0], left, right, 2 * flank).sum_data)
            if ll[5] == "+":
                pp.append(p_sum[fwd])
                pm.append(m_sum[fwd])
            if ll[5] == '-':
                pm.append(p_sum[::-1][rev_window])
                pp.append(m_sum[::-1][rev_window])

    meanp = apply_mean(pp)
    meanm = apply_mean(pm)
    P = [meanp[i] for i in range(len(meanp))]
    M = [meanm[i] for i in range(len(meanp))]
    plot_template(P, M, out)
def get_signal(inputfile, output, plusBW, minusBW, bwfolder, extend):
    """Append per-base plus and minus strand signal to each row.

    ``plusBW`` and ``minusBW`` are resolved against ``bwfolder`` (default
    ``./``).  For each row of ``inputfile`` (chromosomes containing ``_`` are
    skipped) the window ``[start - extend, end + extend)`` is read at one bin
    per base from both tracks.  ``+`` rows get plus values followed by minus
    values; ``-`` rows get reversed minus values followed by reversed plus
    values.  When either track cannot be read the row gets ``"NA"``
    placeholders of the same width.
    """
    if not bwfolder:
        bwfolder = "./"
    if not bwfolder.endswith('/'):
        bwfolder += '/'

    with open(bwfolder + plusBW, 'rb') as plus_fh, \
            open(bwfolder + minusBW, 'rb') as minus_fh, \
            open(inputfile) as inf, \
            open(output, 'w') as outf:
        plus = BigWigFile(plus_fh)
        minus = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            if "_" in ll[0]:
                continue
            reverse = len(ll) >= 6 and ll[5] == "-"
            S = max(0, int(ll[1]) - extend)
            E = int(ll[2]) + extend

            # Start from placeholders on every row.  Previously a failed or
            # empty lookup left `thisdata` unbound (NameError on the first
            # row) or silently reused the previous row's values.
            thisdata = ["NA"] * (2 * (E - S))
            try:
                plus_signal = plus.summarize(ll[0], S, E, (E - S))
                minus_signal = minus.summarize(ll[0], S, E, (E - S))
                if plus_signal and minus_signal:
                    plus_tmp = list(plus_signal.sum_data)
                    minus_tmp = list(minus_signal.sum_data)
                    if reverse:
                        thisdata = minus_tmp[::-1] + plus_tmp[::-1]
                    else:
                        thisdata = plus_tmp + minus_tmp
            except Exception:
                # best effort: keep the placeholders for this row
                pass
            ll.extend(thisdata)
            outf.write("\t".join(map(str, ll)) + "\n")
def main():
    """Append the mean bigWig signal of each BED interval as a new column.

    Command line: ``BEDFILE BIGWIG OUTPUT``.  Every input line is echoed to
    stdout, and the mean signal over the interval (rounded to 5 decimals) is
    appended before the line is written to OUTPUT.  With ``--debug`` only
    the first 10 lines are processed.
    """
    p = optparse.OptionParser(__doc__)
    p.add_option('-A', '--absolute', action='store_true', dest='A',
                 default=False, help='absolute threshold')
    p.add_option('-s', '--standard_background', action='store_true',
                 dest='stdbg')
    p.add_option('-D', '--debug', action='store_true', dest='debug')
    options, args = p.parse_args()
    if len(args) < 3:
        p.error("expected three arguments: BEDFILE BIGWIG OUTPUT")

    debug_c = 0
    # 'rU' was removed in Python 3.11 (text mode already handles universal
    # newlines there); bigWig is binary and must be opened with 'rb'.
    with open(args[0]) as bedfile, \
            open(args[1], 'rb') as bw_fh, \
            open(args[2], 'w') as bedout:
        BW = BigWigFile(file=bw_fh)
        for line in bedfile:
            print(line)
            fields = line.strip().split('\t')
            x = BW.query(fields[0], int(fields[1]), int(fields[2]), 1)
            fields.append(str(round(x[0]['mean'], 5)))
            bedout.write("\t".join(fields) + "\n")
            if options.debug:
                debug_c += 1
                if debug_c >= 10:
                    break


if __name__ == '__main__':
    main()
def test_summaries_from_file():
    """Generator test: yield one summary check per expectation line.

    Each line of ``test.expectation`` holds
    ``chrom start end n_bins statistic value...``; the bigWig summary for
    that query is compared against the listed values for the ``mean``,
    ``min`` and ``max`` statistics.
    """
    bigwig = BigWigFile(file=open("test_data/bbi_tests/test.bw", 'rb'))

    def check_summary(line):
        columns = line.split()
        chrom = columns[0]
        start, end, n_bins = int(columns[1]), int(columns[2]), int(columns[3])
        statistic = columns[4]
        expected = [float(text.replace('n/a', 'NaN')) for text in columns[5:]]
        summarized = bigwig.summarize(chrom, start, end, n_bins)
        if statistic == 'mean':
            print(summarized.sum_data / summarized.valid_count)
            print(expected)
            assert allclose(
                summarized.sum_data / summarized.valid_count, expected)
        elif statistic == 'min':
            assert allclose(summarized.min_val, expected)
        elif statistic == 'max':
            assert allclose(summarized.max_val, expected)

    line_no = 0
    for line in open("test_data/bbi_tests/test.expectation"):
        case = partial(check_summary, line)
        case.description = "Test summaries line %d: %s" % (line_no, line[:40])
        yield (case, )
        line_no += 1
def summary(bwfile, bedfile, topnumber, out):
    """Rank BED regions by mean signal and dump per-base profiles.

    Pass 1 computes the mean signal of every region whose chromosome exists
    in ``bwfile`` (column 4 of each row is overwritten with ``"-"``).
    Pass 2 writes the ``topnumber`` highest-ranked regions (or all of them
    if fewer exist) to ``out`` with the mean and a comma-joined per-base
    profile appended.  Elapsed times of both passes are printed.
    """
    known_chroms = set(
        node['key'] for node in BwIO(bwfile).chromosomeTree['nodes'])

    with open(bwfile, 'rb') as bw_fh:
        bwHandle = BigWigFile(bw_fh)

        total_result = []
        t = time.time()
        with open(bedfile) as inf:
            for line in inf:
                ll = line.split()
                ll[3] = "-"
                if ll[0] in known_chroms:
                    stats = bwHandle.summarize(
                        ll[0], int(ll[1]), int(ll[2]), 1)
                    if stats.valid_count == 0:
                        mean_value = 0
                    else:
                        mean_value = (stats.sum_data / stats.valid_count)[0]
                    total_result.append(ll + [mean_value])
        total_result.sort(reverse=True, key=lambda x: x[-1])

        with open(out, 'w') as outf:
            print("scaning 1st  %s" % (time.time() - t))
            t = time.time()
            # Clamp to the number of ranked regions; indexing by
            # range(topnumber) raised IndexError on short inputs.
            for i in range(min(len(total_result), topnumber)):
                ll = total_result[i]
                start, end = int(ll[1]), int(ll[2])
                stats = bwHandle.summarize(ll[0], start, end, end - start)
                additional_value = ",".join(map(str, list(stats.sum_data)))
                result = [str(v) for v in ll + [additional_value]]
                outf.write("\t".join(result) + "\n")
    print("scaning 2nd  %s" % (time.time() - t))
def Readbw(bwfile, chrm, start, end, n):
    """Return binned mean scores of ``bwfile`` over ``chrm:start-end``.

    The region is split into ``(end - start) // n`` bins; each score is the
    bin's summed signal divided by ``sudocount(valid_count)``.

    Returns a list with one score per bin.
    """
    start = int(start)
    end = int(end)
    with open(bwfile, 'rb') as handle:
        # explicit floor division: the bin count must be an integer
        stats = BigWigFile(handle).summarize(
            chrm, start, end, (end - start) // n)
    # a real list (not a lazy map) so the array division works on Python 3
    counts = [sudocount(c) for c in stats.valid_count]
    return list(stats.sum_data / counts)
def test_summaries_from_file():
    """Generator test: yield ``(check_summary, line)`` per expectation line.

    Each line of ``test.expectation`` holds
    ``chrom start end n_bins statistic value...``; the bigWig summary for
    that query is compared against the listed values for the ``mean``,
    ``min`` and ``max`` statistics.
    """
    # bigWig is a binary format; text mode corrupts reads on some platforms
    # and fails outright on Python 3.
    bw = BigWigFile(file=open("test_data/bbi_tests/test.bw", 'rb'))

    def check_summary(line):
        fields = line.split()
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        n = int(fields[3])
        t = fields[4]
        values = [float(v.replace('n/a', 'NaN')) for v in fields[5:]]
        sd = bw.summarize(chrom, start, end, n)
        if t == 'mean':
            print(sd.sum_data / sd.valid_count)
            print(values)
            assert allclose(sd.sum_data / sd.valid_count, values)
        elif t == 'min':
            assert allclose(sd.min_val, values)
        elif t == 'max':
            assert allclose(sd.max_val, values)

    for line in open("test_data/bbi_tests/test.expectation"):
        yield check_summary, line
def refine_with_summit(_soft, _mark, _tissue):
    """Re-centre peaks on their signal summit and merge the 2 kb windows.

    For every peak in ``top_bed/<soft>.<mark>.<tissue>.bed`` the bigWig
    interval with the highest positive value is located; a window of
    ``[summit - 1000, summit + 999]`` (or ``[1, 2000]`` near the chromosome
    start) is written to ``enrich_bed``.  Peaks without any positive signal
    are skipped.  The windows are then sorted and merged with bedtools, and
    ``../get_enrich.sh`` is run on the merged file.
    """
    peak_path = "/Data/adam/dnase/top_bed/{0}.{1}.{2}.bed".format(
        _soft, _mark, _tissue)
    bw_path = "/Data/adam/dnase/bigwig/{0}.{1}.rep0.bw".format(_mark, _tissue)
    enrich_path = "/Data/adam/dnase/enrich_bed/{0}.{1}.{2}.bed".format(
        _soft, _mark, _tissue)

    with open(peak_path) as peak_fh:
        _temp_peak = [row.rstrip().split('\t') for row in peak_fh]

    with open(bw_path, 'rb') as _temp_bw, \
            open(enrich_path, 'w') as _temp_enrich:
        _bw = BigWigFile(file=_temp_bw)
        for line in _temp_peak:
            vals = tuple(_bw.get(line[0], int(line[1]), int(line[2])))
            # Reset per peak: without this a peak with no positive value
            # reused the previous peak's summit (or hit a NameError).
            summit = None
            maxs = 0
            for _key in vals:
                if float(_key[2]) > maxs:
                    maxs = float(_key[2])
                    summit = _key[:2]
            if summit is None:
                continue
            summit_p = int((float(summit[0]) + float(summit[1])) / 2)
            if summit_p - 1000 > 0:
                _temp_enrich.write("{0}\t{1}\t{2}\n".format(
                    line[0], str(summit_p - 1000), str(summit_p + 999)))
            else:
                _temp_enrich.write("{0}\t{1}\t{2}\n".format(line[0], 1, 2000))

    # The output file is closed (flushed) before the shell commands read it.
    sh('sort -k 1,1 -k 2g,2g /Data/adam/dnase/enrich_bed/{0}.{1}.{2}.bed'
       '| bedtools merge -i stdin '
       '>/Data/adam/dnase/enrich_merge_bed/{0}.{1}.{2}.bed'
       .format(_soft, _mark, _tissue))
    sh('bash ../get_enrich.sh '
       '/Data/adam/dnase/enrich_merge_bed/{0}.{1}.{2}.bed {1} {2} {3}'
       .format(_soft, _mark, _tissue, _soft))
def getChromatinDataSeries(bigwigFile, libraryTable, sgInfoTable, tssTable,
                           colname='', naValue=0):
    """Return the mean bigWig signal over each sgRNA as a pandas Series.

    For every row of ``sgInfoTable`` the interval between the PAM coordinate
    and ``pam +/- length`` (by strand) is looked up on the chromosome of the
    row's ``(gene_name, transcript_list)`` entry in ``tssTable``.  Rows
    without a matching gene (negative controls) or without signal get NaN,
    which is finally replaced by ``naValue``.

    The result is indexed by ``libraryTable.index`` and named ``colname``.
    """
    chromDict = tssTable['chromosome'].to_dict()
    chromatinScores = []

    # bigWig is a binary format: open in 'rb' and close when done.
    with open(bigwigFile, 'rb') as bw_fh:
        bwindex = BigWigFile(bw_fh)
        for name, sgInfo in sgInfoTable.iterrows():
            geneTup = (sgInfo['gene_name'],
                       ','.join(sgInfo['transcript_list']))
            if geneTup not in chromDict:  # negative controls
                chromatinScores.append(np.nan)
                continue

            pam = sgInfo['pam coordinate']
            if sgInfo['strand'] == '+':
                sgRange = pam + sgInfo['length']
            else:
                sgRange = pam - sgInfo['length']

            chromatinArray = bwindex.get_as_array(
                chromDict[geneTup], min(pam, sgRange), max(pam, sgRange))
            if chromatinArray is not None and len(chromatinArray) > 0:
                chromatinScores.append(np.nanmean(chromatinArray))
            else:
                # often chrY when using K562 data
                chromatinScores.append(np.nan)

    chromatinSeries = pd.Series(
        chromatinScores, index=libraryTable.index, name=colname)
    return chromatinSeries.fillna(naValue)
def get_signal(inputfile, output, signalbw, extend):
    """Append the mean signal around each region centre for every track.

    ``signalbw`` is a comma-separated list of bigWig paths.  For each row of
    ``inputfile`` whose chromosome exists in the first track, the mean signal
    over ``[center - extend, center + 1 + extend)`` is appended per track
    (``'0'`` when no base is covered).  A row is written only if every track
    could be summarised; otherwise it is dropped.
    """
    signalbw = signalbw.strip().strip(',').split(',')
    known_chroms = set(
        node['key'] for node in BwIO(signalbw[0]).chromosomeTree['nodes'])

    handles = []
    try:
        for k in signalbw:
            handles.append(open(k, 'rb'))
        bwHandle = [BigWigFile(fh) for fh in handles]

        with open(inputfile) as inf, open(output, 'w') as outf:
            for line in inf:
                ll = line.split()
                if ll[0] not in known_chroms:
                    continue
                mid = (int(ll[1]) + int(ll[2])) // 2
                left = max(0, mid - extend)
                right = mid + 1 + extend
                values = []
                for bwH in bwHandle:
                    try:
                        signal = bwH.summarize(ll[0], left, right, 1)
                    except Exception:
                        break
                    if signal is None:
                        # track has no data here: drop the row, same as a
                        # failed lookup
                        break
                    if float(signal.valid_count[0]) == 0:
                        values.append('0')
                    else:
                        values.append(str(float(
                            signal.sum_data[0] / signal.valid_count[0])))
                else:
                    # reached only when no track broke out of the loop
                    outf.write("\t".join(ll + values) + "\n")
    finally:
        for fh in handles:
            fh.close()
def createMappabilityList(fragmentsMap, bwfile, fragmentCount, options):
    """Return the mean mappability of every fragment as a float array.

    ``fragmentsMap`` maps a fragment id to ``(chrom, start, end)``; the
    result has ``fragmentCount`` entries indexed by fragment id.  Fragments
    whose mean is NaN or cannot be queried get 0.  With
    ``options.vverbose`` progress and failures are reported.
    """
    # lazy load
    from bx.bbi.bigwig_file import BigWigFile

    # keep record which fragment has decent mappability
    # (np.float was removed from NumPy; the builtin float is equivalent)
    mappable = np.zeros((fragmentCount, ), dtype=float)

    with open(bwfile, 'rb') as bw_fh:
        bw = BigWigFile(bw_fh)
        for fragmentId, (chrom, start, end) in fragmentsMap.items():
            if options.vverbose:
                sys.stdout.write(
                    "- process %s %d-%d \n" % (chrom, start, end))
            try:
                mappable[fragmentId] = bw.query(chrom, start, end, 1)[0]["mean"]
                if np.isnan(mappable[fragmentId]):
                    mappable[fragmentId] = 0
            except Exception:
                # problem with invalid values: best effort, count as 0
                mappable[fragmentId] = 0.
                if options.vverbose:
                    sys.stderr.write(
                        "Problem with bw file at %s %d-%d\n"
                        % (chrom, start, end))
                    print(traceback.format_exc())
    return mappable
def load_annos(args):
    """
    Populate a dictionary of Tabixfile handles for
    each annotation file.  Other modules can then
    access a given handle and fetch data from it
    as follows:

    dbsnp_handle = annotations.annos['dbsnp']
    hits = dbsnp_handle.fetch(chrom, start, end)

    Files ending in ``.gz`` are opened with ``pysam.Tabixfile`` and files
    ending in ``.bw`` with ``BigWigFile``; other files are ignored.  The
    process exits with a help message if a file cannot be opened.
    """
    anno_files = get_anno_files(args)
    for anno, path in anno_files.items():
        try:
            # .gz denotes Tabix files.
            if path.endswith(".gz"):
                annos[anno] = pysam.Tabixfile(path)
            # .bw denotes BigWig files.  The handle stays open on purpose:
            # it is stored in `annos` for later queries.  bigWig is binary,
            # hence 'rb'.
            elif path.endswith(".bw"):
                annos[anno] = BigWigFile(open(path, 'rb'))
        except IOError:
            sys.exit("Gemini cannot open this annotation file: %s. \n"
                     "Have you installed the annotation files?  If so, "
                     "have they been moved or deleted? Exiting...\n\n"
                     "For more details:\n\t"
                     "http://gemini.readthedocs.org/en/latest/content/"
                     "installation.html#installing-annotation-files\n"
                     % path)
def get_mean_phastcons(bedtool, phastcons_location, sample_size=1000):
    """
    Get mean phastcons scores for a random sample of intervals in a bedtool.

    bedtool - bedtool to extract data from
    phastcons_location - location of phastcons (bigWig) file
    sample_size - maximum number of intervals to sample

    Returns a list with one mean per sampled interval; zero-length results
    contribute 0 and intervals without any data are left out.
    """
    data = []
    # bigWig is a binary format, so the file must be opened with 'rb'.
    with open(phastcons_location, 'rb') as bw_file:
        bw = BigWigFile(bw_file)
        for bedline in bedtool.random_subset(min(len(bedtool), sample_size)):
            conservation_values = bw.get_as_array(
                bedline.chrom, bedline.start, bedline.stop)
            # Explicit None check instead of catching the TypeError raised
            # by len(None), which could also hide unrelated errors.
            if conservation_values is None:
                continue
            if len(conservation_values) > 0:
                data.append(np.mean(conservation_values))
            else:
                data.append(0)
    return data