def makeseq(chromlen, binlength, species):
    """Tile each chromosome into fixed-size bins and fetch their sequences.

    Parameters
    ----------
    chromlen : str
        Path to a whitespace-delimited file whose first two columns are the
        chromosome name and the chromosome length.  Blank lines are skipped.
    binlength : int or str
        Bin size; converted with ``int()``.
    species : str
        Genome identifier forwarded to ``cis.interval(genome=...)``.

    Returns
    -------
    The ``cis.interval`` object with ``chrom``/``start``/``end`` filled in,
    after ``getSequence()`` has been called on it.

    Raises
    ------
    ValueError
        If ``binlength`` is not a positive integer (a non-positive bin size
        would otherwise never advance and loop forever).

    Notes
    -----
    Every full bin is widened by 100 on each side (the start is clamped to 1,
    the end is *not* clamped to the chromosome length).  The final, partial
    bin of each chromosome is emitted without any padding.
    """
    binsize = int(binlength)
    if binsize <= 0:
        raise ValueError("binlength must be a positive integer, got %r" % (binlength,))

    x = cis.interval(genome=species)
    x.chrom = []
    x.start = []
    x.end = []

    with open(chromlen) as inf:
        for line in inf:
            fields = line.split()
            if not fields:
                continue
            chrom = fields[0]
            chrom_size = int(fields[1])

            start = 1
            end = binsize
            while end < chrom_size:
                x.chrom.append(chrom)
                x.start.append(max(1, start - 100))
                x.end.append(end + 100)
                start += binsize
                end += binsize

            # Remainder of the chromosome (unpadded).
            x.chrom.append(chrom)
            x.start.append(start)
            x.end.append(chrom_size)

    x.getSequence()
    return x
def makeseq(bedfile, species):
    """Load the intervals of a BED-like file and fetch their sequences.

    Parameters
    ----------
    bedfile : str
        Path to a whitespace-delimited file whose first three columns are
        chromosome, start and end.  Blank lines are skipped.
    species : str
        Genome identifier forwarded to ``cis.interval(genome=...)``.

    Returns
    -------
    The ``cis.interval`` object with ``chrom``/``start``/``end`` filled in,
    after ``getSequence()`` has been called on it.
    """
    x = cis.interval(genome=species)
    x.chrom = []
    x.start = []
    x.end = []

    with open(bedfile) as inf:
        for line in inf:
            fields = line.split()
            if not fields:
                continue
            x.chrom.append(fields[0])
            x.start.append(int(fields[1]))
            x.end.append(int(fields[2]))

    x.getSequence()
    return x
def getsignal(inputfile, outputfile, BGmatrix, pcut, ncut, pspan, fetch_length=100, gen='hg19'):
    """Write, per input site, its cut signal and hexamer background values.

    Parameters
    ----------
    inputfile : str
        Whitespace-delimited site file; columns 0, 1, 2 and 5 are read as
        chromosome, start, end and strand.
    outputfile : str
        Path of the tab-separated table to write.
    BGmatrix
        Passed to ``readBG``, which must return the ``(plus, minus)``
        hexamer lookup tables.
    pcut, ncut : str
        Paths of the plus- and minus-strand cut BigWig files.
    pspan : int
        Requested span; it is reduced by half the width of the *first*
        site in ``inputfile`` before being used.
    fetch_length : int
        Forwarded unchanged to ``make_cut``.
    gen : str
        Genome identifier forwarded to ``c.interval(genome=...)``.

    Each output row is ``chrom, start, end, strand`` followed by the two
    ``make_cut`` results and the two background lists.  For ``-`` strand
    sites the plus/minus cut results are swapped, and the background lists
    are swapped and reversed.  Sites are skipped when ``make_cut`` returns
    ``'NA'`` for either strand or when the fetched sequence contains ``N``.

    Raises
    ------
    IndexError
        If ``inputfile`` is empty or a record has too few columns.
    KeyError
        If a hexamer of a fetched sequence is missing from the tables.
    """
    with open(inputfile) as inf:
        records = [line.split() for line in inf]

    # The site width is taken from the first record only.
    first = records[0]
    site_width = int(first[2]) - int(first[1])
    # Floor division keeps pspan an integer on Python 2 and Python 3 alike.
    pspan = pspan - site_width // 2

    X = c.interval(genome=gen)
    X.chrom, X.start, X.end, X.val = [], [], [], []
    pBG, nBG = readBG(BGmatrix)
    for ll in records:
        X.chrom.append(ll[0])
        # Pad by pspan + 3 on both sides so every position has a hexamer.
        X.start.append(int(ll[1]) - pspan - 3 + 1)
        X.end.append(int(ll[2]) + pspan + 3 + 1)
        X.val.append(ll[5])
    X.getSequence()

    # The BigWig readers need their file handles open while queried.
    with open(pcut, 'rb') as pcut_fh, open(ncut, 'rb') as ncut_fh:
        pcutbw = BigWigFile(pcut_fh)
        ncutbw = BigWigFile(ncut_fh)
        with open(outputfile, 'w') as outf:
            for i, seq in enumerate(X.seq):
                pchrm = X.chrom[i]
                strand = X.val[i]
                # Undo the padding applied above to recover the site itself.
                pstart = X.start[i] - 1 + 3 + pspan
                pend = X.end[i] - 1 - 3 - pspan
                pll = [pchrm, pstart, pend, strand]

                pout = make_cut(pcutbw, pll, pspan, fetch_length)
                nout = make_cut(ncutbw, pll, pspan, fetch_length)
                if strand == "-":
                    pout, nout = nout, pout
                # Either strand may be unavailable; an 'NA' string cannot be
                # concatenated into the output row below.
                if pout == 'NA' or nout == 'NA':
                    continue

                seq_upper = seq.upper()
                if 'N' in seq_upper:
                    continue

                # Plus and minus hexamers are offset from each other by one base.
                pseq = seq_upper[:-1]
                nseq = seq_upper[1:]
                n_windows = len(pseq) + 1 - 6
                p = [pBG[pseq[k:k + 6]] for k in range(n_windows)]
                n = [nBG[nseq[k:k + 6]] for k in range(n_windows)]

                if strand != '-':
                    pbglist, nbglist = p, n
                else:
                    pbglist, nbglist = n[::-1], p[::-1]

                newll = [pchrm, pstart, pend, strand] + pout + nout + pbglist + nbglist
                outf.write("\t".join(map(str, newll)) + "\n")
def profile(inputfile, outputfile, pattern, strand):
    """Score a window around each site with a dinucleotide lookup table.

    Parameters
    ----------
    inputfile : str
        Whitespace-delimited file; column 0 is the chromosome and column 1
        the site position.  Blank lines are skipped.
    outputfile : str
        Path of the tab-separated score table to write.
    pattern
        4x4 table indexed as ``pattern[first][second]`` with bases ordered
        A, C, G, T (indices 0-3).
    strand : str
        ``"+"`` scores each adjacent base pair as read.  ``"-"`` scores the
        complemented pair in reverse order, starting one base later.  Any
        other value yields no scores.

    For each site the sequence from ``pos - 30`` to ``pos + 33`` is fetched
    from hg19.  Scoring of a sequence stops at the first base that is not
    A/C/G/T, and only sequences that produced exactly 61 scores are written.
    """
    plus_index = {"A": 0, "C": 1, "G": 2, "T": 3}
    # Index of the complementary base, used when scoring the minus strand.
    minus_index = {"A": 3, "C": 2, "G": 1, "T": 0}

    x = cis.interval(genome='hg19')
    x.chrom = []
    x.start = []
    x.end = []
    with open(inputfile) as inf:
        for line in inf:
            ll = line.split()
            if not ll:
                continue
            x.chrom.append(ll[0])
            x.start.append(int(ll[1]) - 30)
            x.end.append(int(ll[1]) + 2 + 31)
    x.getSequence()

    scores = []
    for i in range(len(x.start)):
        s = x.seq[i].upper()
        score = []
        if strand == "+":
            for j in range(len(s) - 2):
                first, second = s[j], s[j + 1]
                if first not in plus_index or second not in plus_index:
                    break
                score.append(pattern[plus_index[first]][plus_index[second]])
        elif strand == "-":
            for j in range(1, len(s) - 1):
                first, second = s[j + 1], s[j]
                if first not in minus_index or second not in minus_index:
                    break
                score.append(pattern[minus_index[first]][minus_index[second]])
        scores.append(score)

    with open(outputfile, 'w') as outf:
        for score in scores:
            # Truncated profiles (ambiguous base, short sequence) are dropped.
            if len(score) != 61:
                continue
            outf.write("\t".join(map(str, score)) + "\n")
def profile(inputfile, outputfile, mode, strand):
    """Count dinucleotide frequencies over the regions of a BED-like file.

    Parameters
    ----------
    inputfile : str
        Whitespace-delimited file with chromosome, start and end in the
        first three columns.  Blank lines are skipped.
    outputfile : str
        Path of the 4x4 tab-separated count matrix to write.
    mode : str
        ``"peak"`` fetches ``start .. end + 2``.  ``"cut"`` fetches a
        2-base window starting at ``start`` (``+`` strand) or ``end``
        (``-`` strand).
    strand : str
        ``"+"`` counts adjacent pairs as read; ``"-"`` counts them on the
        reverse complement.

    The output has one row per first base (A, C, G, T) and one column per
    second base (A, C, G, T).  Pairs containing any other character are
    ignored.

    Raises
    ------
    SystemExit
        With status 1, after printing a message, when ``mode`` or ``strand``
        is not one of the accepted values.  Both are validated before any
        file is opened.
    """
    import sys

    if mode not in ("peak", "cut"):
        print("mode wrong, only peak,cut availabe")
        sys.exit(1)
    if strand not in ("+", "-"):
        print("strand only + and - ")
        sys.exit(1)

    base_index = {"A": 0, "C": 1, "G": 2, "T": 3}
    complement = {"A": "T", "C": "G", "G": "C", "T": "A"}
    # counts[first_base][index of second_base]
    counts = {base: [0] * 4 for base in "ACGT"}

    x = cis.interval(genome='hg19')
    x.chrom = []
    x.start = []
    x.end = []
    with open(inputfile) as inf:
        for line in inf:
            ll = line.split()
            if not ll:
                continue
            x.chrom.append(ll[0])
            if mode == "peak":
                x.start.append(int(ll[1]))
                x.end.append(int(ll[2]) + 2)
            elif strand == "+":
                x.start.append(int(ll[1]))
                x.end.append(int(ll[1]) + 2)
            else:
                x.start.append(int(ll[2]))
                x.end.append(int(ll[2]) + 2)
    x.getSequence()

    for i in range(len(x.start)):
        s = x.seq[i].upper()
        if strand == "-":
            # Reverse complement; characters other than A/C/G/T are kept as-is
            # so that pairs containing them are still skipped below.
            s = "".join(complement.get(base, base) for base in reversed(s))
        for j in range(len(s) - 1):
            first, second = s[j], s[j + 1]
            if first in base_index and second in base_index:
                counts[first][base_index[second]] += 1

    with open(outputfile, 'w') as outf:
        for base in "ACGT":
            outf.write("\t".join(map(str, counts[base])) + "\n")
def read_file(fp, mintags=0, maxtags=10, maxlines=100, select=100):
    """Parse a motif/cut-count table into intervals and cut matrices.

    Parameters
    ----------
    fp : iterable of str
        Open file (or any iterable of lines).  Only lines starting with
        ``chr`` are parsed.  Lines are consumed lazily, so reading stops as
        soon as ``maxlines`` records have been accepted.
    mintags, maxtags : number
        A record is kept only if ``mintags <= n50p + n50m < maxtags``, where
        the two counts come from columns 6 and 7.
    maxlines : int
        Stop after this many accepted records.  The check runs right after a
        record is accepted, so values below 1 never trigger it.
    select : int
        Width of the central window kept from each cut profile.

    Returns
    -------
    tuple
        ``(X, cuts_pos, cuts_neg)``: the ``c.interval`` object (hg19) after
        ``getSequence()``, and two ``numpy`` arrays with the selected window
        of plus- and minus-strand cut values for each accepted record.

    Only ``+`` strand records are accepted.
    """
    # TODO read in more general format
    # Column layout (0-based).  An alternative layout without a strand column
    # would shift the counts to 5/6 and the cut columns to 13..413.
    CHR, START, NAME, SCORE, STRAND, COUNT50P, COUNT50M = 0, 1, 3, 4, 5, 6, 7
    CUT_START = 14
    CUT_END = 414

    X = c.interval(genome='hg19')
    X.chrom, X.start, X.end, X.strand, X.name, X.val = [], [], [], [], [], []
    cuts_pos = []
    cuts_neg = []

    # Floor division keeps coordinates integral on Python 2 and 3 alike.
    half_width = (CUT_END - CUT_START) // 2
    mid = int(0.5 * (CUT_END - CUT_START))
    startoffset = mid - int(0.5 * select)
    endoffset = startoffset + select

    k = 0
    for elem in fp:
        if elem[0:3] != 'chr':
            continue
        f = elem.split()
        start = int(f[START])
        seq = f[NAME]
        motifscore = float(f[SCORE])
        ntags = float(f[COUNT50P]) + float(f[COUNT50M])
        strand = f[STRAND]

        ## Shawn : only + motif included
        if not (strand == '+' and ntags >= mintags and ntags < maxtags):
            continue

        X.chrom.append(f[CHR])
        X.start.append(start - half_width + startoffset)
        X.end.append(start - half_width + endoffset)
        X.strand.append(strand)
        X.name.append(seq)
        X.val.append((motifscore, ntags))

        poscut = [float(z) for z in f[CUT_START:CUT_END]]
        # The minus-strand slice deliberately extends two columns further
        # than the plus-strand width (kept from the original layout).
        negcut = [float(z) for z in f[CUT_END:2 * CUT_END + 2 - CUT_START]]
        cuts_pos.append(poscut[startoffset:endoffset])
        cuts_neg.append(negcut[startoffset:endoffset])

        k += 1
        if k == maxlines:
            break

    X.getSequence()
    return X, numpy.array(cuts_pos), numpy.array(cuts_neg)
def getsignal(inputfile, outputfile, BGmatrix, pcut, ncut, pspan, fetch_length=100, gen='hg19'):
    """Plot the mean cut signal and mean hexamer background over all sites.

    Parameters
    ----------
    inputfile : str
        Whitespace-delimited site file; columns 0, 1, 2 and 5 are read as
        chromosome, start, end and strand.
    outputfile
        Forwarded to ``plot_template`` as its last argument.
    BGmatrix
        Passed to ``readBG``, which must return the ``(plus, minus)``
        hexamer lookup tables.
    pcut, ncut : str
        Paths of the plus- and minus-strand cut BigWig files.
    pspan : int
        Span forwarded to ``make_cut`` and used to pad the fetched sequence.
    fetch_length : int
        Forwarded unchanged to ``make_cut``.
    gen : str
        Genome identifier forwarded to ``c.interval(genome=...)``.

    Sites on chromosomes absent from the ``pcut`` BigWig index are skipped,
    as are sites for which ``make_cut`` returns ``'NA'`` on either strand.
    For ``-`` strand sites the plus/minus cut results are swapped, and the
    background lists are swapped and reversed.  Sequences containing ``N``
    contribute no background values.
    """
    bw_index = BwIO(pcut)
    chrom_len = {}
    for node in bw_index.chromosomeTree['nodes']:
        chrom_len[node['key']] = node['chromSize']

    pp = []
    pm = []
    X = c.interval(genome=gen)
    X.chrom, X.start, X.end, X.val = [], [], [], []
    pBG, nBG = readBG(BGmatrix)

    # The BigWig readers need their file handles open while queried.
    with open(pcut, 'rb') as pcut_fh, open(ncut, 'rb') as ncut_fh:
        pcutbw = BigWigFile(pcut_fh)
        ncutbw = BigWigFile(ncut_fh)
        with open(inputfile) as inf:
            for line in inf:
                ll = line.split()
                if ll[0] not in chrom_len:
                    continue
                pout = make_cut(pcutbw, ll, pspan, fetch_length)
                nout = make_cut(ncutbw, ll, pspan, fetch_length)
                if ll[5] == "-":
                    pout, nout = nout, pout
                # An 'NA' on either strand would corrupt the averaged matrix.
                if pout == 'NA' or nout == 'NA':
                    continue
                pp.append(pout)
                pm.append(nout)
                X.chrom.append(ll[0])
                # Pad by pspan + 3 on both sides so every position has a hexamer.
                X.start.append(int(ll[1]) - pspan - 3 + 1)
                X.end.append(int(ll[2]) + pspan + 3 + 1)
                X.val.append(ll[5])

    meanp = apply_mean(pp)
    meanm = apply_mean(pm)

    X.getSequence()
    pbglist = []
    nbglist = []
    for i, seq in enumerate(X.seq):
        strand = X.val[i]
        seq_upper = seq.upper()
        if 'N' in seq_upper:
            continue
        # Plus and minus hexamers are offset from each other by one base.
        pseq = seq_upper[:-1]
        nseq = seq_upper[1:]
        n_windows = len(pseq) + 1 - 6
        plus_bg = [pBG[pseq[k:k + 6]] for k in range(n_windows)]
        minus_bg = [nBG[nseq[k:k + 6]] for k in range(n_windows)]
        if strand != '-':
            pbglist.append(plus_bg)
            nbglist.append(minus_bg)
        else:
            pbglist.append(minus_bg[::-1])
            nbglist.append(plus_bg[::-1])

    meanpbglist = apply_mean(pbglist)
    meanmbglist = apply_mean(nbglist)
    plot_template(meanp, meanm, meanpbglist, meanmbglist, outputfile)
def make_template(data, flank, pflank, topmotif, out, pbw, mbw, bgmatrix, gen):
    """Build and plot the mean cut profile and hexamer background template.

    Parameters
    ----------
    data : str
        Whitespace-delimited motif file; columns 0, 1, 2, 4 and 5 are read
        as chromosome, start, end, score and strand.
    flank : int
        Half-width of the region summarised from the BigWig files and of the
        sequence fetched around each motif start.
    pflank : int
        Half-width of the window kept around the motif centre.
    topmotif
        Currently unused (the top-N cut-off is disabled).
    out
        Forwarded to ``plot_template`` as its last argument.
    pbw, mbw : str
        Paths of the plus- and minus-strand cut BigWig files.
    bgmatrix : str
        Path of a table with the hexamer in column 0 and its plus / minus
        background values in columns 1 and 2.
    gen : str
        Genome identifier forwarded to ``c.interval(genome=...)``.

    The motif width is taken from the first line of ``data``.  Cut profiles
    of ``-`` strand motifs are reversed and swapped between strands; the
    sequence background is computed from ``+`` strand motifs only, skipping
    sequences that contain ``N``/``n``.  The collected cut matrices are
    printed to stdout before averaging.

    Raises
    ------
    IndexError
        If ``data`` is empty or a record has too few columns.
    KeyError
        If a hexamer of a fetched sequence is missing from ``bgmatrix``.
    """
    with open(data) as inf:
        templatelist = [line.split() for line in inf]

    # Motif width comes from the first record, before sorting by score.
    first = templatelist[0]
    ml = int(first[2]) - int(first[1])
    # Floor division keeps slice bounds integral on Python 2 and 3 alike.
    centre = flank + 1 + ml // 2
    templatelist.sort(key=lambda rec: float(rec[4]), reverse=True)

    ### for cut sitepro
    plus_lo, plus_hi = centre - pflank, centre + pflank
    minus_lo, minus_hi = centre - 1 - ml - pflank, centre - 1 - ml + pflank
    pp = []
    pm = []
    # The BigWig readers need their file handles open while queried.
    with open(pbw, 'rb') as pbw_fh, open(mbw, 'rb') as mbw_fh:
        w_plus_H = BigWigFile(pbw_fh)
        w_minus_H = BigWigFile(mbw_fh)
        for ll in templatelist:
            region_start = int(ll[1]) - flank
            region_end = int(ll[1]) + flank
            p_sum = list(w_plus_H.summarize(ll[0], region_start, region_end, 2 * flank).sum_data)
            m_sum = list(w_minus_H.summarize(ll[0], region_start, region_end, 2 * flank).sum_data)
            if ll[5] == "+":
                pp.append(p_sum[plus_lo:plus_hi])
                pm.append(m_sum[plus_lo:plus_hi])
            if ll[5] == '-':
                # Minus-strand motifs: reverse the profiles and swap strands.
                pm.append(p_sum[::-1][minus_lo:minus_hi])
                pp.append(m_sum[::-1][minus_lo:minus_hi])
    print(pp)
    print(pm)

    P = list(apply_mean(pp))
    M = list(apply_mean(pm))

    ### for seqbias bg
    pBG = {}
    nBG = {}
    with open(bgmatrix) as inf:
        for line in inf:
            ll = line.split()
            pBG[ll[0]] = float(ll[1])
            nBG[ll[0]] = float(ll[2])

    X = c.interval(genome=gen)
    X.chrom, X.start, X.end, X.val = [], [], [], []
    for ll in templatelist:
        X.chrom.append(ll[0])
        X.start.append(int(ll[1]) + 1 - flank)
        X.end.append(int(ll[1]) + 1 + flank)
        X.val.append(ll[5])
    X.getSequence()

    pbglist = []
    nbglist = []
    for i, seq in enumerate(X.seq):
        strand = X.val[i]
        if strand != '+' or 'N' in seq or 'n' in seq:
            continue
        # Plus and minus hexamer windows are offset from each other by one base.
        pseq = seq[(centre - pflank - 3):(centre + pflank + 2)].upper()
        nseq = seq[(centre - pflank - 2):(centre + pflank + 3)].upper()
        n_windows = len(pseq) + 1 - 6
        pbglist.append([pBG[pseq[k:k + 6]] for k in range(n_windows)])
        nbglist.append([nBG[nseq[k:k + 6]] for k in range(n_windows)])

    Plusbg = list(apply_mean(pbglist))
    Minusbg = list(apply_mean(nbglist))
    plot_template(P, M, Plusbg, Minusbg, out)