def orforflistmatch(s_orf, orflist):
    """
    Which annotations associated with s_orf are significantly
    overrepresented in orflist?
    """
    if not _orfs_by_go: load_GO()
    norfs   = len(orflist)
    totorfs = len(_gos_by_orf.keys())
    categories = []
    for anno in annotations(s_orf):
        categories.append(anno.desc)
    sigs = []
    for category in categories:
        all_by_cat = annotation2orfs(category)
        nall       = len(all_by_cat)
        fracall    = float(nall) / float(totorfs)
        sub_by_cat = [x for x in orflist if x in all_by_cat]
        nsub       = len(sub_by_cat)
        if norfs == 0:
            fracsub = 0
        else:
            fracsub = float(nsub) / float(norfs)
        sig = Arith.hypgeomsummore(nall, totorfs, norfs, nsub)
        sigs.append((sig, category, fracall, fracsub, sub_by_cat))
        #if s_orf == SGD.gene2orf('NRG1'):
        #    print '@ NRG1 %5.2e %-40s %4d %4d %4d %4d ' % (
        #        sig, category, nall, totorfs, norfs, nsub)
    sigs.sort()
    #for sig in sigs: print sig
    return sigs
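# Hedged usage sketch (not part of the original module): scan the tuples
# returned by orforflistmatch() for enriched categories.  Assumes the GO
# tables are loadable via load_GO(); the ORF names are hypothetical.
def _example_orforflistmatch():
    cluster = ['YDR043C', 'YKL062W', 'YGL209W']      # hypothetical ORF list
    for sig, category, fracall, fracsub, members in \
            orforflistmatch('YDR043C', cluster):
        if sig < 0.05:                               # uncorrected p-value
            print '%8.2e  %-40s  %5.1f%% -> %5.1f%%' % (
                sig, category, fracall * 100, fracsub * 100)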
def tablemotifs(_motifs):
    file   = _motifs[0].file
    gifdir = 'logos_' + file + '.dir'
    motifs = [m.trimmed(0.25) for m in _motifs]
    cleanmotifs(motifs)
    top = getarg('top')
    if top: motifs = motifs[0:top]
    for m in motifs:
        if m.source:
            toks   = m.source.split()
            m.name = toks[0].split('_')[0]
            if (len(toks) > 2) and (toks[2] != '[]'):
                ids        = toks[2].split(',')
                m.matchids = ', '.join([x.split('-')[-1] for x in ids])
            else:
                m.matchids = ' '
        else:
            m.name     = ''
            m.matchids = ''
    for i in range(len(motifs)):
        j = i + 1
        motifs[i].i = j
        if getarg('GIF'):
            gifname = motifs[i].giflogo(id='%s.%s' % (file, i), title=' ',
                                        scale=0.6, info_str=' ')
            if (not os.path.exists(gifdir)): os.mkdir(gifdir)
            motifs[i].gifname = '%s/%s' % (gifdir, gifname)
            os.rename(gifname, motifs[i].gifname)
    txts = []
    for m in motifs:
        s = ''
        s = s + '<tr>'
        s = s + '<td>%s</td>' % (m.name)
        s = s + '<td>%d</td>' % (m.i)
        s = s + '<td>%s</td>' % (m.matchids)
        if getarg('GIF'):
            s = s + '<td><img src="%s"></td>' % (m.gifname)
        else:
            s = s + '<td>%s</td>' % (m.oneletter)
        #s = s + '<td>%5.2f</td>' % (m.i)
        try:
            s = s + '<td>%3d</td><td>%5.1f%%</td>' % (m.numprobes, m.frac * 100)
        except:
            s = s + '<td>n/a</td><td>n/a</td>'
        s = s + '<td>%5.2f</td>' % (Arith.nlog10(m.pvalue))
        try:
            s = s + '<td>%5.2f</td>' % (m.ROC_auc)
        except:
            s = s + '<td>%5.2f</td>' % (0.0)
        if getarg('MAP'):
            s = s + '<td>%6.2f</td>' % (m.MAP)
        if getarg('CONS'):
            s = s + '<td>%3.0f%% (%d)</td>' % (m.Cfrac * 100, m.numconsmatchbound)
            s = s + '<td>%d/%d -> %6.2f</td>' % (m.inC, m.inC + m.inU, m.CSimproved)
        if getarg('PROGRAMS'):
            s = s + '<td>%s</td>' % ("<br>".join(m.programs))
        s = s + "</tr>\n"
        txts.append(s)
    print ''.join(txts)
    sys.stdout.flush()
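# Minimal usage sketch (an assumption based on the HTML tablemotifs() emits:
# it prints only <tr> rows, so the caller supplies the surrounding <table>
# and a header matching the flags passed via getarg).  The header below is
# illustrative, for a run without GIF/MAP/CONS/PROGRAMS.
def _example_tablemotifs(motifs):
    print '<table border="1">'
    print ('<tr><th>Name</th><th>#</th><th>Matches</th><th>Motif</th>'
           '<th>Probes</th><th>%</th><th>-log10(p)</th><th>ROC AUC</th></tr>')
    tablemotifs(motifs)
    print '</table>'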
def probOvlpBinomial(A, B, thresh=0.7, verbose=None):
    if A.width >= B.width:
        Wide, Narrow = A, B
    else:
        Wide, Narrow = B, A
    RC = MotifTools.revcomplement
    newWide = Wide[-1, Wide.width + 1]
    if Wide.__dict__.has_key('bestWide'):
        bestWide = Wide.bestWide
    else:
        bestWideD = {}
        for x in newWide.bestseqs(thresh * newWide.maxscore):
            bestWideD[x[1]] = 1
        for x in bestWideD.keys():
            bestWideD[RC(x)] = 1
        Wide.bestWide = bestWideD.keys()
        bestWide = Wide.bestWide
    Wide = newWide
    D = {}
    for i in range(len(bestWide)):
        D[i] = bestWide[i]
    P = ProbeSet(genome=D)
    matchNarrow = P.count_matching_probes(Narrow, thresh=thresh)
    if matchNarrow == 0:
        p = 1.0
        return p
    if not Narrow.__dict__.has_key('probNarrow'):
        Narrow.probNarrow = {}
    if Narrow.probNarrow.has_key(Wide.width):
        probNarrow = Narrow.probNarrow[Wide.width]
    else:
        probNarrow = estimate_frequency(Narrow, Wide.width, thresh=thresh)
        Narrow.probNarrow[Wide.width] = probNarrow
    p = Arith.binomialsumtail(probNarrow, len(bestWide), matchNarrow)
    print '\nD= %7.3f %9.4e %8d %7d %-14s %-20s %-14s %-20s' % (
        Arith.nlog10(p), probNarrow, len(bestWide), matchNarrow,
        A.family, A, B.family, B)
    return p
def ace2tamo(filename, tamoname):
    global probefile, PROBESET
    if   re.search('\.ace$',  filename):
        mdobject = AlignAce.AlignAce(filename)
    elif re.search('\.meme$', filename):
        mdobject = Meme.Meme(filename)
    fsaname = find_fsa(mdobject.fastafile)
    fsaD    = MotifMetrics.fasta2seqs(fsaname, 'want_dict')
    probes  = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('HUMAN_250')
        #PROBESET = pick_genome(fsaname)
    for key, seq in fsaD.items():
        PROBESET.probes[key] = seq
    # Fill in any statistics the parser did not already supply
    for motif in mdobject.motifs:
        if motif.pvalue  == 1:    motif.pvalue  = PROBESET.p_value(motif, probes, 'v')
        if motif.church  == 1:    motif.church  = PROBESET.church(motif, probes, 'v')
        if motif.E_site  == None: motif.E_site  = PROBESET.E_sitef(motif, probes, 3, 'v')
        #if motif.E_chi2 == None: motif.E_chi2  = PROBESET.E_chi2(motif, probes, None, 'v')
        if motif.E_seq   == None: motif.E_seq   = PROBESET.E_seq(motif, probes, 'v')
        if motif.ROC_auc == None: motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        if motif.MNCP    == None: motif.MNCP    = PROBESET.MNCP(motif, probes, 'v')
        if re.search('\.meme$', filename):
            motif.MAP = -math.log(motif.evalue) / math.log(10)
        sys.stdout.flush()
    i = 0
    for motif in mdobject.motifs:
        motif.seednum = i
        i = i + 1
        kmers = motif.bogus_kmers(100)
        motif.maxscore = -100
        scores = [motif.scan(kmer)[2][0] for kmer in kmers]
        print Arith.avestd(scores)
    # Rank by the statistic appropriate to the source program
    if re.search('\.meme$', filename):
        mdobject.motifs.sort(lambda x, y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x, y: cmp(x.church, y.church))
    MotifTools.save_motifs(mdobject.motifs, tamoname)
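# Hedged usage sketch: converting motif-discovery output to a TAMO motif
# file.  ace2tamo() picks the parser from the extension; the filenames here
# are hypothetical placeholders.
#
#   ace2tamo('run1.ace',  'run1.tamo')   # AlignAce results
#   ace2tamo('run1.meme', 'run1.tamo')   # MEME results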
def orflist2categories_long(orflist, thresh=0.05):
    """
    Which categories are overrepresented among orflist?
    Thresh is applied after Bonferroni correction.

    Returns: Sorted list of tuples,
             [(significance, category, fracall, fracsub, orflist), ...]
    """
    if not _orfs_by_go: load_GO()
    norfs   = len(orflist)
    totorfs = len(_gos_by_orf.keys())
    #Determine which categories are described by the set
    categories = []
    for orf in orflist:
        for anno in annotations(orf):
            if anno.desc not in categories:
                categories.append(anno.desc)
    totcats = float(len(categories))
    sigs = []
    for category in categories:
        all_by_cat = annotation2orfs(category)
        nall       = len(all_by_cat)
        fracall    = float(nall) / float(totorfs)
        sub_by_cat = [x for x in orflist if x in all_by_cat]
        nsub       = len(sub_by_cat)
        fracsub    = float(nsub) / float(norfs)
        sig = Arith.hypgeomsummore(nall, totorfs, norfs, nsub) * totcats
        #print category, sig, nall, totorfs, norfs, nsub, ' ', totcats
        sigs.append((sig, category, fracall, fracsub, sub_by_cat))
    sigs.sort()
    ans = []
    for sigdata in sigs:
        sig, category, fracall, fracsub, orfs = sigdata
        if sig > thresh: continue
        ans.append(sigdata)
    return ans
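# Hedged usage sketch: Bonferroni-corrected GO enrichment for a gene list.
# The ORF names are hypothetical placeholders.
def _example_orflist2categories_long():
    cluster = ['YAL038W', 'YGR192C', 'YJL052W']      # hypothetical ORF list
    hits = orflist2categories_long(cluster, thresh=0.01)
    for sig, category, fracall, fracsub, members in hits:
        print '%8.2e  %-40s  %d ORFs' % (sig, category, len(members))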
def NLBPO(A, B, thresh):
    p = probOvlpBinomial(A, B, thresh)
    return Arith.nlog10(p)
def NLPO(A, B, thresh):
    p = probOvlp(A, B, thresh)
    return Arith.nlog10(p)
def probOvlp(A, B, thresh=0.7, verbose=None):
    if A.width >= B.width:
        Wide, Narrow = A, B
    else:
        Wide, Narrow = B, A
    RC = MotifTools.revcomplement
    newWide = Wide[-1, Wide.width + 1]
    if Wide.__dict__.has_key('bestWide'):
        bestWide = Wide.bestWide
    else:
        bestWideD = {}
        for x in newWide.bestseqs(thresh * newWide.maxscore):
            bestWideD[x] = 1
        for x in bestWideD.keys():
            bestWideD[RC(x)] = 1
        Wide.bestWide = bestWideD.keys()
        bestWide = Wide.bestWide
    Wide = newWide
    if Narrow.__dict__.has_key('bestNarrow'):
        bestNarrow = Narrow.bestNarrow
    else:
        bestNarrowD = {}
        for x in Narrow.bestseqs(thresh * Narrow.maxscore):
            bestNarrowD[x] = 1
        for x in bestNarrowD.keys():
            bestNarrowD[RC(x)] = 1
        bestNarrow = bestNarrowD.keys()
        Narrow.bestNarrow = bestNarrow
    #bestWide   = [x[1] for x in Wide.bestseqs  (thresh*Wide.maxscore)  ]
    #bestNarrow = [x[1] for x in Narrow.bestseqs(thresh*Narrow.maxscore)]
    countNarrow = len(bestNarrow)
    countWide   = len(bestWide)
    numtotal    = math.pow(4, Wide.width)
    fudgefactor = math.pow(4, Wide.width - Narrow.width)
    bestWideTups = [(x, MotifTools.revcomplement(x)) for x in bestWide]
    countBoth = 0
    for i in range(len(bestNarrow)):
        m_narrow = bestNarrow[i]
        delj = []
        for j in range(len(bestWideTups)):
            if (bestWideTups[j][0].find(m_narrow) >= 0) or \
               (bestWideTups[j][1].find(m_narrow) >= 0):
                countBoth += 1
                delj.append(j)
        delj.reverse()   #Chew in from the back
        for j in delj:
            del bestWideTups[j]
    if verbose:
        print '%10d %10d %10d %10d | %10d %5d ' % (
            countWide, numtotal, countNarrow * fudgefactor, countBoth,
            countNarrow, Wide.width - Narrow.width),
    p = Arith.hypgeomsummore(countWide,                  #Num Interesting
                             numtotal,                   #All k-mers
                             countNarrow * fudgefactor,  #Number picked
                             countBoth)                  #Number found
    return p
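# Illustrative note (an inference from the code above, not original
# documentation): probOvlp() frames motif overlap as a hypergeometric draw.
# Among all 4^w k-mers at the wide motif's width w, the wide motif's best
# matches are the "interesting" set; the narrow motif's best matches,
# scaled up by 4^(w_wide - w_narrow), are the "drawn" set; and countBoth is
# how many drawn k-mers turned out to be interesting.  A and B are assumed
# to be MotifTools.Motif instances with .width set.
#
#   p = probOvlp(A, B, thresh=0.7, verbose=1)
#   print 'similarity as -log10(p): %6.2f' % Arith.nlog10(p)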
def conservation_pvalue(nmer, IDs, fsaDict, ConsDict, num_alignments):
    width = len(nmer)
    total       = []
    unconserved = []
    thetas      = []
    # Estimate the background rate of unconserved positions per alignment
    for i in range(num_alignments - 1):
        total.append(0)
        unconserved.append(0)
        tot_positions   = 0
        tot_unconserved = 0
        for ID in IDs:
            try:    cons = ConsDict[ID]
            except: continue
            if (cons[i] == []): continue
            try:
                tot_positions   = tot_positions   + len(cons[i])
                tot_unconserved = tot_unconserved + cons[i].count(1)
            except:
                print "cons: %s" % cons
        try:    thetas.append(float(tot_unconserved) / tot_positions)
        except: thetas.append(1.0)
    #print thetas
    # Tally conservation at every match to nmer (motif or ambiguity string)
    for ID in IDs:
        seq   = fsaDict[ID]
        seqrc = ConvergeMotifTools.revcomplement(seq)
        try:    cons = ConsDict[ID]
        except: continue
        if (cons == []): continue
        hits  = []
        hitsr = []
        if (type(nmer) == type(ConvergeMotifTools.Motif())):
            hits = cluster.matches_old(nmer, seq, 0.7)
            #print len(hits)
        else:
            site_re = re.compile(AmbigToRegExp(nmer))
            hit  = site_re.search(seq)
            hitr = site_re.search(seqrc)
            while (hit != None):
                hits.append(hit.start())
                hit = site_re.search(seq, hit.end())
            while (hitr != None):
                if (hits.count(len(seq) - hitr.end() - 1) == 0):
                    hits.append(len(seq) - hitr.end() - 1)
                hitr = site_re.search(seqrc, hitr.end())
        for hit in hits:
            for i in range(width):
                for j in range(num_alignments - 1):
                    if (cons[j] != []):
                        total[j]       = total[j] + 1
                        unconserved[j] = unconserved[j] + cons[j][hit + i]
    # Normal approximation: compare the observed unconserved count at motif
    # sites to what the background rates predict
    mean = 0
    var  = 0
    uc   = 0
    for i in range(num_alignments - 1):
        m = total[i] * thetas[i]
        #nq = total[i]*(1-thetas[i])
        #if (min(nq,m)<5): return(0.5)
        mean = mean + m
        var  = var + total[i] * thetas[i] * (1 - thetas[i])
        uc   = uc + unconserved[i]
    stdev = math.sqrt(var)
    if (stdev > 0):
        Z = ((uc + 0.5) - mean) / stdev   # 0.5 = continuity correction
    else:
        Z = 0
    if (Z >= 0): p = Arith.lzprob(Z)
    else:        p = 1.0 - Arith.lzprob(-Z)
    return p
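# Sketch of the input shapes conservation_pvalue() appears to expect (an
# inference from the code above, not original documentation; the IDs and
# sequences are hypothetical):
#
#   fsaDict  = {'YBR020W': 'ACGTACGT...'}        # ID -> promoter sequence
#   ConsDict = {'YBR020W': [[0,1,0, ...], ...]}  # ID -> one 0/1 list per
#                                                #  alignment; 1 = position
#                                                #  unconserved
#   p = conservation_pvalue('TGACTCA', fsaDict.keys(),
#                           fsaDict, ConsDict, num_alignments=4)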
def WMWtest(A, B):
    """
    WMWtest(A,B) -- Computes the Wilcoxon-Mann-Whitney nonparametric
    W statistic for two distributions

    input:  list of numbers, list of numbers
    output: p-value, W-statistic
    """
    A.sort()
    B.sort()
    TotalList = A + B
    TotalList.sort()
    nA = len(A)
    nB = len(B)
    N  = nA + nB
    MaxSum = N * (N + 1) / 2.0
    H0     = MaxSum / 2.0
    ## Replace values by ranks (ties get the mean of their ranks)
    previous = []
    start = 0
    Total_rank = TotalList[:]
    for i in range(len(TotalList)):
        if (TotalList[i] == previous):
            mean_rank = (start + i + 2) / 2.0
            for j in range(start, i + 1):
                Total_rank[j] = mean_rank
        else:
            Total_rank[i] = i + 1
            previous = TotalList[i]
            start = i
    ## Determine the shortest list
    if nA < nB: shortest = A
    else:       shortest = B
    nShortest = len(shortest)
    ## Sum the ranks in the shortest list
    W = 0
    for Value in shortest:
        i = 0
        while (i < len(TotalList) and Value != TotalList[i]):
            i += 1
        W += Total_rank[i]
    ## Use the smallest value of W
    if (W > H0): W = MaxSum - W
    ## Determine the two-tailed level of significance
    p = 0
    ## First calculate the Normal approximation.  This can be used to
    ## check whether a significant result is plausible for larger N.
    Permutations = k_out_n(nA, N)
    if (Permutations >= 25000) or (nShortest > 10):
        if W >= H0: Continuity = -0.5
        else:       Continuity =  0.5
        Z = (W + Continuity - nShortest * (N + 1.0) / 2.0) / \
            sqrt(nA * nB * (N + 1) / 12.0)
        Z = fabs(Z)
        p = 2 * (1 - Arith.lzprob(Z))
    ## The exact level of significance.  For large N, first check whether a
    ## significant result is plausible, i.e., the Normal approximation gives
    ## a p < 0.25.
    if (nShortest + 1 < 10) and (p < 0.25) and (Permutations < 60000):
        # Remember that W must be SMALLER than MaxSum/2 = H0
        Less = CountSmallerRanks(W, 0, len(shortest) - 1, 0, Total_rank)
        # If Less < Permutations/2, we have obviously calculated the
        # wrong way.  We should have calculated UPWARD (higher than W).
        # We can't do that, but we can calculate Less for W-1 and
        # subtract it from Permutations.
        if (2 * Less > Permutations):
            Less = CountSmallerRanks(W - 1, 0, len(shortest) - 1, 0, Total_rank)
            Less = Permutations - Less
        SumFrequencies = Permutations
        p = 2.0 * Less / SumFrequencies
    return p, W
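# Hedged usage sketch for WMWtest().  Note it sorts its arguments in place,
# so pass copies if the original ordering matters.
def _example_WMWtest():
    A = [1.2, 2.2, 3.4, 4.1, 5.0]
    B = [2.0, 5.9, 6.5, 7.1, 8.3]
    p, W = WMWtest(A[:], B[:])
    print 'W = %.1f   two-tailed p = %.4f' % (W, p)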