def conservation_pvalue(nmer, IDs, fsaDict, ConsDict, num_alignments):
    """
    conservation_pvalue(nmer, IDs, fsaDict, ConsDict, num_alignments) --
    Normal-approximation score for how many positions under the
    occurrences of nmer carry a 1 in the conservation tracks, compared
    with the per-track background rate.

    input:  nmer           -- either a ConvergeMotifTools.Motif (sites are
                              found with cluster.matches_old at 0.7) or a
                              pattern string that AmbigToRegExp converts to
                              a regular expression; len(nmer) is the site
                              width
            IDs            -- keys to look up in fsaDict and ConsDict
            fsaDict        -- maps ID -> sequence; every ID must be present
            ConsDict       -- maps ID -> list of per-alignment tracks (one
                              value per sequence position; entries equal
                              to 1 are counted as "unconserved").  IDs that
                              are missing, or whose entry is [], are skipped
            num_alignments -- only the first num_alignments-1 tracks are used
    output: p, computed as Arith.lzprob(Z) for Z >= 0 and
            1.0 - Arith.lzprob(-Z) otherwise, where Z is the
            continuity-corrected z-score of the observed count of 1s under
            the sites.  (Presumably lzprob is the standard normal CDF --
            verify against Arith.)
    """
    width = len(nmer)
    n_tracks = num_alignments - 1
    total = [0] * n_tracks          # positions covered by sites, per track
    unconserved = [0] * n_tracks    # sum of track values under sites
    thetas = []                     # background fraction of 1s, per track

    # Background rate for each track, pooled over all usable IDs.
    for i in range(n_tracks):
        tot_positions = 0
        tot_unconserved = 0
        for ID in IDs:
            try:
                cons = ConsDict[ID]
            except KeyError:
                continue
            # An empty entry is skipped here exactly as it is in the site
            # loop below; indexing it would raise IndexError.
            if cons == [] or cons[i] == []:
                continue
            try:
                tot_positions = tot_positions + len(cons[i])
                tot_unconserved = tot_unconserved + cons[i].count(1)
            except Exception:
                # Best-effort: report the malformed entry and carry on.
                print("cons: %s" % cons)
        if tot_positions:
            thetas.append(float(tot_unconserved) / tot_positions)
        else:
            # No usable positions for this track.
            thetas.append(1.0)

    # Locate the sites in every sequence and accumulate the track values
    # found underneath them.
    for ID in IDs:
        seq = fsaDict[ID]
        try:
            cons = ConsDict[ID]
        except KeyError:
            continue
        if cons == []:
            continue
        hits = []
        if type(nmer) == type(ConvergeMotifTools.Motif()):
            hits = cluster.matches_old(nmer, seq, 0.7)
        else:
            seqrc = ConvergeMotifTools.revcomplement(seq)
            site_re = re.compile(AmbigToRegExp(nmer))
            hit = site_re.search(seq)
            while hit is not None:
                hits.append(hit.start())
                hit = site_re.search(seq, hit.end())
            hitr = site_re.search(seqrc)
            while hitr is not None:
                # NOTE(review): a match ending at e on the reverse
                # complement would start at len(seq) - e on the forward
                # strand; the extra -1 is kept from the original and may
                # be an off-by-one (it can produce -1, which then indexes
                # from the end of the track) -- confirm.
                fwd_start = len(seq) - hitr.end() - 1
                if fwd_start not in hits:
                    hits.append(fwd_start)
                hitr = site_re.search(seqrc, hitr.end())
        for hit in hits:
            for i in range(width):
                for j in range(n_tracks):
                    if cons[j] != []:
                        total[j] = total[j] + 1
                        unconserved[j] = unconserved[j] + cons[j][hit + i]

    # Sum of independent binomials: compare the observed count with its
    # expected mean and variance.
    mean = 0
    var = 0
    uc = 0
    for i in range(n_tracks):
        mean = mean + total[i] * thetas[i]
        var = var + total[i] * thetas[i] * (1 - thetas[i])
        uc = uc + unconserved[i]
    stdev = math.sqrt(var)
    if stdev > 0:
        Z = ((uc + 0.5) - mean) / stdev   # +0.5 is a continuity correction
    else:
        Z = 0
    if Z >= 0:
        p = Arith.lzprob(Z)
    else:
        p = 1.0 - Arith.lzprob(-Z)
    return p
def conservation_pvalue(nmer, IDs, fsaDict, ConsDict, num_alignments):
    """
    conservation_pvalue(nmer, IDs, fsaDict, ConsDict, num_alignments) --
    Normal-approximation score for how many positions under the
    occurrences of nmer carry a 1 in the conservation tracks, compared
    with the per-track background rate.

    input:  nmer           -- either a ConvergeMotifTools.Motif (sites are
                              found with cluster.matches_old at 0.7) or a
                              pattern string that AmbigToRegExp converts to
                              a regular expression; len(nmer) is the site
                              width
            IDs            -- keys to look up in fsaDict and ConsDict
            fsaDict        -- maps ID -> sequence; every ID must be present
            ConsDict       -- maps ID -> list of per-alignment tracks (one
                              value per sequence position; entries equal
                              to 1 are counted as "unconserved").  IDs that
                              are missing, or whose entry is [], are skipped
            num_alignments -- only the first num_alignments-1 tracks are used
    output: p, computed as Arith.lzprob(Z) for Z >= 0 and
            1.0 - Arith.lzprob(-Z) otherwise, where Z is the
            continuity-corrected z-score of the observed count of 1s under
            the sites.  (Presumably lzprob is the standard normal CDF --
            verify against Arith.)
    """
    width = len(nmer)
    n_tracks = num_alignments - 1
    total = [0] * n_tracks          # positions covered by sites, per track
    unconserved = [0] * n_tracks    # sum of track values under sites
    thetas = []                     # background fraction of 1s, per track

    # Background rate for each track, pooled over all usable IDs.
    for i in range(n_tracks):
        tot_positions = 0
        tot_unconserved = 0
        for ID in IDs:
            try:
                cons = ConsDict[ID]
            except KeyError:
                continue
            # An empty entry is skipped here exactly as it is in the site
            # loop below; indexing it would raise IndexError.
            if cons == [] or cons[i] == []:
                continue
            try:
                tot_positions = tot_positions + len(cons[i])
                tot_unconserved = tot_unconserved + cons[i].count(1)
            except Exception:
                # Best-effort: report the malformed entry and carry on.
                print("cons: %s" % cons)
        if tot_positions:
            thetas.append(float(tot_unconserved) / tot_positions)
        else:
            # No usable positions for this track.
            thetas.append(1.0)

    # Locate the sites in every sequence and accumulate the track values
    # found underneath them.
    for ID in IDs:
        seq = fsaDict[ID]
        try:
            cons = ConsDict[ID]
        except KeyError:
            continue
        if cons == []:
            continue
        hits = []
        if type(nmer) == type(ConvergeMotifTools.Motif()):
            hits = cluster.matches_old(nmer, seq, 0.7)
        else:
            seqrc = ConvergeMotifTools.revcomplement(seq)
            site_re = re.compile(AmbigToRegExp(nmer))
            hit = site_re.search(seq)
            while hit is not None:
                hits.append(hit.start())
                hit = site_re.search(seq, hit.end())
            hitr = site_re.search(seqrc)
            while hitr is not None:
                # NOTE(review): a match ending at e on the reverse
                # complement would start at len(seq) - e on the forward
                # strand; the extra -1 is kept from the original and may
                # be an off-by-one (it can produce -1, which then indexes
                # from the end of the track) -- confirm.
                fwd_start = len(seq) - hitr.end() - 1
                if fwd_start not in hits:
                    hits.append(fwd_start)
                hitr = site_re.search(seqrc, hitr.end())
        for hit in hits:
            for i in range(width):
                for j in range(n_tracks):
                    if cons[j] != []:
                        total[j] = total[j] + 1
                        unconserved[j] = unconserved[j] + cons[j][hit + i]

    # Sum of independent binomials: compare the observed count with its
    # expected mean and variance.
    mean = 0
    var = 0
    uc = 0
    for i in range(n_tracks):
        mean = mean + total[i] * thetas[i]
        var = var + total[i] * thetas[i] * (1 - thetas[i])
        uc = uc + unconserved[i]
    stdev = math.sqrt(var)
    if stdev > 0:
        Z = ((uc + 0.5) - mean) / stdev   # +0.5 is a continuity correction
    else:
        Z = 0
    if Z >= 0:
        p = Arith.lzprob(Z)
    else:
        p = 1.0 - Arith.lzprob(-Z)
    return p
def WMWtest(A, B):
    """
    WMWtest(A,B) -- Computes the Wilcoxon-Mann-Whitney nonparametric W statistic for two distributions

    input:  list of numbers, list of numbers
    output: p-value, W-statistic

    The inputs are sorted into private copies, so the caller's lists are
    left untouched.  W is the rank sum of the shorter list (B when the
    lengths are equal), replaced by MaxSum - W whenever it exceeds half of
    the total rank sum.  The two-tailed p-value comes from the normal
    approximation when there are at least 25000 permutations or the
    shorter list has more than 10 items, and is then replaced by the exact
    count from CountSmallerRanks when the shorter list has fewer than 9
    items, the current p is below 0.25 and there are fewer than 60000
    permutations.
    """
    A = sorted(A)
    B = sorted(B)
    TotalList = sorted(A + B)
    nA = len(A)
    nB = len(B)
    N = nA + nB
    MaxSum = N * (N + 1) / 2.0
    H0 = MaxSum / 2.0

    ## Replace values by ranks (tied values share their mean rank)
    previous = []          # sentinel that never compares equal to a number
    start = 0              # index where the current run of ties began
    Total_rank = TotalList[:]
    for i, value in enumerate(TotalList):
        if value == previous:
            # Ranks start+1 .. i+1 are tied; give them all their mean.
            mean_rank = (start + i + 2) / 2.0
            for j in range(start, i + 1):
                Total_rank[j] = mean_rank
        else:
            Total_rank[i] = i + 1
            previous = value
            start = i

    ## Determine the shortest list
    if nA < nB:
        shortest = A
    else:
        shortest = B
    nShortest = len(shortest)

    ## Sum the ranks in the shortest list.  The first occurrence of a
    ## value in TotalList already carries the (mean) rank of that value.
    W = 0
    for Value in shortest:
        W += Total_rank[TotalList.index(Value)]

    ## Use the smallest value of W
    if W > H0:
        W = MaxSum - W

    ## Determine the two-tailed level of significance
    p = 0

    ## First calculate the Normal approximation. This can be used to
    ## check whether a significant result is plausible for larger N.
    # k_out_n is presumably the binomial coefficient "nA out of N" --
    # verify against its definition.
    Permutations = k_out_n(nA, N)
    if (Permutations >= 25000) or (nShortest > 10):
        if W >= H0:
            Continuity = -0.5
        else:
            Continuity = 0.5
        Z = (W + Continuity - nShortest * (N + 1.0) / 2.0) / sqrt(nA * nB * (N + 1) / 12.0)
        Z = fabs(Z)
        p = 2 * (1 - Arith.lzprob(Z))

    ## The exact level of significance, for large N, first check whether a
    ## significant result is plausible, i.e., the Normal Approximation gives
    ## a p < 0.25.
    if (nShortest + 1 < 10) and (p < 0.25) and (Permutations < 60000):
        # Remember that W must be SMALLER than MaxSum/2 = H0
        Less = CountSmallerRanks(W, 0, nShortest - 1, 0, Total_rank)
        # If 2*Less > Permutations, we have obviously calculated the
        # wrong way. We should have calculated UPWARD (higher than W).
        # We can't do that, but we can calculate Less for W-1 and
        # subtract it from Permutations.
        if 2 * Less > Permutations:
            Less = CountSmallerRanks(W - 1, 0, nShortest - 1, 0, Total_rank)
            Less = Permutations - Less
        p = 2.0 * Less / Permutations

    return p, W
def WMWtest(A, B):
    """
    WMWtest(A,B) -- Computes the Wilcoxon-Mann-Whitney nonparametric W statistic for two distributions

    input:  list of numbers, list of numbers
    output: p-value, W-statistic

    The inputs are sorted into private copies, so the caller's lists are
    left untouched.  W is the rank sum of the shorter list (B when the
    lengths are equal), replaced by MaxSum - W whenever it exceeds half of
    the total rank sum.  The two-tailed p-value comes from the normal
    approximation when there are at least 25000 permutations or the
    shorter list has more than 10 items, and is then replaced by the exact
    count from CountSmallerRanks when the shorter list has fewer than 9
    items, the current p is below 0.25 and there are fewer than 60000
    permutations.
    """
    A = sorted(A)
    B = sorted(B)
    TotalList = sorted(A + B)
    nA = len(A)
    nB = len(B)
    N = nA + nB
    MaxSum = N * (N + 1) / 2.0
    H0 = MaxSum / 2.0

    ## Replace values by ranks (tied values share their mean rank)
    previous = []          # sentinel that never compares equal to a number
    start = 0              # index where the current run of ties began
    Total_rank = TotalList[:]
    for i, value in enumerate(TotalList):
        if value == previous:
            # Ranks start+1 .. i+1 are tied; give them all their mean.
            mean_rank = (start + i + 2) / 2.0
            for j in range(start, i + 1):
                Total_rank[j] = mean_rank
        else:
            Total_rank[i] = i + 1
            previous = value
            start = i

    ## Determine the shortest list
    if nA < nB:
        shortest = A
    else:
        shortest = B
    nShortest = len(shortest)

    ## Sum the ranks in the shortest list.  The first occurrence of a
    ## value in TotalList already carries the (mean) rank of that value.
    W = 0
    for Value in shortest:
        W += Total_rank[TotalList.index(Value)]

    ## Use the smallest value of W
    if W > H0:
        W = MaxSum - W

    ## Determine the two-tailed level of significance
    p = 0

    ## First calculate the Normal approximation. This can be used to
    ## check whether a significant result is plausible for larger N.
    # k_out_n is presumably the binomial coefficient "nA out of N" --
    # verify against its definition.
    Permutations = k_out_n(nA, N)
    if (Permutations >= 25000) or (nShortest > 10):
        if W >= H0:
            Continuity = -0.5
        else:
            Continuity = 0.5
        Z = (W + Continuity - nShortest * (N + 1.0) / 2.0) / sqrt(nA * nB * (N + 1) / 12.0)
        Z = fabs(Z)
        p = 2 * (1 - Arith.lzprob(Z))

    ## The exact level of significance, for large N, first check whether a
    ## significant result is plausible, i.e., the Normal Approximation gives
    ## a p < 0.25.
    if (nShortest + 1 < 10) and (p < 0.25) and (Permutations < 60000):
        # Remember that W must be SMALLER than MaxSum/2 = H0
        Less = CountSmallerRanks(W, 0, nShortest - 1, 0, Total_rank)
        # If 2*Less > Permutations, we have obviously calculated the
        # wrong way. We should have calculated UPWARD (higher than W).
        # We can't do that, but we can calculate Less for W-1 and
        # subtract it from Permutations.
        if 2 * Less > Permutations:
            Less = CountSmallerRanks(W - 1, 0, nShortest - 1, 0, Total_rank)
            Less = Permutations - Less
        p = 2.0 * Less / Permutations

    return p, W