import numpy as np
from scipy import stats

# Legacy SciPy shipped stats.zprob(z), the area under the standard normal
# curve to the left of z.  Modern SciPy spells this stats.norm.cdf, so shim
# it in where it is missing.
if not hasattr(stats, 'zprob'):
    stats.zprob = stats.norm.cdf


def prob(sample, reference):
    """Probability mass of the reference distribution that falls inside the
    confidence band around the mean FPKM of `sample` (a pandas Series of
    *_FPKM columns, optionally with *_conf_lo / *_conf_hi bounds)."""
    sample_mean = 0
    sample_var_lo = 1
    sample_var_hi = 1
    n_samples = 0
    for col in sample.index:
        if col.endswith('FPKM') or '_' not in col:
            sample_mean += sample[col]
            n_samples += 1
            # NB: str.strip('_FPKM') would strip *characters* from both
            # ends, not the suffix, so slice the suffix off instead.
            col_base = col[:-len('_FPKM')] if col.endswith('_FPKM') else col
            if col_base + '_conf_lo' in sample.index:
                D_lo = sample[col] - sample[col_base + '_conf_lo']
                D_hi = sample[col_base + '_conf_hi'] - sample[col]
                sample_var_lo += D_lo ** 2
                sample_var_hi += D_hi ** 2
            else:
                sample_var_lo += sample[col]
                sample_var_hi += sample[col]
    sample_mean /= n_samples
    lo = sample_mean - np.sqrt(sample_var_lo)
    hi = sample_mean + np.sqrt(sample_var_hi)
    # the +10 floor on the standard deviation keeps near-constant reference
    # rows from blowing up the z-scores
    lo_prob = stats.zprob((lo - np.mean(reference, axis=1))
                          / (np.std(reference, axis=1) + 10))
    hi_prob = stats.zprob((hi - np.mean(reference, axis=1))
                          / (np.std(reference, axis=1) + 10))
    return float(hi_prob - lo_prob)
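# A quick, hypothetical check of prob(): the column names and numbers below
# are invented for illustration; only the *_FPKM / *_conf_lo / *_conf_hi
# naming convention comes from the function itself.
import pandas as pd

sample = pd.Series({
    'emb1_FPKM': 10.0, 'emb1_conf_lo': 8.0, 'emb1_conf_hi': 13.0,
    'emb2_FPKM': 12.0, 'emb2_conf_lo': 9.0, 'emb2_conf_hi': 14.0,
})
# a single reference row of expression values (2-D, for the axis=1 stats)
reference = np.array([[9.0, 11.0, 10.5, 12.0]])
print(prob(sample, reference))  # mass of the reference normal inside [lo, hi]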
import math

from scipy import stats as st  # legacy st.zprob == st.norm.cdf; see the shim above


def wilcoxon_test(a, b, alpha=.05):
    """Performs the Wilcoxon signed-rank non-parametric test for two
    algorithms evaluated on the same N datasets.  `result` is True when the
    null hypothesis of no difference is NOT rejected at level `alpha`.
    `wilcoxon_table` (alpha -> critical values of T by N) is defined
    elsewhere."""
    N = len(a)  # number of datasets
    # compute the absolute differences and keep the signs
    differences = [abs(a[i] - b[i]) for i in range(N)]
    signs = [a[i] - b[i] for i in range(N)]
    tmp = sorted(differences)
    # the rank of v is the average of the 1-based index of the first element
    # equal to v in tmp (index(v) + 1) and that of the last one
    # (index(v) + count(v)), which handles ties
    ranks = [(tmp.index(v) + 1 + tmp.index(v) + tmp.count(v)) / 2.0
             for v in differences]
    # add up the ranks for positive and negative signs; zero differences
    # contribute half their rank to each side
    r_plus = r_minus = 0.0
    for i in range(N):
        if signs[i] < 0:
            r_minus += ranks[i]
        elif signs[i] > 0:
            r_plus += ranks[i]
        else:
            r_minus += ranks[i] / 2.0
            r_plus += ranks[i] / 2.0
    # the test statistic is the smaller of the two rank sums
    T = min(r_plus, r_minus)
    # below ~30 datasets the gaussian approximation is unsafe; use the table
    if N <= 30:
        return {"result": T > wilcoxon_table[alpha][N],
                "statistic": T,
                "critical": wilcoxon_table[alpha][N]}
    else:
        z = (T - N * (N + 1) / 4.0) / math.sqrt(N * (N + 1) * (2 * N + 1) / 24.0)
        # NB: "critical" carries the one-tailed probability here, not a
        # tabulated value
        return {"result": st.zprob(z) * 2 > alpha,
                "statistic": z,
                "critical": st.zprob(z)}
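# A hypothetical run of wilcoxon_test with N > 30 so the gaussian branch is
# exercised (the small-sample branch needs the external wilcoxon_table,
# which is not shown here).  Algorithm b is shifted slightly above a, so the
# test should reject the no-difference null, i.e. result == False.
import random

random.seed(0)
a = [random.gauss(0.70, 0.05) for _ in range(40)]
b = [x + random.gauss(0.03, 0.02) for x in a]
out = wilcoxon_test(a, b)
print(out["statistic"], out["result"])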
def probability_of_f2_being_better_than(self, design):
    """Gaussian approximation to the probability that this design's f2
    beats `design`'s, from a two-sample z-score on the f2 means."""
    m1 = self.f2_mean
    s1 = self.f2_std
    n1 = len(self.f2_vals)
    m2 = design.f2_mean
    s2 = design.f2_std
    n2 = len(design.f2_vals)
    # z-score of the difference in means; note the leading minus: zprob(t)
    # is large when m2 > m1, i.e. a *lower* f2 counts as better here
    t = -(m1 - m2) / (s1**2 / n1 + s2**2 / n2) ** 0.5
    return stats.zprob(t)
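# The method above expects an object exposing f2_mean, f2_std and f2_vals.
# A minimal stand-in class, purely for illustration (the real class is not
# part of the snippet; stats.zprob is the normal CDF shimmed in earlier):
import numpy as np
from scipy import stats


class Design(object):
    def __init__(self, f2_vals):
        self.f2_vals = list(f2_vals)
        self.f2_mean = np.mean(self.f2_vals)
        self.f2_std = np.std(self.f2_vals, ddof=1)


Design.probability_of_f2_being_better_than = probability_of_f2_being_better_than

d1 = Design([1.2, 1.1, 1.3, 1.25])
d2 = Design([1.0, 0.9, 1.05, 0.95])
print(d1.probability_of_f2_being_better_than(d2))  # ~0: d1's mean f2 is higher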
import math
from itertools import groupby

from scipy import stats

zprob = stats.norm.cdf  # legacy scipy.stats.zprob: left-tail area under N(0, 1)


def wald_wolfowitz(sequence):
    """
    implements the wald-wolfowitz runs test:
    http://en.wikipedia.org/wiki/Wald-Wolfowitz_runs_test
    http://support.sas.com/kb/33/092.html

    :param sequence: any iterable with at most 2 values. e.g.
                     '1001001'
                     [1, 0, 1, 0, 1]
                     'abaaabbba'
    :rtype: a dict with keys of
        `n_runs`: the number of runs in the sequence
        `p`: the support to reject the null hypothesis that the number of
             runs supports a random sequence
        `z`: the z-score, used to calculate the p-value
        `sd`, `mean`: the expected standard deviation and mean of the number
             of runs, given the ratio of 1's/0's in the sequence

    >>> r = wald_wolfowitz('1000001')
    >>> r['n_runs']  # should be 3, because 1, 0, 1
    3
    >>> r['p'] < 0.05  # not < 0.05, no evidence to reject Ho of a random sequence
    False

    # this should show significance for non-randomness
    >>> li = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
    >>> wald_wolfowitz(li)['p'] < 0.05
    True
    """
    R = n_runs = sum(1 for _ in groupby(sequence))
    n = float(sum(1 for s in sequence if s == sequence[0]))
    m = float(sum(1 for s in sequence if s != sequence[0]))

    # expected mean and variance of the number of runs
    ER = ((2 * n * m) / (n + m)) + 1
    VR = (2 * n * m * (2 * n * m - n - m)) / ((n + m) ** 2 * (n + m - 1))

    # sanity check: O is an algebraically equivalent form of VR
    O = (ER - 1) * (ER - 2) / (n + m - 1.)
    assert VR - O < 0.001, (VR, O)

    SD = math.sqrt(VR)
    Z = (R - ER) / SD  # z-score
    # NB: zprob(Z) is the left-tail probability, so only sequences with too
    # *few* runs get a small p; see the example after this function
    return {'z': Z, 'mean': ER, 'sd': SD, 'p': zprob(Z), 'n_runs': R}
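# A quick illustration of the left-tail caveat above: a strictly alternating
# sequence has far more runs than expected, yet p comes out near 1 instead
# of near 0, because zprob(Z) only measures the lower tail.
r = wald_wolfowitz('0101010101')
print(r['n_runs'])  # 10 runs in 10 symbols: maximally non-random
print(r['p'])       # ~0.996: the left-tail p-value does not flag this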
from math import sqrt

from scipy import stats  # stats.zprob: see the shim above


def example_test_a_proportion(p=0.1, n=100, target=0.1,
                              significance_level=0.95):
    """ The math behind testing a single proportion """
    # standard error of the sample proportion
    sigma = sqrt(p * (1 - p) / n)
    # z-score
    z = (p - target) / sigma
    # two-sided p-value; the raw zprob(z) alone is one-tailed and
    # sign-dependent, so it would miss p above the target
    prob = 2 * (1 - stats.zprob(abs(z)))
    # If we observe a large p-value, for example larger than 0.05 or 0.1,
    # then we cannot reject the null hypothesis of identical proportions
    alpha = 1 - significance_level
    if prob > alpha:
        return 'Proportion is equal to target'
    else:
        return 'Proportion is not equal to target'
def example_compare_two_proportions(p1=0.1, n1=100, p2=0.1, n2=100,
                                    significance_level=0.95):
    """ The math behind comparing two proportions """
    # pooled sample proportion
    p = ((p1 * n1) + (p2 * n2)) / (n1 + n2)
    # standard error
    se = sqrt(p * (1. - p) * ((1. / n1) + (1. / n2)))
    # z-score
    z = (p1 - p2) / se
    # two-sided p-value (zprob(z) alone is one-tailed and sign-dependent)
    prob = 2 * (1 - stats.zprob(abs(z)))
    # If we observe a large p-value, for example larger than 0.05 or 0.1,
    # then we cannot reject the null hypothesis of identical proportions
    alpha = 1 - significance_level
    if prob > alpha:
        return 'Proportions are equal'
    else:
        return 'Proportions are not equal'
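# Exercising both examples with illustrative numbers: a small deviation that
# n = 100 cannot distinguish from the target, and a gap between two
# proportions that it can.
print(example_test_a_proportion(p=0.12, n=100, target=0.10))
# -> 'Proportion is equal to target' (z ~ 0.6, p ~ 0.54)
print(example_compare_two_proportions(p1=0.10, n1=100, p2=0.35, n2=100))
# -> 'Proportions are not equal' (z ~ -4.2, p << 0.05)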
# Method from a QGIS 1.x-era plugin (Python 2): QString comes from
# PyQt4.QtCore and QgsPoint from qgis.core; zprob is the legacy
# scipy.stats normal CDF.
def getMoranGeary(self, myLayer, myBand, m):
    # formula for Moran's I:
    #   I = [ sum i=<1..n> sum j=<1..n> w(i,j)(x(i) - x(m))(x(j) - x(m)) /
    #         sum i=<1..n> (x(i) - x(m))^2 ] *
    #       [ n / sum i=<1..n> sum j=<1..n> w(i,j) ]
    # where n      = number of pixels,
    #       w(i,j) = weight (1 if j is next to i, 0 otherwise)
    #       x(i)   = value at position i
    #       x(m)   = global mean of layer
    #
    # formula for Geary's C:
    #   C = [ sum i=<1..n> sum j=<1..n> w(i,j)(x(i) - x(j))^2 /
    #         sum i=<1..n> (x(i) - x(m))^2 ] *
    #       [ (n - 1) / (2 * sum i=<1..n> sum j=<1..n> w(i,j)) ]
    # variables as Moran's I
    #
    # variance of Moran's I (assuming normality):
    #   Variance = [ (n^2*S1 - n*S2 + 3*S0^2) / (S0^2*(n^2 - 1)) ] - E^2
    # where
    #   S0 = sum i=<1..n> sum j=<1..n> w(ij), i<>j
    #   S1 = 1/2 sum i=<1..n> sum j=<1..n> (w(ij) + w(ji))^2, i<>j
    #   S2 = sum i=<1..n> [sum j=<1..n> w(ij) + sum j=<1..n> w(ji)]^2
    #   E  = expected value = -1/(n - 1)
    # z-score for Moran's I assuming normality:
    #   Zscore = (I - E) / Variance^0.5
    #
    # variance of Moran's I (randomisation test version):
    #   Variance = [ n((n^2 - 3n + 3)S1 - nS2 + 3S0^2)
    #                - k((n^2 - n)S1 - 2nS2 + 6S0^2) ] /
    #              [ (n - 1)(n - 2)(n - 3)S0^2 ] - E^2
    # where S0, S1, S2, E are as above and
    #   k = [ (sum i=<1..n> (x(i) - x(m))^4) / n ] /
    #       [ (sum i=<1..n> (x(i) - x(m))^2) / n ]^2
    # z-score as above

    # initialise variables
    myN = 0
    # denominator for Moran & Geary is the same
    myDenominator = 0
    # numerators are different
    myNumeratorMI = 0.0
    myNumeratorGC = 0.0
    myNumeratorCount = 0
    [myS0, myS1, myS2, myKNum, myKDenom, myK] = [0, 0, 0, 0, 0, 0]
    [myMoranI, myVarianceMIAN, myZMIAN, myPMIAN,
     myVarianceMIRV, myZMIRV, myPMIRV,
     myGearyC, myVarianceGCAN, myZGCAN, myPGCAN,
     myVarianceGCRV, myZGCRV, myPGCRV] = [None] * 14
    # remember the no-data / out-of-extent markers
    myNDV = QString(u'null (no data)')
    myOE = QString(u'out of extent')
    # set up extent parameters
    xMin = myLayer.extent().xMinimum()
    yMin = myLayer.extent().yMinimum()
    xMax = myLayer.extent().xMaximum()
    yMax = myLayer.extent().yMaximum()
    xDim = myLayer.width()
    yDim = myLayer.height()
    xSize = (xMax - xMin) / xDim
    ySize = (yMax - yMin) / yDim
    # loop through all pixels
    for i in range(xDim):
        x = xMin + (xSize / 2) + (i * xSize)
        for j in range(yDim):
            y = yMin + (ySize / 2) + (j * ySize)
            # get the pixel value
            zstr = myLayer.identify(QgsPoint(x, y))[1].values()[myBand]
            # do sums if we have a value
            if not zstr == myNDV:
                z = float(zstr)
                myN += 1
                myDenominator += pow(z - m, 2)
                myKNum += pow(z - m, 4)
                myKDenom = myDenominator  # same running sum at this point
                myS2_ct = 0
                # loop through adjacent pixels
                for ii in range(-1 * self.Radius, self.Radius + 1):
                    xx = x + (ii * xSize)
                    for jj in range(-1 * self.Radius, self.Radius + 1):
                        yy = y + (jj * ySize)
                        zzstr = myLayer.identify(QgsPoint(xx, yy))[1].values()[myBand]
                        # ignore if nodata, out of extent or on the diagonal
                        if not (zzstr == myNDV or zzstr == myOE
                                or abs(ii) == abs(jj)):
                            zz = float(zzstr)
                            myNumeratorMI += (z - m) * (zz - m)
                            myNumeratorGC += pow(z - zz, 2)
                            myNumeratorCount += 1
                            myS0 += 1     # w(ij) = 1
                            myS1 += 4     # (w(ij) + w(ji))^2 = (1+1)^2 = 4
                            myS2_ct += 2  # w(ij) + w(ji)
                # finish the S2 running total by squaring this pixel's
                # adjacency count
                myS2 += pow(myS2_ct, 2)
    # now put numerator and denominator together
    if myDenominator == 0 or myNumeratorCount == 0:
        myMoranI = None
        myGearyC = None
    else:
        myMoranI = (float(myN) / float(myNumeratorCount)
                    * myNumeratorMI / myDenominator)
        myGearyC = (float(myN - 1) / (2 * float(myNumeratorCount))
                    * myNumeratorGC / myDenominator)
        # stats for Moran's I
        # expected value of Moran's I: -1/(n - 1)
        myE = -1 * pow(myN - 1, -1)
        # finish the S1 calculation
        myS1 = myS1 / 2
        # variance of Moran's I assuming normality
        myVarianceMIAN = (((pow(myN, 2) * myS1) - (myN * myS2)
                           + (3 * pow(myS0, 2)))
                          / (pow(myS0, 2) * (pow(myN, 2) - 1))) - pow(myE, 2)
        # variance of Moran's I, randomisation version
        if myN > 0 and myKDenom > 0:
            myK = (myKNum / myN) / pow(myKDenom / myN, 2)
            myVarianceMIRV = ((myN * ((pow(myN, 2) - (3 * myN) + 3) * myS1
                                      - myN * myS2 + 3 * pow(myS0, 2))
                               - myK * ((pow(myN, 2) - myN) * myS1
                                        - 2 * myN * myS2
                                        + 6 * pow(myS0, 2)))
                              / ((myN - 1) * (myN - 2) * (myN - 3)
                                 * pow(myS0, 2))) - pow(myE, 2)
        if myVarianceMIAN > 0:
            # z-score for Moran's I assuming normality
            myZMIAN = (myMoranI - myE) / pow(myVarianceMIAN, 0.5)
            # two-tailed p-value that Moran's I shows no significant
            # autocorrelation; use |Z| so a negative z-score cannot
            # produce p > 1
            myPMIAN = 2 * (1 - zprob(abs(myZMIAN)))
        if myVarianceMIRV > 0:
            # z-score for Moran's I, randomisation version
            myZMIRV = (myMoranI - myE) / pow(myVarianceMIRV, 0.5)
            myPMIRV = 2 * (1 - zprob(abs(myZMIRV)))
        # variance, z-score & p-value for Geary's C, normality version
        myVarianceGCAN = ((((2 * myS1) + myS2) * (myN - 1)
                           - (4 * pow(myS0, 2)))
                          / (2 * (myN + 1) * pow(myS0, 2)))
        if myVarianceGCAN > 0:
            myZGCAN = -1 * (myGearyC - 1) / pow(myVarianceGCAN, 0.5)
            myPGCAN = 2 * (1 - zprob(abs(myZGCAN)))
        # now the randomisation version
        myVarianceGCRV = ((((myN - 1) * myS1)
                           * (pow(myN, 2) - (3 * myN) + 3 - ((myN - 1) * myK))
                           - ((((myN - 1) * myS2)
                               * (pow(myN, 2) + (3 * myN) - 6
                                  - ((pow(myN, 2) - myN + 2) * myK))) / 4)
                           + (pow(myS0, 2)
                              * (pow(myN, 2) - 3 - (pow(myN - 1, 2) * myK))))
                          / ((myN * (myN - 2) * (myN - 3)) * pow(myS0, 2)))
        if myVarianceGCRV > 0:
            myZGCRV = -1 * (myGearyC - 1) / pow(myVarianceGCRV, 0.5)
            myPGCRV = 2 * (1 - zprob(abs(myZGCRV)))
    return [myMoranI, myVarianceMIAN, myZMIAN, myPMIAN,
            myVarianceMIRV, myZMIRV, myPMIRV,
            myGearyC, myVarianceGCAN, myZGCAN, myPGCAN,
            myVarianceGCRV, myZGCRV, myPGCRV]
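# To see the core of the calculation without the QGIS plumbing, here is a
# minimal NumPy sketch of Moran's I on a small grid with rook (edge-sharing)
# adjacency.  It mirrors the I formula from the comments above, not the
# plugin's exact weighting (which keeps neighbours within self.Radius and
# drops diagonals).
import numpy as np


def morans_i(grid):
    x = np.asarray(grid, dtype=float)
    d = x - x.mean()
    num = 0.0  # sum_ij w(i,j) (x(i) - m)(x(j) - m)
    s0 = 0     # sum_ij w(i,j)
    rows, cols = x.shape
    for r in range(rows):
        for c in range(cols):
            for dr, dc in ((-1, 0), (1, 0), (0, -1), (0, 1)):
                rr, cc = r + dr, c + dc
                if 0 <= rr < rows and 0 <= cc < cols:
                    num += d[r, c] * d[rr, cc]
                    s0 += 1
    n = x.size
    return (n / s0) * (num / (d ** 2).sum())


# a smooth gradient is positively autocorrelated; a checkerboard is the
# most negative case and gives exactly -1 here
gradient = np.arange(16).reshape(4, 4)
checker = np.indices((4, 4)).sum(axis=0) % 2
print(morans_i(gradient), morans_i(checker))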
# Script fragment; assumes earlier: import matplotlib.pyplot as mpl;
# import numpy as np; from scipy import stats;
# from progressbar import ProgressBar, Percentage, Bar, ETA.
# args, ts, slice, frame, best_cycle, FPKM_cols, n_pos, get_std and bayes
# are all defined elsewhere in the script.
mpl.figure(figsize=(args.figwidth, args.figheight))
slice = slice.dropna(how='any')
# flat prior over the n_pos candidate positions, one column per sample
priors = np.ones((n_pos, len(FPKM_cols))) / n_pos
widgets = ['Time %s:' % ts, Percentage(), Bar(), ETA()]
progress = ProgressBar(widgets=widgets)
for gene in progress(slice.index):
    if gene not in frame.index:
        continue
    normed = (slice.ix[gene] / max(slice.ix[gene])
              * np.mean(best_cycle.ix[gene], axis=1))
    for i, col in enumerate(FPKM_cols):
        std = get_std(col, frame)
        # likelihood of a deviation at least this large under a normal
        # model; the +1 keeps std away from zero
        evidence = stats.zprob(-np.abs((normed - frame[col][gene]) / (std + 1)))
        priors[:, i] = bayes(priors[:, i], evidence)
my_cm = getattr(mpl.cm, args.colormap)
n_pos, n_samples = np.shape(priors)
plots = []
for i in range(n_samples):
    plots.extend(mpl.plot(priors[:, i],
                          label=FPKM_cols[i].replace('_FPKM', ''),
                          # integer index into the 256-entry colormap
                          color=my_cm(i * 256 // (n_samples - 1))))
ax = mpl.gca()
Y = priors.max()
dY = 0.25 * Y
Y += dY
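# bayes() is defined elsewhere in the script; a minimal sketch of what such
# a discrete Bayesian update usually looks like (hypothetical, assuming
# `evidence` holds one likelihood per candidate position):
import numpy as np


def bayes(prior, evidence):
    """Posterior proportional to prior * likelihood, renormalised."""
    posterior = np.asarray(prior) * np.asarray(evidence)
    total = posterior.sum()
    if total == 0:
        return np.asarray(prior)  # all likelihoods vanished; keep the prior
    return posterior / total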
def d_two(self):
    # zprob here is the cumulative standard normal, so this is N(d2),
    # presumably the Black-Scholes risk-neutral exercise probability
    return stats.zprob(self.calc_d_two())
def delta(self):
    # N(d1): under the Black-Scholes reading of these names, the call delta
    return stats.zprob(self.calc_d_one())
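# For context, a self-contained sketch (not from the original class) of how
# N(d1) and N(d2) combine into the Black-Scholes call price; calc_d_one /
# calc_d_two above presumably implement the d1/d2 formulas below.
from math import exp, log, sqrt

from scipy.stats import norm  # norm.cdf plays the role of zprob


def black_scholes_call(S, K, T, r, sigma):
    """European call: S*N(d1) - K*exp(-r*T)*N(d2)."""
    d1 = (log(S / K) + (r + sigma ** 2 / 2) * T) / (sigma * sqrt(T))
    d2 = d1 - sigma * sqrt(T)
    return S * norm.cdf(d1) - K * exp(-r * T) * norm.cdf(d2)


# at the money, one year out, 20% vol, 5% rate -> about 10.45
print(black_scholes_call(S=100, K=100, T=1.0, r=0.05, sigma=0.2))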
import numpy as np
from scipy import stats

zprob = stats.norm.cdf  # legacy scipy.stats.zprob, as above


def zTransform(r, n):
    """One-sided p-value for a sample correlation r over n pairs, via
    Fisher's z-transform: z = atanh(r) * sqrt(n - 3), where atanh(r) is
    log((1 + r) / (1 - r)) / 2."""
    z = np.log((1 + r) / (1 - r)) * (np.sqrt(n - 3) / 2)
    p = zprob(-z)
    return p
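# Sanity checks with illustrative numbers: atanh(0.5) ~ 0.5493, so for
# r = 0.5 and n = 30 the z-score is 0.5493 * sqrt(27) ~ 2.85 and the
# one-sided p-value is about 0.002.
print(zTransform(0.5, 30))   # ~0.0022: significant positive correlation
print(zTransform(0.05, 30))  # ~0.40: consistent with zero correlation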