def wilcoxon(x, y=None):
    """Calculate the Wilcoxon signed-rank test.

    Tests the null hypothesis that two paired samples come from the same
    distribution (a non-parametric counterpart of the paired T-test).
    The p-value comes from a normal approximation; a warning is printed
    when fewer than 10 non-zero differences remain.

    Parameters:
        x -- first sample, or the paired differences when y is None.
        y -- optional second sample; must have the same length as x.

    Returns: (T, prob)
        T    -- the smaller of the positive and negative rank sums.
        prob -- two-tailed p-value from the normal approximation.

    Raises:
        ValueError -- if x and y have different lengths.
    """
    if y is None:
        d = asarray(x)
    else:
        x, y = asarray(x), asarray(y)
        if len(x) != len(y):
            raise ValueError('Unequal N in wilcoxon. Aborting.')
        d = x - y
    d = compress(not_equal(d, 0), d)  # Keep all non-zero differences
    count = len(d)
    if count < 10:
        print("Warning: sample size too small for normal approximation.")

    # Rank the magnitudes, then split the rank sum by sign of the difference.
    r = stats.rankdata(abs(d))
    r_plus = sum((d > 0) * r)
    r_minus = sum((d < 0) * r)
    T = min(r_plus, r_minus)

    # Mean and variance of T under the null hypothesis.
    mn = count * (count + 1.0) * 0.25
    variance = count * (count + 1.0) * (2.0 * count + 1.0) / 24.0

    # Tie correction: every group of t equal ranks lowers the variance by
    # (t**3 - t)/48.  Groups of size one contribute zero, so no special
    # casing of the tie-free situation is required.
    tie_sizes = {}
    for rank in r:
        tie_sizes[rank] = tie_sizes.get(rank, 0) + 1
    for t in tie_sizes.values():
        variance -= t * (t * t - 1.0) / 48.0

    se = math.sqrt(variance)
    z = (T - mn) / se
    # Survival function rather than 1 - cdf: keeps precision for large |z|.
    prob = 2.0 * distributions.norm.sf(abs(z))
    return T, prob
def _apply_func(x, g, func):
    """Apply ``func`` to consecutive slices of ``x`` delimited by ``g``.

    ``g`` is a list of indices into ``x``.  The indices, together with
    the end points 0 and ``len(x)``, are passed through ``unique`` to
    obtain the group boundaries (assuming ``unique`` is numpy's, they
    also come back sorted).  Each adjacent pair of boundaries selects
    one group ``x[lo:hi]``, and the per-group results of ``func`` are
    returned as an array.
    """
    boundaries = unique(r_[0, g, len(x)])
    edges = zip(boundaries[:-1], boundaries[1:])
    results = [func(x[lo:hi]) for lo, hi in edges]
    return asarray(results)
def ansari(x, y):
    """Perform the Ansari-Bradley test for equal scale parameters.

    Determine if the scale parameter for two distributions with equal
    medians is the same using the Ansari-Bradley statistic.  One can
    reject the null hypothesis that the ratio of variances is 1 if the
    returned probability of error is small (say < 0.05).

    The exact null distribution (from ``statlib.gscale``) is used when
    both samples have fewer than 55 observations and the pooled data
    contain no ties; otherwise a normal approximation is used, with a
    tie-adjusted variance when ties are present.

    Parameters:
        x -- test observations (at least one).
        y -- other observations (at least one).

    Returns: (AB, pval)
        AB   -- the Ansari-Bradley statistic.
        pval -- two-sided probability of error.

    Raises:
        ValueError -- if either sample is empty.
    """
    x, y = asarray(x), asarray(y)
    n = len(x)
    m = len(y)
    if m < 1:
        raise ValueError("Not enough other observations.")
    if n < 1:
        raise ValueError("Not enough test observations.")

    N = m + n
    xy = r_[x, y]  # combine
    rank = stats.rankdata(xy)
    # Score each pooled observation by its rank counted from the nearer
    # end of the ordering.
    symrank = amin(array((rank, N - rank + 1)), 0)
    AB = sum(symrank[:n])

    repeats = len(unique(xy)) != len(xy)
    exact = (m < 55) and (n < 55) and not repeats
    if repeats and ((m < 55) or (n < 55)):
        print("Ties preclude use of exact statistic.")

    if exact:
        # a1 is used below as the frequency table of the statistic under
        # the null hypothesis, with a1[0] corresponding to value astart.
        # NOTE(review): ifault is not checked -- confirm whether a
        # non-zero value signals a failure that should be reported.
        astart, a1, ifault = statlib.gscale(n, m)
        ind = AB - astart
        total = sum(a1)
        if ind < len(a1) / 2.0:
            # Lower half: accumulate the lower tail up to the statistic.
            cind = int(ceil(ind))
            if ind == cind:
                pval = 2.0 * sum(a1[:cind + 1]) / total
            else:
                pval = 2.0 * sum(a1[:cind]) / total
        else:
            # Upper half: accumulate the upper tail from the statistic.
            find = int(floor(ind))
            if ind == floor(ind):
                pval = 2.0 * sum(a1[find:]) / total
            else:
                pval = 2.0 * sum(a1[find + 1:]) / total
        # Doubling a tail can exceed 1 near the centre; clip it.
        return AB, min(1.0, pval)

    # otherwise compute normal approximation
    if N % 2:  # N odd
        mnAB = n * (N + 1.0) ** 2 / 4.0 / N
        varAB = n * m * (N + 1.0) * (3 + N ** 2) / (48.0 * N ** 2)
    else:  # N even
        mnAB = n * (N + 2.0) / 4.0
        varAB = m * n * (N + 2) * (N - 2.0) / 48 / (N - 1.0)
    if repeats:  # adjust variance estimates
        # compute sum(tj * rj**2)
        fac = sum(symrank ** 2)
        if N % 2:  # N odd
            varAB = (m * n * (16 * N * fac - (N + 1) ** 4)
                     / (16.0 * N ** 2 * (N - 1)))
        else:  # N even
            varAB = (m * n * (16 * fac - N * (N + 2) ** 2)
                     / (16.0 * N * (N - 1)))
    z = (AB - mnAB) / sqrt(varAB)
    # Survival function rather than 1 - cdf: keeps precision for large |z|.
    pval = 2.0 * distributions.norm.sf(abs(z))
    return AB, pval