def spearman_rho(m, n):
    """
    Return Spearman's rho; based off stats.py

    Classic (no-tie) formula: 1 - 6 * sum(d^2) / (n * (n^2 - 1)), where d
    is the per-element difference between the two rank vectors. For data
    with ties, use spearman_rho_tr instead.

    >>> x = [2, 8, 5, 4, 2, 6, 1, 4, 5, 7, 4]
    >>> y = [3, 9, 4, 3, 1, 7, 2, 5, 6, 8, 3]
    >>> print round(spearman_rho(x, y), 3)
    0.936
    """
    assert len(m) == len(n)
    # accumulate the squared rank differences, pairing the two rank vectors
    d_squared = 0
    for (rm, rn) in zip(rank(m), rank(n)):
        d_squared += (rm - rn) ** 2
    return 1. - 6. * d_squared / float(len(m) * (len(n) ** 2 - 1.))
def mann_whitney_U(n1, n2):
    """
    Wilcoxan (Mann-Whitney/rank sum) U test, as generalized in:

    H.B. Mann & D.R. Whitney. 1947. On a test of whether one of two random
    variables is stochastically larger than the other. Annals of
    Mathematical Statistics 18(1): 50-60.

    NB: no adjustments are made for ties; these are both fine if your
    samples are large and aren't ordinal (or small counts)

    Returns a dict with 'U' (the smaller of the two U statistics) and
    'p.U' (the two-tailed p-value, twice the one-tailed p from p_U).

    >>> from csv import DictReader
    >>> from collections import defaultdict
    >>> species2petal_width = defaultdict(list)
    >>> for row in DictReader(open('iris.csv', 'r')):
    ...     species = row['Species']
    ...     width = float(row['Sepal.Width'])
    ...     species2petal_width[species].append(width)
    >>> U_results = mann_whitney_U(species2petal_width['versicolor'],
    ...                            species2petal_width['virginica'])
    >>> U_results['U']
    841.0
    >>> round(U_results['p.U'], 4)
    0.0045
    """
    l1 = len(n1)
    l2 = len(n2)
    # rank the pooled samples; the first l1 ranks belong to n1
    rank_sum = sum(rank(n1 + n2)[:l1])
    u1 = rank_sum - l1 * (l1 + 1) / 2.
    # the two U statistics always sum to l1 * l2, so derive u2 directly
    u2 = l1 * l2 - u1
    # convention: report the smaller U; double the one-tailed p for a
    # two-tailed test
    U = min(u1, u2)
    p = 2. * p_U(U, l1, l2)
    return {'U': U, 'p.U': p}
def spearman_rho_tr(m, n):
    """
    rho for tied ranks, checked by comparison with Pycluster

    Computed as the Pearson correlation of the two rank vectors, which is
    valid in the presence of ties (unlike the closed-form d^2 formula).

    >>> x = [2, 8, 5, 4, 2, 6, 1, 4, 5, 7, 4]
    >>> y = [3, 9, 4, 3, 1, 7, 2, 5, 6, 8, 3]
    >>> print round(spearman_rho_tr(x, y), 3)
    0.935
    """
    assert len(m) == len(n), 'args must be the same length'
    m_ranks = rank(m)
    n_ranks = rank(n)
    m_bar = mean(m_ranks)
    n_bar = mean(n_ranks)
    # center each rank vector about its mean
    m_dev = [r - m_bar for r in m_ranks]
    n_dev = [r - n_bar for r in n_ranks]
    numerator = sum(a * b for (a, b) in zip(m_dev, n_dev))
    denominator = sqrt(sum(a * a for a in m_dev) * sum(b * b for b in n_dev))
    return numerator / denominator