Пример #1
0
def spearman_rho(m, n):
    """
    Compute Spearman's rank correlation (rho) with the classic
    sum-of-squared-rank-differences formula; based off stats.py.

    Both arguments are passed through `rank` (defined elsewhere in this
    module; presumably it yields one rank per observation -- confirm
    against its definition), the paired rank differences are squared and
    summed, and the total is plugged into 1 - 6 * d^2 / (n * (n^2 - 1)).

    `m` and `n` must have the same length; this is enforced with an
    `assert`, so the check disappears when Python runs with -O.

    >>> x = [2, 8, 5, 4, 2, 6, 1, 4, 5, 7, 4]
    >>> y = [3, 9, 4, 3, 1, 7, 2, 5, 6, 8, 3]
    >>> print(round(spearman_rho(x, y), 3))
    0.936
    """
    len_m = len(m)
    len_n = len(n)
    assert len_m == len_n
    ranks_m = rank(m)
    ranks_n = rank(n)
    # total of the squared differences between paired ranks
    squared_diffs = sum((rm - rn) ** 2 for (rm, rn) in zip(ranks_m, ranks_n))
    denominator = float(len_m * (len_n ** 2 - 1.))
    return 1. - 6. * squared_diffs / denominator
Пример #2
0
def mann_whitney_U(n1, n2):
    """
    Wilcoxon (Mann-Whitney/rank sum) U test, as generalized in:

    H.B. Mann & D.R. Whitney. 1947. On a test of whether one of two random
    variables is stochastically larger than the other. Annals of
    Mathematical Statistics 18(1): 50-60.

    The two samples are concatenated with `+` and ranked together via
    `rank`; the ranks belonging to the first sample are summed to obtain
    its U statistic, and the second sample's U is the product of the two
    sample sizes minus that value. The smaller of the two is reported,
    along with twice the value of `p_U(U, len(n1), len(n2))`. Both `rank`
    and `p_U` are defined elsewhere in this module.

    Returns a dict with the keys 'U' and 'p.U'.

    NB: no adjustments are made for ties; per the original author's note
    this is fine if your samples are large and aren't ordinal (or small
    counts)

    >>> from csv import DictReader
    >>> from collections import defaultdict
    >>> species2petal_width = defaultdict(list)
    >>> for row in DictReader(open('iris.csv', 'r')):
    ...     species = row['Species']
    ...     width = float(row['Sepal.Width'])
    ...     species2petal_width[species].append(width)
    >>> U_results = mann_whitney_U(species2petal_width['versicolor'],
    ...                            species2petal_width['virginica'])
    >>> U_results['U']
    841.0
    >>> round(U_results['p.U'], 4)
    0.0045
    """
    size1 = len(n1)
    size2 = len(n2)
    # rank the pooled observations; the first `size1` ranks belong to n1
    pooled_ranks = rank(n1 + n2)
    rank_sum1 = sum(pooled_ranks[:size1])
    u_first = rank_sum1 - size1 * (size1 + 1) / 2.
    u_second = size1 * size2 - u_first
    smaller_u = min(u_first, u_second)
    return {'U': smaller_u, 'p.U': 2. * p_U(smaller_u, size1, size2)}
Пример #3
0
def spearman_rho_tr(m, n):
    """
    Spearman's rho for data with tied ranks, checked by comparison with
    Pycluster.

    Both inputs are converted with `rank`, each rank list is centred on
    its `mean`, and the result is the sum of the products of the paired
    deviations divided by the square root of the product of the two sums
    of squared deviations. `rank` and `mean` are defined elsewhere in
    this module.

    `m` and `n` must have the same length; this is enforced with an
    `assert`, so the check disappears when Python runs with -O.

    >>> x = [2, 8, 5, 4, 2, 6, 1, 4, 5, 7, 4]
    >>> y = [3, 9, 4, 3, 1, 7, 2, 5, 6, 8, 3]
    >>> print(round(spearman_rho_tr(x, y), 3))
    0.935
    """
    assert len(m) == len(n), 'args must be the same length'
    ranks_m = rank(m)
    ranks_n = rank(n)
    centre_m = mean(ranks_m)
    centre_n = mean(ranks_n)
    cross_products = 0.
    squares_m = 0.
    squares_n = 0.
    # accumulate in a plain loop so the floating-point summation order
    # is exactly one term at a time, left to right
    for (rank_m, rank_n) in zip(ranks_m, ranks_n):
        dev_m = rank_m - centre_m
        dev_n = rank_n - centre_n
        cross_products += dev_m * dev_n
        squares_m += dev_m ** 2
        squares_n += dev_n ** 2
    return cross_products / sqrt(squares_m * squares_n)