示例#1
0
def ttest_ind(a, b, axis=0):
    """Calculate the T-test on TWO INDEPENDENT samples of scores, a and b.

    Uses the pooled-variance t statistic from Numerical Recipes, p.483.
    When an input carries a mask with any entry set, the masked entries
    are subtracted from that sample's observation count.

    Parameters
    ----------
    a, b : array_like
        The two samples of scores.
    axis : int or None, optional
        The axis over which to operate on a and b.  ``None`` means the
        arrays are ravelled first.  Default is 0.

    Returns
    -------
    t : float or array
        The t statistic.  Positions where the pooled variance is exactly
        zero are reported as 1.0.
    probs : float or array
        The two-tailed p-value.
    """
    a, b, axis = _chk2_asarray(a, b, axis)

    x1 = np.average(a, axis)
    x2 = np.average(b, axis)

    # Number of observations per sample along `axis`, not counting masked
    # entries when a mask is present.
    if np.ma.getmask(a).any():
        n1 = a.shape[axis] - a.mask.sum(axis)
    else:
        n1 = a.shape[axis]

    if np.ma.getmask(b).any():
        n2 = b.shape[axis] - b.mask.sum(axis)
    else:
        n2 = b.shape[axis]

    # The pooled variance needs each sample's sum of squared deviations.
    # .var() defaults to ddof=0 (it divides by n), so that sum is n * var.
    # Weighting the ddof=0 variance by (n - 1) instead would underestimate
    # the pooled variance and inflate |t|.  Using n * var also stays finite
    # for a single-observation sample, unlike var(ddof=1).
    ss1 = n1 * a.var(axis)
    ss2 = n2 * b.var(axis)

    df = n1 + n2 - 2
    svar = (ss1 + ss2) / df
    zerodivproblem = svar == 0
    t = (x1 - x2) / np.sqrt(svar * (1.0 / n1 + 1.0 / n2))  # N-D computation

    t = np.where(zerodivproblem, 1.0, t)  # replace NaN t-values with 1.0
    probs = stats.betai(0.5 * df, 0.5, df / (df + t * t))

    # Give the p-values the same shape as t, and unwrap a length-1 result.
    if not np.isscalar(t):
        probs = probs.reshape(t.shape)
    if not np.isscalar(probs) and len(probs) == 1:
        probs = probs[0]
    return t, probs
示例#2
0
def ttest_ind(a, b, axis=0):
    """Calculate the T-test on TWO INDEPENDENT samples of scores, a and b.

    Uses the pooled-variance t statistic from Numerical Recipes, p.483.
    When an input carries a mask with any entry set, the masked entries
    are subtracted from that sample's observation count.

    Parameters
    ----------
    a, b : array_like
        The two samples of scores.
    axis : int or None, optional
        The axis over which to operate on a and b.  ``None`` means the
        arrays are ravelled first.  Default is 0.

    Returns
    -------
    t : float or array
        The t statistic.  Positions where the pooled variance is exactly
        zero are reported as 1.0.
    probs : float or array
        The two-tailed p-value.
    """
    a, b, axis = _chk2_asarray(a, b, axis)

    x1 = np.average(a, axis)
    x2 = np.average(b, axis)

    # Number of observations per sample along `axis`, not counting masked
    # entries when a mask is present.
    if np.ma.getmask(a).any():
        n1 = a.shape[axis] - a.mask.sum(axis)
    else:
        n1 = a.shape[axis]

    if np.ma.getmask(b).any():
        n2 = b.shape[axis] - b.mask.sum(axis)
    else:
        n2 = b.shape[axis]

    # The pooled variance needs each sample's sum of squared deviations.
    # .var() defaults to ddof=0 (it divides by n), so that sum is n * var.
    # Weighting the ddof=0 variance by (n - 1) instead would underestimate
    # the pooled variance and inflate |t|.  Using n * var also stays finite
    # for a single-observation sample, unlike var(ddof=1).
    ss1 = n1 * a.var(axis)
    ss2 = n2 * b.var(axis)

    df = n1 + n2 - 2
    svar = (ss1 + ss2) / df
    zerodivproblem = svar == 0
    t = (x1 - x2) / np.sqrt(svar * (1.0 / n1 + 1.0 / n2))  # N-D computation

    t = np.where(zerodivproblem, 1.0, t)  # replace NaN t-values with 1.0
    probs = stats.betai(0.5 * df, 0.5, df / (df + t * t))

    # Give the p-values the same shape as t, and unwrap a length-1 result.
    if not np.isscalar(t):
        probs = probs.reshape(t.shape)
    if not np.isscalar(probs) and len(probs) == 1:
        probs = probs[0]
    return t, probs
示例#3
0
def approxrand(a, b, **kwargs):
    """
    Returns an approximate significance level between two lists of
    independently generated test values.

    Approximate randomization calculates significance by randomly drawing
    from a sample of the possible permutations. At the limit of the number
    of possible permutations, the significance level is exact. The
    approximate significance level is the sample mean number of times the
    statistic of the permutated lists varies from the actual statistic of
    the unpermuted argument lists.

    Recognised keyword arguments: C{shuffles} (requested number of
    shuffles, default 999), C{statistic} (callable applied to each list,
    default the arithmetic mean) and C{verbose} (print progress, default
    False).

    @return: a tuple containing an approximate significance level, the count
             of the number of times the pseudo-statistic varied from the
             actual statistic, and the number of shuffles
    @rtype: C{tuple}
    @param a: a list of test values
    @type a: C{list}
    @param b: another list of independently generated test values
    @type b: C{list}
    """
    total = len(a) + len(b)

    # There's no point in trying to shuffle beyond all possible permutations,
    # so the shuffle count is capped at total!.  The product is accumulated
    # only until it reaches the requested count: building the complete
    # factorial of a large input is a huge bignum computed just to be
    # discarded by min(), and reducing an empty range raises TypeError.
    shuffles = kwargs.get('shuffles', 999)
    permutations = 1
    factor = 2
    while factor <= total and permutations < shuffles:
        permutations *= factor
        factor += 1
    shuffles = min(shuffles, permutations)

    stat = kwargs.get('statistic', lambda lst: float(sum(lst)) / len(lst))
    verbose = kwargs.get('verbose', False)

    # Single-argument parenthesised prints behave identically whether print
    # is a statement or a function.
    if verbose:
        print('shuffles: %d' % shuffles)

    actual_stat = math.fabs(stat(a) - stat(b))

    if verbose:
        print('actual statistic: %f' % actual_stat)
        print('-' * 60)

    # NOTE(review): the counter starts at a tiny positive float rather than
    # 0, presumably so betai() below never receives an exact zero -- confirm.
    c = 1e-100
    lst = LazyConcatenation([a, b])
    # A real list is required because it is shuffled in place.
    indices = list(range(total))

    for i in range(shuffles):
        if verbose and i % 10 == 0:
            print('shuffle: %d' % i)

        random.shuffle(indices)

        # Split the shuffled pool into pseudo-samples with the sizes of a and b.
        pseudo_stat_a = stat(LazyMap(lambda idx: lst[idx], indices[:len(a)]))
        pseudo_stat_b = stat(LazyMap(lambda idx: lst[idx], indices[len(a):]))
        pseudo_stat = math.fabs(pseudo_stat_a - pseudo_stat_b)

        if pseudo_stat >= actual_stat:
            c += 1

        if verbose and i % 10 == 0:
            print('pseudo-statistic: %f' % pseudo_stat)
            print('significance: %f' % (float(c + 1) / (i + 1)))
            print('-' * 60)

    significance = float(c + 1) / (shuffles + 1)

    if verbose:
        print('significance: %f' % significance)
        if betai:
            for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
                print("prob(phi<=%f): %f" % (phi, betai(c, shuffles, phi)))

    return (significance, c, shuffles)
示例#4
0
文件: scores.py 项目: Joselin/nltk
def approxrand(a, b, **kwargs):
    """
    Returns an approximate significance level between two lists of
    independently generated test values.

    Approximate randomization calculates significance by randomly drawing
    from a sample of the possible permutations. At the limit of the number
    of possible permutations, the significance level is exact. The
    approximate significance level is the sample mean number of times the
    statistic of the permutated lists varies from the actual statistic of
    the unpermuted argument lists.

    Recognised keyword arguments: ``shuffles`` (requested number of
    shuffles, default 999), ``statistic`` (callable applied to each list,
    default the arithmetic mean) and ``verbose`` (print progress, default
    False).

    :return: a tuple containing an approximate significance level, the count
             of the number of times the pseudo-statistic varied from the
             actual statistic, and the number of shuffles
    :rtype: tuple
    :param a: a list of test values
    :type a: list
    :param b: another list of independently generated test values
    :type b: list
    """
    total = len(a) + len(b)

    # There's no point in trying to shuffle beyond all possible permutations,
    # so the shuffle count is capped at total!.  The product is accumulated
    # only until it reaches the requested count: building the complete
    # factorial of a large input is a huge bignum computed just to be
    # discarded by min(), and reducing an empty range raises TypeError.
    shuffles = kwargs.get('shuffles', 999)
    permutations = 1
    factor = 2
    while factor <= total and permutations < shuffles:
        permutations *= factor
        factor += 1
    shuffles = min(shuffles, permutations)

    stat = kwargs.get('statistic', lambda lst: float(sum(lst)) / len(lst))
    verbose = kwargs.get('verbose', False)

    # Single-argument parenthesised prints behave identically whether print
    # is a statement or a function.
    if verbose:
        print('shuffles: %d' % shuffles)

    actual_stat = fabs(stat(a) - stat(b))

    if verbose:
        print('actual statistic: %f' % actual_stat)
        print('-' * 60)

    # NOTE(review): the counter starts at a tiny positive float rather than
    # 0, presumably so betai() below never receives an exact zero -- confirm.
    c = 1e-100
    lst = LazyConcatenation([a, b])
    # A real list is required because it is shuffled in place.
    indices = list(range(total))

    for i in range(shuffles):
        if verbose and i % 10 == 0:
            print('shuffle: %d' % i)

        shuffle(indices)

        # Split the shuffled pool into pseudo-samples with the sizes of a and b.
        pseudo_stat_a = stat(LazyMap(lambda idx: lst[idx], indices[:len(a)]))
        pseudo_stat_b = stat(LazyMap(lambda idx: lst[idx], indices[len(a):]))
        pseudo_stat = fabs(pseudo_stat_a - pseudo_stat_b)

        if pseudo_stat >= actual_stat:
            c += 1

        if verbose and i % 10 == 0:
            print('pseudo-statistic: %f' % pseudo_stat)
            print('significance: %f' % (float(c + 1) / (i + 1)))
            print('-' * 60)

    significance = float(c + 1) / (shuffles + 1)

    if verbose:
        print('significance: %f' % significance)
        if betai:
            for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
                print("prob(phi<=%f): %f" % (phi, betai(c, shuffles, phi)))

    return (significance, c, shuffles)
示例#5
0
def pearsonr(x, y, dof=None):
    """
    Compute the Pearson correlation coefficient of two datasets along with
    the p-value for testing non-correlation.

    The coefficient measures the linear relationship between the datasets
    and lies between -1 and +1, with 0 implying no correlation and -1 or +1
    implying an exact linear relationship.  A positive value means y tends
    to increase as x increases; a negative value means y tends to decrease
    as x increases.  Strictly speaking, Pearson's correlation requires each
    dataset to be normally distributed.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets with a correlation at least as extreme as the one
    computed here.  The p-values are not entirely reliable but are probably
    reasonable for datasets larger than 500 or so.

    This is a modified version that accepts an optional argument to set the
    degrees of freedom (dof) manually.

    Parameters
    ----------
    x : (N,) array_like
        First dataset.
    y : (N,) array_like
        Second dataset; should have the same length as x.
    dof : int or None, optional
        Degrees of freedom used for the p-value.  When None, ``len(x) - 2``
        is used.

    Returns
    -------
    (Pearson's correlation coefficient,
     2-tailed p-value)

    References
    ----------
    http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation

    """
    # x and y should have the same length.
    x = np.asarray(x)
    y = np.asarray(y)
    n = len(x)

    # Centre both datasets on their means.
    mean_x = x.mean()
    mean_y = y.mean()
    x_centered = x - mean_x
    y_centered = y - mean_y

    numerator = np.add.reduce(x_centered * y_centered)
    denominator = np.sqrt(ss(x_centered) * ss(y_centered))
    r = numerator / denominator

    # Presumably, if abs(r) > 1, then it is only some small artifact of
    # floating point arithmetic, so clamp r into [-1, 1].
    if r > 1.0:
        r = 1.0
    elif r < -1.0:
        r = -1.0

    if dof is None:
        df = n - 2
    else:
        df = dof

    # An exact linear relationship is reported with a p-value of zero.
    if abs(r) == 1.0:
        return r, 0.0

    one_minus_r_squared = (1.0 - r) * (1.0 + r)
    t_squared = r * r * (df / one_minus_r_squared)
    prob = betai(0.5 * df, 0.5, df / (df + t_squared))
    return r, prob
示例#6
0
def approxrand(a, b, **kwargs):
    """
    Returns an approximate significance level between two lists of
    independently generated test values.

    Approximate randomization calculates significance by randomly drawing
    from a sample of the possible permutations. At the limit of the number
    of possible permutations, the significance level is exact. The
    approximate significance level is the sample mean number of times the
    statistic of the permutated lists varies from the actual statistic of
    the unpermuted argument lists.

    Recognised keyword arguments: ``shuffles`` (requested number of
    shuffles, default 999), ``statistic`` (callable applied to each list,
    default the arithmetic mean) and ``verbose`` (print progress, default
    False).

    :return: a tuple containing an approximate significance level, the count
             of the number of times the pseudo-statistic varied from the
             actual statistic, and the number of shuffles
    :rtype: tuple
    :param a: a list of test values
    :type a: list
    :param b: another list of independently generated test values
    :type b: list
    """
    total = len(a) + len(b)

    # There's no point in trying to shuffle beyond all possible permutations,
    # so the shuffle count is capped at total!.  The product is accumulated
    # only until it reaches the requested count: building the complete
    # factorial of a large input is a huge bignum computed just to be
    # discarded by min(), and reducing an empty range raises TypeError.
    shuffles = kwargs.get("shuffles", 999)
    permutations = 1
    factor = 2
    while factor <= total and permutations < shuffles:
        permutations *= factor
        factor += 1
    shuffles = min(shuffles, permutations)

    stat = kwargs.get("statistic", lambda lst: sum(lst) / len(lst))
    verbose = kwargs.get("verbose", False)

    if verbose:
        print("shuffles: %d" % shuffles)

    actual_stat = fabs(stat(a) - stat(b))

    if verbose:
        print("actual statistic: %f" % actual_stat)
        print("-" * 60)

    # NOTE(review): the counter starts at a tiny positive float rather than
    # 0, presumably so betai() below never receives an exact zero -- confirm.
    c = 1e-100
    lst = LazyConcatenation([a, b])
    # A real list is required because it is shuffled in place.
    indices = list(range(total))

    for i in range(shuffles):
        if verbose and i % 10 == 0:
            print("shuffle: %d" % i)

        shuffle(indices)

        # Split the shuffled pool into pseudo-samples with the sizes of a and b.
        pseudo_stat_a = stat(LazyMap(lambda idx: lst[idx], indices[: len(a)]))
        pseudo_stat_b = stat(LazyMap(lambda idx: lst[idx], indices[len(a) :]))
        pseudo_stat = fabs(pseudo_stat_a - pseudo_stat_b)

        if pseudo_stat >= actual_stat:
            c += 1

        if verbose and i % 10 == 0:
            print("pseudo-statistic: %f" % pseudo_stat)
            print("significance: %f" % ((c + 1) / (i + 1)))
            print("-" * 60)

    significance = (c + 1) / (shuffles + 1)

    if verbose:
        print("significance: %f" % significance)
        if betai:
            for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
                print("prob(phi<=%f): %f" % (phi, betai(c, shuffles, phi)))

    return (significance, c, shuffles)