Example #1
0
File: test.py  Project: jdurbin/lokeycyto
def ks_2samp2(data1, data2):
    """Compute the two-sample Kolmogorov-Smirnov statistic and p-value.

    Parameters
    ----------
    data1, data2 : array_like
        1-D samples of observations; sizes may differ.

    Returns
    -------
    d : float
        KS statistic: maximum absolute distance between the two
        empirical CDFs.
    prob : float
        Two-tailed asymptotic p-value; falls back to 1.0 (most
        conservative) if the ksprob evaluation fails.
    """
    # np.asarray for consistency with the rest of the function
    # (original used a bare `asarray`); also dropped leftover
    # Python-2 debug prints and the duplicate n1/n2 computation.
    data1, data2 = map(np.asarray, (data1, data2))
    n1 = len(data1)
    n2 = len(data2)
    data1 = np.sort(data1)
    data2 = np.sort(data2)
    # Evaluate both empirical CDFs at every pooled sample point.
    data_all = np.concatenate([data1, data2])
    cdf1 = np.searchsorted(data1, data_all, side='right') / (1.0 * n1)
    cdf2 = np.searchsorted(data2, data_all, side='right') / (1.0 * n2)
    # Note: d absolute not signed distance
    d = np.max(np.absolute(cdf1 - cdf2))
    en = np.sqrt(n1 * n2 / float(n1 + n2))
    try:
        # Stephens' small-sample correction to the asymptotic distribution.
        prob = ksprob((en + 0.12 + 0.11 / en) * d)
    except Exception:  # narrowed from bare except: ksprob may fail to converge
        prob = 1.0
    return d, prob
Example #2
0
def ks_2samp(data1, data2):
    """
    Computes the Kolmogorov-Smirnov statistic on 2 samples.

    This is a two-sided test for the null hypothesis that 2 independent
    samples are drawn from the same continuous distribution. Like the
    one-sample K-S test, the distribution is assumed to be continuous;
    one-sided tests are not implemented. The test uses the two-sided
    asymptotic Kolmogorov-Smirnov distribution.

    TAK: taken from scipy.stats. Extended to return the value for which
    the KS-statistic is maximal.

    Parameters
    ----------
    data1, data2 : sequence of 1-D ndarrays
        two arrays of sample observations assumed to be drawn from a
        continuous distribution, sample sizes can be different

    Returns
    -------
    D : float
        KS statistic (maximum absolute distance between the two
        empirical CDFs); if D is small or the p-value is high we cannot
        reject the hypothesis that the two samples share a distribution
    p-value : float
        two-tailed p-value
    x_D : float
        value of the control-variable for which the KS statistic is
        maximal
    """
    data1, data2 = map(np.asarray, (data1, data2))
    # Sample sizes (the original computed these twice; once is enough).
    n1 = len(data1)
    n2 = len(data2)
    data1 = np.sort(data1)
    data2 = np.sort(data2)
    # Evaluate both empirical CDFs at every pooled sample point.
    data_all = np.concatenate([data1, data2])
    cdf1 = np.searchsorted(data1, data_all, side='right') / (1.0 * n1)
    cdf2 = np.searchsorted(data2, data_all, side='right') / (1.0 * n2)
    # Note: d absolute not signed distance.  Compute |cdf1 - cdf2| once
    # (the original evaluated it twice, for max and argmax).
    diffs = np.absolute(cdf1 - cdf2)
    d = np.max(diffs)
    x_d = data_all[np.argmax(diffs)]
    en = np.sqrt(n1 * n2 / float(n1 + n2))
    try:
        # Stephens' small-sample correction to the asymptotic distribution.
        prob = ksprob((en + 0.12 + 0.11 / en) * d)
    except Exception:  # narrowed from bare except: ksprob may fail to converge
        prob = 1.0
    return d, prob, x_d
Example #3
0
 def err(p, fp_ref):
     """Absolute deviation of the corrected KS probability at p[0] from fp_ref.

     Relies on `en` and `ksprob` from the enclosing scope.
     """
     scaled_stat = (en + 0.12 + 0.11 / en) * p[0]
     return abs(fp_ref - ksprob(scaled_stat))
Example #4
0
# Debug dump of the two empirical CDFs (Python-2 print statements; cdfa,
# cdfb, na, nb and ksprob are defined earlier in the original script).
print "cdfa:"
print cdfa
print "cdfb:"
print cdfb

print "cdfa-cdfb"
print cdfa - cdfb

# KS statistic: maximum absolute distance between the two empirical CDFs.
d = np.max(np.absolute(cdfa-cdfb))
#Note: d absolute not signed distance

# Effective sample size for the asymptotic p-value.
en = np.sqrt(na*nb/float(na+nb))

print en

try:
    # p-value from the plain asymptotic distribution; the Stephens-corrected
    # form is left commented out for comparison.
    #prob = ksprob((en+0.12+0.11/en)*d)
    prob = ksprob(d*en)
except:
    # ksprob may fail to converge; fall back to the most conservative p-value.
    prob = 1.0

print d, prob


# Spot-check values of the Kolmogorov survival function.
print ksprob(0.2)
print ksprob(0.4)
print ksprob(0.6)
print ksprob(0.8)

Example #5
0
 def err(p, fp_ref):
     """Return |fp_ref - KS probability| evaluated at the trial statistic p[0].

     Uses `en` and `ksprob` from the enclosing scope.
     """
     corrected = (en + 0.12 + 0.11 / en) * p[0]
     return abs(fp_ref - ksprob(corrected))
Example #6
0
def ks_2samp(data1, data2):
    """
    Computes the Kolmogorov-Smirnov statistic on 2 samples.

    This is a two-sided test for the null hypothesis that 2 independent
    samples are drawn from the same continuous distribution. As with the
    one-sample K-S test the distribution is assumed to be continuous;
    one-sided tests are not implemented. The test uses the two-sided
    asymptotic Kolmogorov-Smirnov distribution.

    TAK: taken from scipy.stats. Extended to return the value for which
    the KS-statistic is maximal.

    Parameters
    ----------
    data1, data2 : sequence of 1-D ndarrays
        two arrays of sample observations assumed to be drawn from a
        continuous distribution, sample sizes can be different

    Returns
    -------
    D : float
        KS statistic; if D is small or the p-value is high we cannot
        reject the hypothesis that the samples share a distribution
    p-value : float
        two-tailed p-value
    x_D : float
        value of the control-variable for which the KS statistic is
        maximal
    """
    data1, data2 = map(np.asarray, (data1, data2))
    # Sample sizes (the original assigned these twice; once suffices).
    n1 = len(data1)
    n2 = len(data2)
    data1 = np.sort(data1)
    data2 = np.sort(data2)
    # Evaluate both empirical CDFs at every pooled sample point.
    data_all = np.concatenate([data1, data2])
    cdf1 = np.searchsorted(data1, data_all, side="right") / (1.0 * n1)
    cdf2 = np.searchsorted(data2, data_all, side="right") / (1.0 * n2)
    # Note: d absolute not signed distance.  |cdf1 - cdf2| is computed
    # once and reused for both max and argmax.
    diffs = np.absolute(cdf1 - cdf2)
    d = np.max(diffs)
    x_d = data_all[np.argmax(diffs)]
    en = np.sqrt(n1 * n2 / float(n1 + n2))
    try:
        # Stephens' small-sample correction to the asymptotic distribution.
        prob = ksprob((en + 0.12 + 0.11 / en) * d)
    except Exception:  # narrowed from bare except: ksprob may fail to converge
        prob = 1.0
    return d, prob, x_d
def ks_2samp(data1, data2, signed=False):
    """
    Computes the Kolmogorov-Smirnov statistic on two samples, as well
    as its two-sided p-value.

    The two-sample Kolmogorov-Smirnov statistic is

        D = sup_x |A(x) - B(x)|,

    where A and B are the empirical distribution functions of `data1`
    and `data2`, respectively. This is a two-sided test for the null
    hypothesis that two independent samples are drawn from the same
    continuous distribution (assumed continuous; one-sided tests are
    not implemented). The test uses the two-sided asymptotic
    Kolmogorov-Smirnov distribution.

    Parameters
    ----------
    data1, data2 : sequence of 1-D ndarrays
        Two arrays of sample observations assumed to be drawn from a
        continuous distribution. The sample sizes can be different.
    signed : {False, True}, optional
        When True, `D` is A(x) - B(x) at x = arg sup_x |A(x) - B(x)|,
        i.e. negative when the second distribution function is the
        largest at the point where the distribution functions differ
        the most. With the default, False, `D` is sup_x |A(x) - B(x)|
        and therefore always in [0, 1].

    Returns
    -------
    D : float
        KS statistic; if D is small or the p-value is high we cannot
        reject the hypothesis that the samples share a distribution
    p-value : float
        two-tailed p-value
    """
    # np.asarray for consistency with the rest of the function
    # (original used a bare `asarray`).
    data1, data2 = map(np.asarray, (data1, data2))
    # Sample sizes (the original assigned these twice; once suffices).
    n1 = len(data1)
    n2 = len(data2)
    data1 = np.sort(data1)
    data2 = np.sort(data2)
    # Evaluate both empirical CDFs at every pooled sample point.
    data_all = np.concatenate([data1, data2])
    cdf1 = np.searchsorted(data1, data_all, side='right') / (1.0 * n1)
    cdf2 = np.searchsorted(data2, data_all, side='right') / (1.0 * n2)
    # Keep the signed difference: its extreme point supplies both the
    # magnitude of the statistic and its sign.
    diff = cdf1 - cdf2
    ind = np.argmax(np.absolute(diff))
    d = diff[ind]
    absd = np.absolute(d)
    en = np.sqrt(n1 * n2 / float(n1 + n2))
    try:
        # Stephens' small-sample correction to the asymptotic distribution.
        prob = ksprob((en + 0.12 + 0.11 / en) * absd)
    except Exception:  # narrowed from bare except: ksprob may fail to converge
        prob = 1.0
    if signed:
        return d, prob
    else:
        return absd, prob
Example #8
0
def ks_2samp(data1, data2, signed=False):
    """
    Computes the Kolmogorov-Smirnov statistic on two samples, as well
    as its two-sided p-value.

    The two-sample Kolmogorov-Smirnov statistic is

        D = sup_x |A(x) - B(x)|,

    where A and B are the empirical distribution functions of `data1`
    and `data2`, respectively. This is a two-sided test for the null
    hypothesis that two independent samples are drawn from the same
    continuous distribution (assumed continuous; one-sided tests are
    not implemented). The test uses the two-sided asymptotic
    Kolmogorov-Smirnov distribution.

    Parameters
    ----------
    data1, data2 : sequence of 1-D ndarrays
        Two arrays of sample observations assumed to be drawn from a
        continuous distribution. The sample sizes can be different.
    signed : {False, True}, optional
        When True, `D` is A(x) - B(x) at x = arg sup_x |A(x) - B(x)|,
        i.e. negative when the second distribution function is the
        largest at the point where the distribution functions differ
        the most. With the default, False, `D` is sup_x |A(x) - B(x)|
        and therefore always in [0, 1].

    Returns
    -------
    D : float
        KS statistic; if D is small or the p-value is high we cannot
        reject the hypothesis that the samples share a distribution
    p-value : float
        two-tailed p-value
    """
    # np.asarray for consistency with the rest of the function
    # (original used a bare `asarray`).
    data1, data2 = map(np.asarray, (data1, data2))
    # Sample sizes (the original assigned these twice; once suffices).
    n1 = len(data1)
    n2 = len(data2)
    data1 = np.sort(data1)
    data2 = np.sort(data2)
    # Evaluate both empirical CDFs at every pooled sample point.
    data_all = np.concatenate([data1, data2])
    cdf1 = np.searchsorted(data1, data_all, side='right') / (1.0 * n1)
    cdf2 = np.searchsorted(data2, data_all, side='right') / (1.0 * n2)
    # Keep the signed difference: its extreme point supplies both the
    # magnitude of the statistic and its sign.
    diff = cdf1 - cdf2
    ind = np.argmax(np.absolute(diff))
    d = diff[ind]
    absd = np.absolute(d)
    en = np.sqrt(n1 * n2 / float(n1 + n2))
    try:
        # Stephens' small-sample correction to the asymptotic distribution.
        prob = ksprob((en + 0.12 + 0.11 / en) * absd)
    except Exception:  # narrowed from bare except: ksprob may fail to converge
        prob = 1.0
    if signed:
        return d, prob
    else:
        return absd, prob