Example #1
def test_perm_pval(self):
    """Test function _perm_pval."""
    np.random.seed(123)
    bootstat = np.random.normal(size=1000)
    x = -2
    up = _perm_pval(bootstat, x, alternative='greater')
    low = _perm_pval(bootstat, x, alternative='less')
    two = _perm_pval(bootstat, x, alternative='two-sided')
    assert up > low
    assert up + low == 1
    assert low < two < up
    x = 2.5
    up = _perm_pval(bootstat, x, alternative='greater')
    low = _perm_pval(bootstat, x, alternative='less')
    two = _perm_pval(bootstat, x, alternative='two-sided')
    assert low > up
    assert up + low == 1
    assert up < two < low
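The asserts above follow from how a permutation p-value is usually defined: the proportion of permuted statistics at least as extreme as the observed value. Below is a minimal, hypothetical sketch of that definition (not pingouin's actual `_perm_pval` code, whose internals may differ). With a continuous bootstrap distribution and no ties, the 'greater' and 'less' p-values sum to exactly 1, which is what the test checks for x = -2 and x = 2.5.

import numpy as np

def perm_pval_sketch(bootstat, estimate, alternative='two-sided'):
    """Hypothetical permutation p-value, for illustration only."""
    bootstat = np.asarray(bootstat)
    n_boot = bootstat.size
    if alternative == 'greater':
        # Proportion of permuted statistics >= the observed estimate
        return np.sum(bootstat >= estimate) / n_boot
    elif alternative == 'less':
        # Proportion of permuted statistics <= the observed estimate
        return np.sum(bootstat <= estimate) / n_boot
    else:
        # Two-sided: proportion of |permuted statistics| >= |observed estimate|
        return np.sum(np.abs(bootstat) >= abs(estimate)) / n_boot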
Example #2
def distance_corr(x, y, tail='greater', n_boot=1000, seed=None):
    """Distance correlation between two arrays.

    Statistical significance (p-value) is evaluated with a permutation test.

    Parameters
    ----------
    x, y : np.ndarray
        1D or 2D input arrays, shape (n_samples, n_features).
        ``x`` and ``y`` must have the same number of samples and must not
        contain missing values.
    tail : str
        Tail for the p-value. Can be ``'greater'`` (default), ``'less'`` or
        ``'two-sided'``. To be consistent with the original R implementation,
        the default is to calculate the one-sided ``'greater'`` p-value.
    n_boot : int or None
        Number of bootstrap (permutation) iterations to perform.
        If None, no bootstrapping is performed and the function
        only returns the distance correlation (no p-value).
        Default is 1000 (thus giving a precision of 0.001).
    seed : int or None
        Random state seed.

    Returns
    -------
    dcor : float
        Sample distance correlation (range from 0 to 1).
    pval : float
        P-value

    Notes
    -----
    From Wikipedia:

    *Distance correlation is a measure of dependence between two paired
    random vectors of arbitrary, not necessarily equal, dimension. The
    distance correlation coefficient is zero if and only if the random vectors
    are independent. Thus, distance correlation measures both linear and
    nonlinear association between two random variables or random vectors.
    This is in contrast to Pearson's correlation, which can only detect
    linear association between two random variables.*

    The distance correlation of two random variables is obtained by
    dividing their distance covariance by the product of their distance
    standard deviations:

    .. math::

        \\text{dCor}(X, Y) = \\frac{\\text{dCov}(X, Y)}
        {\\sqrt{\\text{dVar}(X) \\cdot \\text{dVar}(Y)}}

    where :math:`\\text{dCov}(X, Y)` is the square root of the arithmetic
    average of the product of the double-centered pairwise Euclidean distance
    matrices.

    Note that, in contrast to Pearson's correlation, the distance correlation
    cannot be negative, i.e. :math:`0 \\leq \\text{dCor} \\leq 1`.

    Results have been tested against the 'energy' R package.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Distance_correlation

    .. [2] Székely, G. J., Rizzo, M. L., & Bakirov, N. K. (2007).
           Measuring and testing dependence by correlation of distances.
           The Annals of Statistics, 35(6), 2769-2794.

    .. [3] https://gist.github.com/satra/aa3d19a12b74e9ab7941

    .. [4] https://gist.github.com/wladston/c931b1495184fbb99bec

    .. [5] https://cran.r-project.org/web/packages/energy/energy.pdf

    Examples
    --------
    1. With two 1D vectors

    >>> from pingouin import distance_corr
    >>> a = [1, 2, 3, 4, 5]
    >>> b = [1, 2, 9, 4, 4]
    >>> distance_corr(a, b, seed=9)
    (0.7626762424168667, 0.312)

    2. With two 2D arrays and no p-value

    >>> import numpy as np
    >>> np.random.seed(123)
    >>> from pingouin import distance_corr
    >>> a = np.random.random((10, 10))
    >>> b = np.random.random((10, 10))
    >>> distance_corr(a, b, n_boot=None)
    0.8799633012275321
    """
    assert tail in ['greater', 'less', 'two-sided'], 'Wrong tail argument.'
    x = np.asarray(x)
    y = np.asarray(y)
    # Check for NaN values
    if any([np.isnan(np.min(x)), np.isnan(np.min(y))]):
        raise ValueError('Input arrays must not contain NaN values.')
    if x.ndim == 1:
        x = x[:, None]
    if y.ndim == 1:
        y = y[:, None]
    assert x.shape[0] == y.shape[0], 'x and y must have same number of samples'

    # Extract number of samples
    n = x.shape[0]
    n2 = n**2

    # Process first array to avoid redundancy when performing bootstrap
    a = squareform(pdist(x, metric='euclidean'))
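    # Double centering: subtract the row and column means, then add back the grand mean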
    A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean()
    dcov2_xx = np.vdot(A, A) / n2

    # Process second array and compute final distance correlation
    dcor = _dcorr(y, n2, A, dcov2_xx)

    # Compute the p-value using a permutation (bootstrap) procedure
    if n_boot is not None and n_boot > 1:
        # Define random seed and permutation
        rng = np.random.RandomState(seed)
        bootsam = rng.random_sample((n_boot, n)).argsort(axis=1)
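        # Each row of bootsam is a random permutation of the sample indices
        # (argsort of i.i.d. uniform draws)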
        bootstat = np.empty(n_boot)
        for i in range(n_boot):
            bootstat[i] = _dcorr(y[bootsam[i, :]], n2, A, dcov2_xx)

        pval = _perm_pval(bootstat, dcor, tail=tail)
        return dcor, pval
    else:
        return dcor
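The formula in the Notes section can be checked numerically. The sketch below computes the distance correlation straight from the definition (double-centre both Euclidean distance matrices, average the element-wise products, then normalise). The helper names are illustrative and not part of pingouin; on the 1D docstring example it should reproduce the coefficient of about 0.763 returned by distance_corr.

import numpy as np
from scipy.spatial.distance import pdist, squareform

def _double_center(d):
    # Subtract the row and column means, add back the grand mean
    return d - d.mean(axis=0)[None, :] - d.mean(axis=1)[:, None] + d.mean()

def dcorr_sketch(x, y):
    """Distance correlation computed from the definition (illustration only)."""
    x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    if x.ndim == 1:
        x = x[:, None]
    if y.ndim == 1:
        y = y[:, None]
    # Double-centered pairwise Euclidean distance matrices
    A = _double_center(squareform(pdist(x, metric='euclidean')))
    B = _double_center(squareform(pdist(y, metric='euclidean')))
    # Squared distance covariance / variances: mean of element-wise products
    dcov2_xy = (A * B).mean()
    dcov2_xx = (A * A).mean()
    dcov2_yy = (B * B).mean()
    # dCor = dCov(X, Y) / sqrt(dVar(X) * dVar(Y))
    return np.sqrt(dcov2_xy) / np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy))

# Example (matches the docstring's first example up to rounding):
# dcorr_sketch([1, 2, 3, 4, 5], [1, 2, 9, 4, 4])  ->  ~0.7627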