Example #1
import numpy as np

from pingouin import mad  # assumed import path; `mad` is the function under test


def test_mad(self):
    """Test function mad."""
    a = [1.2, 3, 4.5, 2.4, 5, 6.7, 0.4]
    # Compare to Matlab
    assert mad(a, normalize=False) == 1.8
    assert np.round(mad(a), 3) == np.round(1.8 * 1.4826, 3)
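The 1.4826 factor checked above is the usual consistency constant that rescales the MAD so it estimates the standard deviation of normally distributed data; it equals 1 / Φ⁻¹(0.75). A quick sanity check, illustrative only and assuming SciPy is available:

from scipy.stats import norm

# 1 over the 75th percentile of the standard normal distribution ~= 1.4826
print(1 / norm.ppf(0.75))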
Example #2
import numpy as np

from pingouin import mad  # assumed import path; `mad` is the function under test

# NOTE: `w` is a 2-D NumPy array defined at module level in the original test
# file; its shape (5 rows x 10 columns) is implied by the assertions below.


def test_mad(self):
    """Test function mad."""
    from scipy.stats import median_abs_deviation as mad_scp
    a = [1.2, 3, 4.5, 2.4, 5, 6.7, 0.4]
    # Compare to Matlab
    assert mad(a, normalize=False) == 1.8
    assert np.round(mad(a), 3) == np.round(1.8 * 1.4826, 3)
    # Axes handling -- Compare to SciPy
    assert np.allclose(mad_scp(w, scale='normal'), mad(w))  # Axis = 0 (default)
    assert np.allclose(mad_scp(w, scale='normal', axis=1), mad(w, axis=1))
    assert np.allclose(mad_scp(w, scale='normal', axis=None),
                       mad(w, axis=None))
    # Missing values
    # Note that in SciPy 1.3.0, mad(axis=0/1) did not work properly when the
    # data contained NaN, even when passing nan_policy='omit'
    wnan = w.copy()
    wnan[3, 2] = np.nan
    assert np.allclose(
        mad_scp(wnan, scale='normal', axis=None, nan_policy='omit'),
        mad(wnan, axis=None))
    assert mad(wnan, axis=0).size == wnan.shape[1]
    assert mad(wnan, axis=1).size == wnan.shape[0]
    # Check that `w` and `wnan` return almost the same results, i.e. identical
    # except for the row/column that contains the missing value
    assert np.allclose(mad(w, axis=None), mad(wnan, axis=None), atol=1e-02)
    assert sum(mad(w, axis=0) == mad(wnan, axis=0)) == 9  # all columns but one
    assert sum(mad(w, axis=1) == mad(wnan, axis=1)) == 4  # all rows but one
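For reference, the behaviour these assertions exercise (median absolute deviation with an optional 1.4826 normal-consistency scaling, NaN-aware medians, and an axis argument) can be sketched in a few lines. This is an illustrative reimplementation written for this document, not the package's actual mad function; the name mad_sketch is made up:

import numpy as np


def mad_sketch(a, normalize=True, axis=0):
    """Median absolute deviation along `axis`, ignoring NaNs.

    If `normalize` is True, scale by 1.4826 so the result estimates the
    standard deviation of normally distributed data.
    """
    a = np.asarray(a, dtype=float)
    center = np.nanmedian(a, axis=axis, keepdims=True)
    m = np.nanmedian(np.abs(a - center), axis=axis)
    return 1.4826 * m if normalize else m

On the Matlab comparison data above, mad_sketch([1.2, 3, 4.5, 2.4, 5, 6.7, 0.4], normalize=False) returns 1.8.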
Example #3
import numpy as np
from scipy.stats import pearsonr, spearmanr

# `mad` is assumed to be available from elsewhere in the same package
# (it is the function exercised in Examples #1 and #2).


def skipped(x, y, method='spearman'):
    """
    Skipped correlation (Rousselet and Pernet 2012).

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. x and y must be independent.
    method : str
        Method used to compute the correlation after outlier removal. Can be
        either 'spearman' (default) or 'pearson'.

    Returns
    -------
    r : float
        Skipped correlation coefficient.
    pval : float
        Two-tailed p-value.
    outliers : array of bool
        Indicates whether each observation is an outlier (True) or not (False).

    Notes
    -----
    The skipped correlation involves multivariate outlier detection using a
    projection technique (Wilcox, 2004, 2005). First, a robust estimator of
    multivariate location and scatter, for instance the minimum covariance
    determinant estimator (MCD; Rousseeuw, 1984; Rousseeuw and van Driessen,
    1999; Hubert et al., 2008) is computed. Second, data points are
    orthogonally projected on lines joining each of the data points to the
    location estimator. Third, outliers are detected using a robust technique.
    Finally, Spearman (or Pearson, depending on ``method``) correlations are
    computed on the remaining data points and the calculations are adjusted by
    taking into account the dependency among the remaining data points.

    Code inspired by Matlab code from Cyril Pernet and Guillaume
    Rousselet [1]_.

    Requires scikit-learn.

    References
    ----------

    .. [1] Pernet CR, Wilcox R, Rousselet GA. Robust Correlation Analyses:
       False Positive and Power Validation Using a New Open Source Matlab
       Toolbox. Frontiers in Psychology. 2012;3:606.
       doi:10.3389/fpsyg.2012.00606.
    """
    # Check that sklearn is installed
    from pingouin.utils import is_sklearn_installed
    is_sklearn_installed(raise_error=True)
    from scipy.stats import chi2
    from sklearn.covariance import MinCovDet
    X = np.column_stack((x, y))
    center = MinCovDet().fit(X).location_

    # Detect outliers based on robust covariance
    nrows, ncols = X.shape
    gval = np.sqrt(chi2.ppf(0.975, 2))  # cutoff: sqrt of the chi2(df=2) 0.975 quantile

    # Loop over rows
    record = np.zeros(shape=(nrows, nrows))
    for i in np.arange(nrows):
        dis = np.zeros(nrows)
        B = (X[i, :] - center).T
        bot = np.sum(B**2)
        if bot != 0:
            for j in np.arange(nrows):
                A = X[j, :] - center
                # Orthogonal projection of point j onto the line joining
                # point i to the robust center. The cited Matlab code computes
                # norm(A'*B/bot.*B), i.e. a dot product with B, not an
                # element-wise product.
                dis[j] = np.linalg.norm(np.dot(A, B) / bot * B)

            # Apply the MAD-median rule: flag the points whose projected
            # distance exceeds the median by more than gval * MAD
            MAD = mad(dis)
            record[i, :] = dis > (np.median(dis) + gval * MAD)

    # A point is an outlier if it was flagged in at least one projection
    outliers = np.sum(record, axis=0) >= 1

    # Compute correlation on remaining data
    if method == 'spearman':
        r, pval = spearmanr(X[~outliers, 0], X[~outliers, 1])
    else:
        r, pval = pearsonr(X[~outliers, 0], X[~outliers, 1])
    return r, pval, outliers
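A minimal usage sketch of skipped; the synthetic data, seed, and planted outlier below are illustrative and not from the original source (scikit-learn must be installed):

import numpy as np

rng = np.random.RandomState(42)
x = rng.normal(size=30)
y = 0.5 * x + rng.normal(scale=0.5, size=30)
x[0], y[0] = 5.0, -5.0  # plant one obvious bivariate outlier

r, pval, outliers = skipped(x, y, method='spearman')
print(round(r, 3), round(pval, 3), int(outliers.sum()))

The planted point should be flagged in `outliers`, and the Spearman correlation is then computed on the remaining observations only.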