Exemplo n.º 1
0
def compute_effsize(x, y, paired=False, eftype='cohen'):
    """Calculate effect size between two set of observations.

    Parameters
    ----------
    x : np.array or list
        First set of observations.
    y : np.array or list
        Second set of observations.
    paired : boolean
        If True, uses Cohen d-avg formula to correct for repeated measurements
        (see Notes).
    eftype : string
        Desired output effect size.
        Available methods are:

        * ``'none'``: no effect size
        * ``'cohen'``: Unbiased Cohen d
        * ``'hedges'``: Hedges g
        * ``'glass'``: Glass delta
        * ``'r'``: correlation coefficient
        * ``'eta-square'``: Eta-square
        * ``'odds-ratio'``: Odds ratio
        * ``'AUC'``: Area Under the Curve
        * ``'CLES'``: Common Language Effect Size

    Returns
    -------
    ef : float
        Effect size

    See Also
    --------
    convert_effsize : Conversion between effect sizes.
    compute_effsize_from_t : Convert a T-statistic to an effect size.

    Notes
    -----
    Missing values are automatically removed from the data. If ``x`` and ``y``
    are paired, the entire row is removed.

    If ``x`` and ``y`` are independent, the Cohen :math:`d` is:

    .. math::

        d = \\frac{\\overline{X} - \\overline{Y}}
        {\\sqrt{\\frac{(n_{1} - 1)\\sigma_{1}^{2} + (n_{2} - 1)
        \\sigma_{2}^{2}}{n1 + n2 - 2}}}

    If ``x`` and ``y`` are paired, the Cohen :math:`d_{avg}` is computed:

    .. math::

        d_{avg} = \\frac{\\overline{X} - \\overline{Y}}
        {\\sqrt{\\frac{(\\sigma_1^2 + \\sigma_2^2)}{2}}}

    The Cohen’s d is a biased estimate of the population effect size,
    especially for small samples (n < 20). It is often preferable
    to use the corrected Hedges :math:`g` instead:

    .. math:: g = d \\times (1 - \\frac{3}{4(n_1 + n_2) - 9})

    The Glass :math:`\\delta` is calculated using the group with the lowest
    variance as the control group:

    .. math::

        \\delta = \\frac{\\overline{X} -
        \\overline{Y}}{\\sigma^2_{\\text{control}}}

    The common language effect size is the proportion of pairs where ``x`` is
    higher than ``y`` (calculated with a brute-force approach where
    each observation of ``x`` is paired to each observation of ``y``,
    see :py:func:`pingouin.wilcoxon` for more details):

    .. math:: \\text{CL} = P(X > Y) + .5 \\times P(X = Y)

    For other effect sizes, Pingouin will first calculate a Cohen :math:`d` and
    then use the :py:func:`pingouin.convert_effsize` to convert to the desired
    effect size.

    References
    ----------
    * Lakens, D., 2013. Calculating and reporting effect sizes to
      facilitate cumulative science: a practical primer for t-tests and
      ANOVAs. Front. Psychol. 4, 863. https://doi.org/10.3389/fpsyg.2013.00863

    * Cumming, Geoff. Understanding the new statistics: Effect sizes,
      confidence intervals, and meta-analysis. Routledge, 2013.

    * https://osf.io/vbdah/

    Examples
    --------
    1. Cohen d from two independent samples.

    >>> import numpy as np
    >>> import pingouin as pg
    >>> x = [1, 2, 3, 4]
    >>> y = [3, 4, 5, 6, 7]
    >>> pg.compute_effsize(x, y, paired=False, eftype='cohen')
    -1.707825127659933

    The sign of the Cohen d will be opposite if we reverse the order of
    ``x`` and ``y``:

    >>> pg.compute_effsize(y, x, paired=False, eftype='cohen')
    1.707825127659933

    2. Hedges g from two paired samples.

    >>> x = [1, 2, 3, 4, 5, 6, 7]
    >>> y = [1, 3, 5, 7, 9, 11, 13]
    >>> pg.compute_effsize(x, y, paired=True, eftype='hedges')
    -0.8222477210374874

    3. Glass delta from two independent samples. The group with the lowest
    variance will automatically be selected as the control.

    >>> pg.compute_effsize(x, y, paired=False, eftype='glass')
    -1.3887301496588271

    4. Common Language Effect Size.

    >>> pg.compute_effsize(x, y, eftype='cles')
    0.2857142857142857

    In other words, there are ~29% of pairs where ``x`` is higher than ``y``,
    which means that there are ~71% of pairs where ``x`` is *lower* than ``y``.
    This can be easily verified by changing the order of ``x`` and ``y``:

    >>> pg.compute_effsize(y, x, eftype='cles')
    0.7142857142857143
    """
    # Check arguments
    if not _check_eftype(eftype):
        err = "Could not interpret input '{}'".format(eftype)
        raise ValueError(err)

    x = np.asarray(x)
    y = np.asarray(y)

    if x.size != y.size and paired:
        warnings.warn("x and y have unequal sizes. Switching to "
                      "paired == False.")
        paired = False

    # Remove rows with missing values
    x, y = remove_na(x, y, paired=paired)
    nx, ny = x.size, y.size

    if ny == 1:
        # Case 1: One-sample Test
        d = (x.mean() - y) / x.std(ddof=1)
        return d
    if eftype.lower() == 'glass':
        # Find group with lowest variance
        sd_control = np.min([x.std(ddof=1), y.std(ddof=1)])
        d = (x.mean() - y.mean()) / sd_control
        return d
    elif eftype.lower() == 'r':
        # Return correlation coefficient (useful for CI bootstrapping)
        from scipy.stats import pearsonr
        r, _ = pearsonr(x, y)
        return r
    elif eftype.lower() == 'cles':
        # Compute exact CLES (see pingouin.wilcoxon)
        diff = x[:, None] - y
        return np.where(diff == 0, 0.5, diff > 0).mean()
    else:
        # Test equality of variance of data with a stringent threshold
        # equal_var, p = homoscedasticity(x, y, alpha=.001)
        # if not equal_var:
        #     print('Unequal variances (p<.001). You should report',
        #           'Glass delta instead.')

        # Compute unbiased Cohen's d effect size
        if not paired:
            # https://en.wikipedia.org/wiki/Effect_size
            dof = nx + ny - 2
            poolsd = np.sqrt(
                ((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / dof)
            d = (x.mean() - y.mean()) / poolsd
        else:
            # Report Cohen d-avg (Cumming 2012; Lakens 2013)
            # Careful, the formula in Lakens 2013 is wrong. Updated in Pingouin
            # v0.3.4 to use the formula provided by Cummings 2012.
            # Before that the denominator was just (SD1 + SD2) / 2
            d = (x.mean() - y.mean()) / np.sqrt(
                (x.var(ddof=1) + y.var(ddof=1)) / 2)
        return convert_effsize(d, 'cohen', eftype, nx=nx, ny=ny)
Exemplo n.º 2
0
def corr(x, y, tail='two-sided', method='pearson'):
    """(Robust) correlation between two variables.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. x and y must be independent.
    tail : string
        Specify whether to return 'one-sided' or 'two-sided' p-value.
    method : string
        Specify which method to use for the computation of the correlation
        coefficient. Available methods are ::

        'pearson' : Pearson product-moment correlation
        'spearman' : Spearman rank-order correlation
        'kendall' : Kendall’s tau (ordinal data)
        'percbend' : percentage bend correlation (robust)
        'shepherd' : Shepherd's pi correlation (robust Spearman)
        'skipped' : skipped correlation (robust Spearman, requires sklearn)

    Returns
    -------
    stats : pandas DataFrame
        Test summary ::

        'n' : Sample size (after NaN removal)
        'outliers' : number of outliers (only for 'shepherd' or 'skipped')
        'r' : Correlation coefficient
        'CI95' : 95% parametric confidence intervals
        'r2' : R-squared
        'adj_r2' : Adjusted R-squared
        'p-val' : one or two tailed p-value
        'BF10' : Bayes Factor of the alternative hypothesis (Pearson only)
        'power' : achieved power of the test (= 1 - type II error).

    See also
    --------
    pairwise_corr : Pairwise correlation between columns of a pandas DataFrame
    partial_corr : Partial correlation

    Notes
    -----
    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed. Correlations of -1 or +1 imply
    an exact linear relationship.

    The Spearman correlation is a nonparametric measure of the monotonicity of
    the relationship between two datasets. Unlike the Pearson correlation,
    the Spearman correlation does not assume that both datasets are normally
    distributed. Correlations of -1 or +1 imply an exact monotonic
    relationship.

    Kendall’s tau is a measure of the correspondence between two rankings.
    Values close to 1 indicate strong agreement, values close to -1 indicate
    strong disagreement.

    The percentage bend correlation [1]_ is a robust method that
    protects against univariate outliers.

    The Shepherd's pi [2]_ and skipped [3]_, [4]_ correlations are both robust
    methods that returns the Spearman's rho after bivariate outliers removal.
    Note that the skipped correlation requires that the scikit-learn
    package is installed (for computing the minimum covariance determinant).

    Please note that rows with NaN are automatically removed.

    If ``method='pearson'``, The Bayes Factor is calculated using the
    :py:func:`pingouin.bayesfactor_pearson` function.

    References
    ----------
    .. [1] Wilcox, R.R., 1994. The percentage bend correlation coefficient.
       Psychometrika 59, 601–616. https://doi.org/10.1007/BF02294395

    .. [2] Schwarzkopf, D.S., De Haas, B., Rees, G., 2012. Better ways to
       improve standards in brain-behavior correlation analysis. Front.
       Hum. Neurosci. 6, 200. https://doi.org/10.3389/fnhum.2012.00200

    .. [3] Rousselet, G.A., Pernet, C.R., 2012. Improving standards in
       brain-behavior correlation analyses. Front. Hum. Neurosci. 6, 119.
       https://doi.org/10.3389/fnhum.2012.00119

    .. [4] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust correlation
       analyses: false positive and power validation using a new open
       source matlab toolbox. Front. Psychol. 3, 606.
       https://doi.org/10.3389/fpsyg.2012.00606

    Examples
    --------
    1. Pearson correlation

    >>> import numpy as np
    >>> # Generate random correlated samples
    >>> np.random.seed(123)
    >>> mean, cov = [4, 6], [(1, .5), (.5, 1)]
    >>> x, y = np.random.multivariate_normal(mean, cov, 30).T
    >>> # Compute Pearson correlation
    >>> from pingouin import corr
    >>> corr(x, y)
              n      r         CI95%     r2  adj_r2     p-val  BF10  power
    pearson  30  0.491  [0.16, 0.72]  0.242   0.185  0.005813  8.55  0.809

    2. Pearson correlation with two outliers

    >>> x[3], y[5] = 12, -8
    >>> corr(x, y)
              n      r          CI95%     r2  adj_r2     p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439148  0.302  0.121

    3. Spearman correlation

    >>> corr(x, y, method="spearman")
               n      r         CI95%     r2  adj_r2     p-val  power
    spearman  30  0.401  [0.05, 0.67]  0.161   0.099  0.028034   0.61

    4. Percentage bend correlation (robust)

    >>> corr(x, y, method='percbend')
               n      r         CI95%     r2  adj_r2     p-val  power
    percbend  30  0.389  [0.03, 0.66]  0.151   0.089  0.033508  0.581

    5. Shepherd's pi correlation (robust)

    >>> corr(x, y, method='shepherd')
               n  outliers      r         CI95%     r2  adj_r2     p-val  power
    shepherd  30         2  0.437  [0.09, 0.69]  0.191   0.131  0.020128  0.694

    6. Skipped spearman correlation (robust)

    >>> corr(x, y, method='skipped')
              n  outliers      r         CI95%     r2  adj_r2     p-val  power
    skipped  30         2  0.437  [0.09, 0.69]  0.191   0.131  0.020128  0.694

    7. One-tailed Pearson correlation

    >>> corr(x, y, tail="one-sided", method='pearson')
              n      r          CI95%     r2  adj_r2     p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.219574  0.467  0.194

    8. Using columns of a pandas dataframe

    >>> import pandas as pd
    >>> data = pd.DataFrame({'x': x, 'y': y})
    >>> corr(data['x'], data['y'])
              n      r          CI95%     r2  adj_r2     p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439148  0.302  0.121
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Check size
    if x.size != y.size:
        raise ValueError('x and y must have the same length.')

    # Remove NA
    x, y = remove_na(x, y, paired=True)
    nx = x.size

    # Compute correlation coefficient
    if method == 'pearson':
        r, pval = pearsonr(x, y)
    elif method == 'spearman':
        r, pval = spearmanr(x, y)
    elif method == 'kendall':
        r, pval = kendalltau(x, y)
    elif method == 'percbend':
        r, pval = percbend(x, y)
    elif method == 'shepherd':
        r, pval, outliers = shepherd(x, y)
    elif method == 'skipped':
        r, pval, outliers = skipped(x, y, method='spearman')
    else:
        raise ValueError('Method not recognized.')

    assert not np.isnan(r), 'Correlation returned NaN. Check your data.'

    # Compute r2 and adj_r2
    r2 = r**2
    adj_r2 = 1 - (((1 - r2) * (nx - 1)) / (nx - 3))

    # Compute the parametric 95% confidence interval and power
    if r2 < 1:
        ci = compute_esci(stat=r, nx=nx, ny=nx, eftype='r')
        pr = round(power_corr(r=r, n=nx, power=None, alpha=0.05, tail=tail), 3)
    else:
        ci = [1., 1.]
        pr = np.inf

    # Create dictionnary
    stats = {
        'n': nx,
        'r': round(r, 3),
        'r2': round(r2, 3),
        'adj_r2': round(adj_r2, 3),
        'CI95%': [ci],
        'p-val': pval if tail == 'two-sided' else .5 * pval,
        'power': pr
    }

    if method in ['shepherd', 'skipped']:
        stats['outliers'] = sum(outliers)

    # Compute the BF10 for Pearson correlation only
    if method == 'pearson':
        if r2 < 1:
            stats['BF10'] = bayesfactor_pearson(r, nx, tail=tail)
        else:
            stats['BF10'] = str(np.inf)

    # Convert to DataFrame
    stats = pd.DataFrame.from_records(stats, index=[method])

    # Define order
    col_keep = [
        'n', 'outliers', 'r', 'CI95%', 'r2', 'adj_r2', 'p-val', 'BF10', 'power'
    ]
    col_order = [k for k in col_keep if k in stats.keys().tolist()]
    return stats[col_order]
Exemplo n.º 3
0
def multivariate_ttest(X, Y=None, paired=False):
    """Hotelling T-squared test (= multivariate T-test)

    Parameters
    ----------
    X : np.array
        First data matrix of shape (n_samples, n_features).
    Y : np.array or None
        Second data matrix of shape (n_samples, n_features). If ``Y`` is a 1D
        array of shape (n_features), a one-sample test is performed where the
        null hypothesis is defined in ``Y``. If ``Y`` is None, a one-sample
        is performed against np.zeros(n_features).
    paired : boolean
        Specify whether the two observations are related (i.e. repeated
        measures) or independent. If ``paired`` is True, ``X`` and ``Y`` must
        have exactly the same shape.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'T2'``: T-squared value
        * ``'F'``: F-value
        * ``'df1'``: first degree of freedom
        * ``'df2'``: second degree of freedom
        * ``'p-val'``: p-value

    See Also
    --------
    multivariate_normality : Multivariate normality test.
    ttest : Univariate T-test.

    Notes
    -----
    The Hotelling 's T-squared test [1]_ is the multivariate counterpart of
    the T-test.

    Rows with missing values are automatically removed using the
    :py:func:`remove_na` function.

    Tested against the `Hotelling
    <https://cran.r-project.org/web/packages/Hotelling/Hotelling.pdf>`_ R
    package.

    References
    ----------
    .. [1] Hotelling, H. The Generalization of Student's Ratio. Ann. Math.
           Statist. 2 (1931), no. 3, 360--378.

    See also http://www.real-statistics.com/multivariate-statistics/

    Examples
    --------
    Two-sample independent Hotelling T-squared test

    >>> import pingouin as pg
    >>> data = pg.read_dataset('multivariate')
    >>> dvs = ['Fever', 'Pressure', 'Aches']
    >>> X = data[data['Condition'] == 'Drug'][dvs]
    >>> Y = data[data['Condition'] == 'Placebo'][dvs]
    >>> pg.multivariate_ttest(X, Y)
                     T2         F  df1  df2      pval
    hotelling  4.228679  1.326644    3   32  0.282898

    Two-sample paired Hotelling T-squared test

    >>> pg.multivariate_ttest(X, Y, paired=True)
                     T2         F  df1  df2      pval
    hotelling  4.468456  1.314252    3   15  0.306542

    One-sample Hotelling T-squared test with a specified null hypothesis

    >>> null_hypothesis_means = [37.5, 70, 5]
    >>> pg.multivariate_ttest(X, Y=null_hypothesis_means)
                       T2          F  df1  df2          pval
    hotelling  253.230991  74.479703    3   15  3.081281e-09
    """
    from scipy.stats import f
    x = np.asarray(X)
    assert x.ndim == 2, 'x must be of shape (n_samples, n_features)'

    if Y is None:
        y = np.zeros(x.shape[1])
        # Remove rows with missing values in x
        x = x[~np.isnan(x).any(axis=1)]
    else:
        nx, kx = x.shape
        y = np.asarray(Y)
        assert y.ndim in [1, 2], 'Y must be 1D or 2D.'
        if y.ndim == 1:
            # One sample with specified null
            assert y.size == kx
        else:
            # Two-sample
            err = 'X and Y must have the same number of features (= columns).'
            assert y.shape[1] == kx, err
            if paired:
                err = 'X and Y must have the same number of rows if paired.'
                assert y.shape[0] == nx, err
        # Remove rows with missing values in both x and y
        x, y = remove_na(x, y, paired=paired, axis='rows')

    # Shape of arrays
    nx, k = x.shape
    ny = y.shape[0]
    assert nx >= 5, 'At least five samples are required.'

    if y.ndim == 1 or paired is True:
        n = nx
        if y.ndim == 1:
            # One sample test
            cov = np.cov(x, rowvar=False)
            diff = x.mean(0) - y
        else:
            # Paired two sample
            cov = np.cov(x - y, rowvar=False)
            diff = x.mean(0) - y.mean(0)
        inv_cov = np.linalg.pinv(cov)
        t2 = (diff @ inv_cov) @ diff * n
    else:
        n = nx + ny - 1
        x_cov = np.cov(x, rowvar=False)
        y_cov = np.cov(y, rowvar=False)
        pooled_cov = ((nx - 1) * x_cov + (ny - 1) * y_cov) / (n - 1)
        inv_cov = np.linalg.pinv((1 / nx + 1 / ny) * pooled_cov)
        diff = x.mean(0) - y.mean(0)
        t2 = (diff @ inv_cov) @ diff

    # F-value, degrees of freedom and p-value
    fval = t2 * (n - k) / (k * (n - 1))
    df1 = k
    df2 = n - k
    pval = f.sf(fval, df1, df2)

    # Create output dictionnary
    stats = {'T2': t2, 'F': fval, 'df1': df1, 'df2': df2, 'pval': pval}
    stats = pd.DataFrame(stats, index=['hotelling'])
    return _postprocess_dataframe(stats)
Exemplo n.º 4
0
 def test_remove_na(self):
     """Test function remove_na."""
     x = [6.4, 3.2, 4.5, np.nan]
     y = [3.5, 7.2, 8.4, 3.2]
     z = [2.3, np.nan, 5.2, 4.6]
     remove_na(x, y, paired=True)
     remove_na(x, y, paired=False)
     remove_na(y, x, paired=False)
     x_out, _ = remove_na(x, z, paired=True)
     assert np.allclose(x_out, [6.4, 4.5])
     # When y is None
     remove_na(x, None)
     remove_na(x, 4)
     # With 2D arrays
     x = np.array([[4, 2], [4, np.nan], [7, 6]])
     y = np.array([[6, np.nan], [3, 2], [2, 2]])
     x_nan, y_nan = remove_na(x, y, paired=False)
     assert np.allclose(x_nan, [[4., 2.], [7., 6.]])
     assert np.allclose(y_nan, [[3., 2.], [2., 2.]])
     x_nan, y_nan = remove_na(x, y, paired=True)
     assert np.allclose(x_nan, [[7., 6.]])
     assert np.allclose(y_nan, [[2., 2.]])
     x_nan, y_nan = remove_na(x, y, paired=False, axis='columns')
     assert np.allclose(x_nan, [[4.], [4.], [7.]])
     assert np.allclose(y_nan, [[6.], [3.], [2.]])
     # When y is None
     remove_na(x, None, paired=False)
     # When x or y is an empty list
     # See https://github.com/raphaelvallat/pingouin/issues/222
     with pytest.raises(AssertionError):
         remove_na(x=[], y=0)
     with pytest.raises(AssertionError):
         remove_na(x, y=[])
Exemplo n.º 5
0
 def test_remove_na(self):
     """Test function remove_na."""
     x = [6.4, 3.2, 4.5, np.nan]
     y = [3.5, 7.2, 8.4, 3.2]
     z = [2.3, np.nan, 5.2, 4.6]
     remove_na(x, y, paired=True)
     remove_na(x, y, paired=False)
     remove_na(y, x, paired=False)
     x_out, _ = remove_na(x, z, paired=True)
     assert np.allclose(x_out, [6.4, 4.5])
     # When y is None
     remove_na(x, None)
     remove_na(x, 4)
     # With 2D arrays
     x = np.array([[4, 2], [4, np.nan], [7, 6]])
     y = np.array([[6, np.nan], [3, 2], [2, 2]])
     x_nan, y_nan = remove_na(x, y, paired=False)
     assert np.allclose(x_nan, [[4., 2.], [7., 6.]])
     assert np.allclose(y_nan, [[3., 2.], [2., 2.]])
     x_nan, y_nan = remove_na(x, y, paired=True)
     assert np.allclose(x_nan, [[7., 6.]])
     assert np.allclose(y_nan, [[2., 2.]])
     x_nan, y_nan = remove_na(x, y, paired=False, axis='columns')
     assert np.allclose(x_nan, [[4.], [4.], [7.]])
     assert np.allclose(y_nan, [[6.], [3.], [2.]])
     # When y is None
     remove_na(x, None, paired=False)
Exemplo n.º 6
0
def compute_effsize(x, y, paired=False, eftype='cohen'):
    """Calculate effect size between two set of observations.

    Parameters
    ----------
    x : np.array or list
        First set of observations.
    y : np.array or list
        Second set of observations.
    paired : boolean
        If True, uses Cohen d-avg formula to correct for repeated measurements
        (Cumming 2012)
    eftype : string
        Desired output effect size.
        Available methods are ::

        'none' : no effect size
        'cohen' : Unbiased Cohen d
        'hedges' : Hedges g
        'glass': Glass delta
        'r' : correlation coefficient
        'eta-square' : Eta-square
        'odds-ratio' : Odds ratio
        'AUC' : Area Under the Curve
        'CLES' : Common language effect size

    Returns
    -------
    ef : float
        Effect size

    See Also
    --------
    convert_effsize : Conversion between effect sizes.
    compute_effsize_from_t : Convert a T-statistic to an effect size.

    Notes
    -----
    Missing values are automatically removed from the data. If ``x`` and ``y``
    are paired, the entire row is removed.

    If ``x`` and ``y`` are independent, the Cohen's d is:

    .. math::

        d = \\frac{\\overline{X} - \\overline{Y}}
        {\\sqrt{\\frac{(n_{1} - 1)\\sigma_{1}^{2} + (n_{2} - 1)
        \\sigma_{2}^{2}}{n1 + n2 - 2}}}

    If ``x`` and ``y`` are paired, the Cohen :math:`d_{avg}` is computed:

    .. math::

        d_{avg} = \\frac{\\overline{X} - \\overline{Y}}
        {0.5 * (\\sigma_1 + \\sigma_2)}

    The Cohen’s d is a biased estimate of the population effect size,
    especially for small samples (n < 20). It is often preferable
    to use the corrected effect size, or Hedges’g, instead:

    .. math:: g = d * (1 - \\frac{3}{4(n_1 + n_2) - 9})

    If eftype = 'glass', the Glass :math:`\\delta` is reported, using the
    group with the lowest variance as the control group:

    .. math::

        \\delta = \\frac{\\overline{X} - \\overline{Y}}{\\sigma_{control}}

    References
    ----------
    .. [1] Lakens, D., 2013. Calculating and reporting effect sizes to
       facilitate cumulative science: a practical primer for t-tests and
       ANOVAs. Front. Psychol. 4, 863. https://doi.org/10.3389/fpsyg.2013.00863

    .. [2] Cumming, Geoff. Understanding the new statistics: Effect sizes,
           confidence intervals, and meta-analysis. Routledge, 2013.

    Examples
    --------
    1. Compute Cohen d from two independent set of observations.

    >>> import numpy as np
    >>> from pingouin import compute_effsize
    >>> np.random.seed(123)
    >>> x = np.random.normal(2, size=100)
    >>> y = np.random.normal(2.3, size=95)
    >>> d = compute_effsize(x=x, y=y, eftype='cohen', paired=False)
    >>> print(d)
    -0.2835170152506578

    2. Compute Hedges g from two paired set of observations.

    >>> import numpy as np
    >>> from pingouin import compute_effsize
    >>> x = [1.62, 2.21, 3.79, 1.66, 1.86, 1.87, 4.51, 4.49, 3.3 , 2.69]
    >>> y = [0.91, 3., 2.28, 0.49, 1.42, 3.65, -0.43, 1.57, 3.27, 1.13]
    >>> g = compute_effsize(x=x, y=y, eftype='hedges', paired=True)
    >>> print(g)
    0.8370985097811404

    3. Compute Glass delta from two independent set of observations. The group
       with the lowest variance will automatically be selected as the control.

    >>> import numpy as np
    >>> from pingouin import compute_effsize
    >>> np.random.seed(123)
    >>> x = np.random.normal(2, scale=1, size=50)
    >>> y = np.random.normal(2, scale=2, size=45)
    >>> d = compute_effsize(x=x, y=y, eftype='glass')
    >>> print(d)
    -0.1170721973604153
    """
    # Check arguments
    if not _check_eftype(eftype):
        err = "Could not interpret input '{}'".format(eftype)
        raise ValueError(err)

    x = np.asarray(x)
    y = np.asarray(y)

    if x.size != y.size and paired:
        warnings.warn("x and y have unequal sizes. Switching to "
                      "paired == False.")
        paired = False

    # Remove rows with missing values
    x, y = remove_na(x, y, paired=paired)
    nx, ny = x.size, y.size

    if ny == 1:
        # Case 1: One-sample Test
        d = (x.mean() - y) / x.std(ddof=1)
        return d
    if eftype.lower() == 'glass':
        # Find group with lowest variance
        sd_control = np.min([x.std(ddof=1), y.std(ddof=1)])
        d = (x.mean() - y.mean()) / sd_control
        return d
    elif eftype.lower() == 'r':
        # Return correlation coefficient (useful for CI bootstrapping)
        from scipy.stats import pearsonr
        r, _ = pearsonr(x, y)
        return r
    elif eftype.lower() == 'cles':
        # Compute exact CLES
        diff = x[:, None] - y
        return max((diff < 0).sum(), (diff > 0).sum()) / diff.size
    else:
        # Test equality of variance of data with a stringent threshold
        # equal_var, p = homoscedasticity(x, y, alpha=.001)
        # if not equal_var:
        #     print('Unequal variances (p<.001). You should report',
        #           'Glass delta instead.')

        # Compute unbiased Cohen's d effect size
        if not paired:
            # https://en.wikipedia.org/wiki/Effect_size
            dof = nx + ny - 2
            poolsd = np.sqrt(((nx - 1) * x.var(ddof=1)
                              + (ny - 1) * y.var(ddof=1)) / dof)
            d = (x.mean() - y.mean()) / poolsd
        else:
            # Report Cohen d-avg (Cumming 2012; Lakens 2013)
            d = (x.mean() - y.mean()) / (.5 * (x.std(ddof=1)
                                               + y.std(ddof=1)))
        return convert_effsize(d, 'cohen', eftype, nx=nx, ny=ny)
Exemplo n.º 7
0
def corr(x, y, tail='two-sided', method='pearson'):
    """(Robust) correlation between two variables.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. ``x`` and ``y`` must be
        independent.
    tail : string
        Specify whether to return ``'one-sided'`` or ``'two-sided'`` p-value.
        Note that the former are simply half the latter.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\\rho` rank-order correlation
        * ``'kendall'``: Kendall's :math:`\\tau` correlation
          (for ordinal data)
        * ``'bicor'``: Biweight midcorrelation (robust)
        * ``'percbend'``: Percentage bend correlation (robust)
        * ``'shepherd'``: Shepherd's pi correlation (robust)
        * ``'skipped'``: Skipped correlation (robust)

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'n'``: Sample size (after removal of missing values)
        * ``'outliers'``: number of outliers, only if a robust method was used
        * ``'r'``: Correlation coefficient
        * ``'CI95'``: 95% parametric confidence intervals around :math:`r`
        * ``'r2'``: R-squared (:math:`= r^2`)
        * ``'adj_r2'``: Adjusted R-squared
        * ``'p-val'``: tail of the test
        * ``'BF10'``: Bayes Factor of the alternative hypothesis
          (only for Pearson correlation)
        * ``'power'``: achieved power of the test (= 1 - type II error).

    See also
    --------
    pairwise_corr : Pairwise correlation between columns of a pandas DataFrame
    partial_corr : Partial correlation
    rm_corr : Repeated measures correlation

    Notes
    -----
    The `Pearson correlation coefficient
    <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
    measures the linear relationship between two datasets. Strictly speaking,
    Pearson's correlation requires that each dataset be normally distributed.
    Correlations of -1 or +1 imply a perfect negative and positive linear
    relationship, respectively, with 0 indicating the absence of association.

    .. math::
        r_{xy} = \\frac{\\sum_i(x_i - \\bar{x})(y_i - \\bar{y})}
        {\\sqrt{\\sum_i(x_i - \\bar{x})^2} \\sqrt{\\sum_i(y_i - \\bar{y})^2}}
        = \\frac{\\text{cov}(x, y)}{\\sigma_x \\sigma_y}

    where :math:`\\text{cov}` is the sample covariance and :math:`\\sigma`
    is the sample standard deviation.

    If ``method='pearson'``, The Bayes Factor is calculated using the
    :py:func:`pingouin.bayesfactor_pearson` function.

    The `Spearman correlation coefficient
    <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
    is a non-parametric measure of the monotonicity of the relationship between
    two datasets. Unlike the Pearson correlation, the Spearman correlation does
    not assume that both datasets are normally distributed. Correlations of -1
    or +1 imply an exact negative and positive monotonic relationship,
    respectively. Mathematically, the Spearman correlation coefficient is
    defined as the Pearson correlation coefficient between the
    `rank variables <https://en.wikipedia.org/wiki/Ranking>`_.

    The `Kendall correlation coefficient
    <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
    is a measure of the correspondence between two rankings. Values also range
    from -1 (perfect disagreement) to 1 (perfect agreement), with 0 indicating
    the absence of association. Consistent with
    :py:func:`scipy.stats.kendalltau`, Pingouin returns the Tau-b coefficient,
    which adjusts for ties:

    .. math:: \\tau_B = \\frac{(P - Q)}{\\sqrt{(P + Q + T) (P + Q + U)}}

    where :math:`P` is the number of concordant pairs, :math:`Q` the number of
    discordand pairs, :math:`T` the number of ties in x, and :math:`U`
    the number of ties in y.

    The `biweight midcorrelation
    <https://en.wikipedia.org/wiki/Biweight_midcorrelation>`_ and
    percentage bend correlation [1]_ are both robust methods that
    protects against *univariate* outliers by down-weighting observations that
    deviate too much from the median.

    The Shepherd pi [2]_ correlation and skipped [3]_, [4]_ correlation are
    both robust methods that returns the Spearman correlation coefficient after
    removing *bivariate* outliers. Briefly, the Shepherd pi uses a
    bootstrapping of the Mahalanobis distance to identify outliers, while the
    skipped correlation is based on the minimum covariance determinant
    (which requires scikit-learn). Note that these two methods are
    significantly slower than the previous ones.

    .. important:: Please note that rows with missing values (NaN) are
        automatically removed.

    References
    ----------
    .. [1] Wilcox, R.R., 1994. The percentage bend correlation coefficient.
       Psychometrika 59, 601–616. https://doi.org/10.1007/BF02294395

    .. [2] Schwarzkopf, D.S., De Haas, B., Rees, G., 2012. Better ways to
       improve standards in brain-behavior correlation analysis. Front.
       Hum. Neurosci. 6, 200. https://doi.org/10.3389/fnhum.2012.00200

    .. [3] Rousselet, G.A., Pernet, C.R., 2012. Improving standards in
       brain-behavior correlation analyses. Front. Hum. Neurosci. 6, 119.
       https://doi.org/10.3389/fnhum.2012.00119

    .. [4] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust correlation
       analyses: false positive and power validation using a new open
       source matlab toolbox. Front. Psychol. 3, 606.
       https://doi.org/10.3389/fpsyg.2012.00606

    Examples
    --------
    1. Pearson correlation

    >>> import numpy as np
    >>> import pingouin as pg
    >>> # Generate random correlated samples
    >>> np.random.seed(123)
    >>> mean, cov = [4, 6], [(1, .5), (.5, 1)]
    >>> x, y = np.random.multivariate_normal(mean, cov, 30).T
    >>> # Compute Pearson correlation
    >>> pg.corr(x, y).round(3)
              n      r         CI95%     r2  adj_r2  p-val  BF10  power
    pearson  30  0.491  [0.16, 0.72]  0.242   0.185  0.006  8.55  0.809

    2. Pearson correlation with two outliers

    >>> x[3], y[5] = 12, -8
    >>> pg.corr(x, y).round(3)
              n      r          CI95%     r2  adj_r2  p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439  0.302  0.121

    3. Spearman correlation (robust to outliers)

    >>> pg.corr(x, y, method="spearman").round(3)
               n      r         CI95%     r2  adj_r2  p-val  power
    spearman  30  0.401  [0.05, 0.67]  0.161   0.099  0.028   0.61

    4. Biweight midcorrelation (robust)

    >>> pg.corr(x, y, method="bicor").round(3)
            n      r         CI95%     r2  adj_r2  p-val  power
    bicor  30  0.393  [0.04, 0.66]  0.155   0.092  0.031  0.592

    5. Percentage bend correlation (robust)

    >>> pg.corr(x, y, method='percbend').round(3)
               n      r         CI95%     r2  adj_r2  p-val  power
    percbend  30  0.389  [0.03, 0.66]  0.151   0.089  0.034  0.581

    6. Shepherd's pi correlation (robust)

    >>> pg.corr(x, y, method='shepherd').round(3)
               n  outliers      r         CI95%     r2  adj_r2  p-val  power
    shepherd  30         2  0.437  [0.09, 0.69]  0.191   0.131   0.02  0.694

    7. Skipped spearman correlation (robust)

    >>> pg.corr(x, y, method='skipped').round(3)
              n  outliers      r         CI95%     r2  adj_r2  p-val  power
    skipped  30         2  0.437  [0.09, 0.69]  0.191   0.131   0.02  0.694

    8. One-tailed Pearson correlation

    >>> pg.corr(x, y, tail="one-sided", method='pearson').round(3)
              n      r          CI95%     r2  adj_r2  p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051   0.22  0.467  0.194

    9. Using columns of a pandas dataframe

    >>> import pandas as pd
    >>> data = pd.DataFrame({'x': x, 'y': y})
    >>> pg.corr(data['x'], data['y']).round(3)
              n      r          CI95%     r2  adj_r2  p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439  0.302  0.121
    """
    # Safety check
    x = np.asarray(x)
    y = np.asarray(y)
    assert x.ndim == y.ndim == 1, 'x and y must be 1D array.'
    assert x.size == y.size, 'x and y must have the same length.'

    # Remove rows with missing values
    x, y = remove_na(x, y, paired=True)
    nx = x.size

    # Compute correlation coefficient
    if method == 'pearson':
        r, pval = pearsonr(x, y)
    elif method == 'spearman':
        r, pval = spearmanr(x, y)
    elif method == 'kendall':
        r, pval = kendalltau(x, y)
    elif method == 'bicor':
        r, pval = bicor(x, y)
    elif method == 'percbend':
        r, pval = percbend(x, y)
    elif method == 'shepherd':
        r, pval, outliers = shepherd(x, y)
    elif method == 'skipped':
        r, pval, outliers = skipped(x, y)
    else:
        raise ValueError('Method not recognized.')

    if np.isnan(r):
        # Correlation failed -- new in version v0.3.4, instead of raising an
        # error we just return a dataframe full of NaN (except sample size).
        # This avoid sudden stop in pingouin.pairwise_corr.
        return pd.DataFrame(
            {
                'n': nx,
                'r': np.nan,
                'CI95%': np.nan,
                'r2': np.nan,
                'adj_r2': np.nan,
                'p-val': np.nan,
                'BF10': np.nan,
                'power': np.nan
            },
            index=[method])

    # Compute r2 and adj_r2
    r2 = r**2
    adj_r2 = 1 - (((1 - r2) * (nx - 1)) / (nx - 3))

    # Compute the parametric 95% confidence interval and power
    ci = compute_esci(stat=r, nx=nx, ny=nx, eftype='r')
    pr = power_corr(r=r, n=nx, power=None, alpha=0.05, tail=tail),

    # Create dictionnary
    stats = {
        'n': nx,
        'r': r,
        'r2': r2,
        'adj_r2': adj_r2,
        'CI95%': [ci],
        'p-val': pval if tail == 'two-sided' else .5 * pval,
        'power': pr
    }

    if method in ['shepherd', 'skipped']:
        stats['outliers'] = sum(outliers)

    # Compute the BF10 for Pearson correlation only
    if method == 'pearson':
        stats['BF10'] = bayesfactor_pearson(r, nx, tail=tail)

    # Convert to DataFrame
    stats = pd.DataFrame.from_records(stats, index=[method])

    # Define order
    col_keep = [
        'n', 'outliers', 'r', 'CI95%', 'r2', 'adj_r2', 'p-val', 'BF10', 'power'
    ]
    col_order = [k for k in col_keep if k in stats.keys().tolist()]
    return stats[col_order]
Exemplo n.º 8
0
def normality(data, dv=None, group=None, method="shapiro", alpha=.05):
    """Univariate normality test.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`, series, list or 1D np.array
        Iterable. Can be either a single list, 1D numpy array,
        or a wide- or long-format pandas dataframe.
    dv : str
        Dependent variable (only when ``data`` is a long-format dataframe).
    group : str
        Grouping variable (only when ``data`` is a long-format dataframe).
    method : str
        Normality test. `'shapiro'` (default) performs the Shapiro-Wilk test
        using :py:func:`scipy.stats.shapiro`, and `'normaltest'` performs the
        omnibus test of normality using :py:func:`scipy.stats.normaltest`.
        The latter is more appropriate for large samples.
    alpha : float
        Significance level.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'W'``: Test statistic.
        * ``'pval'``: p-value.
        * ``'normal'``: True if ``data`` is normally distributed.

    See Also
    --------
    homoscedasticity : Test equality of variance.
    sphericity : Mauchly's test for sphericity.

    Notes
    -----
    The Shapiro-Wilk test calculates a :math:`W` statistic that tests whether a
    random sample :math:`x_1, x_2, ..., x_n` comes from a normal distribution.

    The :math:`W` statistic is calculated as follows:

    .. math::

        W = \\frac{(\\sum_{i=1}^n a_i x_{i})^2}
        {\\sum_{i=1}^n (x_i - \\overline{x})^2}

    where the :math:`x_i` are the ordered sample values (in ascending
    order) and the :math:`a_i` are constants generated from the means,
    variances and covariances of the order statistics of a sample of size
    :math:`n` from a standard normal distribution. Specifically:

    .. math:: (a_1, ..., a_n) = \\frac{m^TV^{-1}}{(m^TV^{-1}V^{-1}m)^{1/2}}

    with :math:`m = (m_1, ..., m_n)^T` and :math:`(m_1, ..., m_n)` are the
    expected values of the order statistics of independent and identically
    distributed random variables sampled from the standard normal distribution,
    and :math:`V` is the covariance matrix of those order statistics.

    The null-hypothesis of this test is that the population is normally
    distributed. Thus, if the p-value is less than the
    chosen alpha level (typically set at 0.05), then the null hypothesis is
    rejected and there is evidence that the data tested are not normally
    distributed.

    The result of the Shapiro-Wilk test should be interpreted with caution in
    the case of large sample sizes. Indeed, quoting from
    `Wikipedia <https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test>`_:

        *"Like most statistical significance tests, if the sample size is
        sufficiently large this test may detect even trivial departures from
        the null hypothesis (i.e., although there may be some statistically
        significant effect, it may be too small to be of any practical
        significance); thus, additional investigation of the effect size is
        typically advisable, e.g., a Q–Q plot in this case."*

    Note that missing values are automatically removed (casewise deletion).

    References
    ----------
    * Shapiro, S. S., & Wilk, M. B. (1965). An analysis of variance test
      for normality (complete samples). Biometrika, 52(3/4), 591-611.

    * https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm

    Examples
    --------
    1. Shapiro-Wilk test on a 1D array.

    >>> import numpy as np
    >>> import pingouin as pg
    >>> np.random.seed(123)
    >>> x = np.random.normal(size=100)
    >>> pg.normality(x)
             W      pval  normal
    0  0.98414  0.274886    True

    2. Omnibus test on a wide-format dataframe with missing values

    >>> data = pg.read_dataset('mediation')
    >>> data.loc[1, 'X'] = np.nan
    >>> pg.normality(data, method='normaltest').round(3)
                W   pval  normal
    X       1.792  0.408    True
    M       0.492  0.782    True
    Y       0.349  0.840    True
    Mbin  839.716  0.000   False
    Ybin  814.468  0.000   False
    W1     24.816  0.000   False
    W2     43.400  0.000   False

    3. Pandas Series

    >>> pg.normality(data['X'], method='normaltest')
              W      pval  normal
    X  1.791839  0.408232    True

    4. Long-format dataframe

    >>> data = pg.read_dataset('rm_anova2')
    >>> pg.normality(data, dv='Performance', group='Time')
                 W      pval  normal
    Pre   0.967718  0.478773    True
    Post  0.940728  0.095157    True
    """
    assert isinstance(data, (pd.DataFrame, pd.Series, list, np.ndarray))
    assert method in ['shapiro', 'normaltest']
    if isinstance(data, pd.Series):
        data = data.to_frame()
    col_names = ['W', 'pval']
    func = getattr(scipy.stats, method)
    if isinstance(data, (list, np.ndarray)):
        data = np.asarray(data)
        assert data.ndim == 1, 'Data must be 1D.'
        assert data.size > 3, 'Data must have more than 3 samples.'
        data = remove_na(data)
        stats = pd.DataFrame(func(data)).T
        stats.columns = col_names
        stats['normal'] = np.where(stats['pval'] > alpha, True, False)
    else:
        # Data is a Pandas DataFrame
        if dv is None and group is None:
            # Wide-format
            # Get numeric data only
            numdata = data._get_numeric_data()
            stats = numdata.apply(lambda x: func(x.dropna()),
                                  result_type='expand',
                                  axis=0).T
            stats.columns = col_names
            stats['normal'] = np.where(stats['pval'] > alpha, True, False)
        else:
            # Long-format
            stats = pd.DataFrame([])
            assert group in data.columns
            assert dv in data.columns
            grp = data.groupby(group, observed=True, sort=False)
            cols = grp.groups.keys()
            for _, tmp in grp:
                stats = stats.append(
                    normality(tmp[dv].to_numpy(), method=method, alpha=alpha))
            stats.index = cols
    return _postprocess_dataframe(stats)