def compute_effsize(x, y, paired=False, eftype='cohen'):
    """Calculate effect size between two set of observations.

    Parameters
    ----------
    x : np.array or list
        First set of observations.
    y : np.array or list
        Second set of observations.
    paired : boolean
        If True, uses Cohen d-avg formula to correct for repeated
        measurements (see Notes).
    eftype : string
        Desired output effect size. Available methods are:

        * ``'none'``: no effect size
        * ``'cohen'``: Unbiased Cohen d
        * ``'hedges'``: Hedges g
        * ``'glass'``: Glass delta
        * ``'r'``: correlation coefficient
        * ``'eta-square'``: Eta-square
        * ``'odds-ratio'``: Odds ratio
        * ``'AUC'``: Area Under the Curve
        * ``'CLES'``: Common Language Effect Size

    Returns
    -------
    ef : float
        Effect size

    See Also
    --------
    convert_effsize : Conversion between effect sizes.
    compute_effsize_from_t : Convert a T-statistic to an effect size.

    Notes
    -----
    Missing values are automatically removed from the data. If ``x`` and
    ``y`` are paired, the entire row is removed.

    If ``x`` and ``y`` are independent, the Cohen :math:`d` is:

    .. math::

        d = \\frac{\\overline{X} - \\overline{Y}}
        {\\sqrt{\\frac{(n_{1} - 1)\\sigma_{1}^{2} + (n_{2} - 1)
        \\sigma_{2}^{2}}{n1 + n2 - 2}}}

    If ``x`` and ``y`` are paired, the Cohen :math:`d_{avg}` is computed:

    .. math::

        d_{avg} = \\frac{\\overline{X} - \\overline{Y}}
        {\\sqrt{\\frac{(\\sigma_1^2 + \\sigma_2^2)}{2}}}

    The Cohen's d is a biased estimate of the population effect size,
    especially for small samples (n < 20). It is often preferable to use
    the corrected Hedges :math:`g` instead:

    .. math:: g = d \\times (1 - \\frac{3}{4(n_1 + n_2) - 9})

    The Glass :math:`\\delta` is calculated using the group with the lowest
    variance as the control group:

    .. math::

        \\delta = \\frac{\\overline{X} -
        \\overline{Y}}{\\sigma^2_{\\text{control}}}

    The common language effect size is the proportion of pairs where ``x``
    is higher than ``y`` (ties count as half):

    .. math:: \\text{CL} = P(X > Y) + .5 \\times P(X = Y)

    For other effect sizes, Pingouin will first calculate a Cohen
    :math:`d` and then use :py:func:`pingouin.convert_effsize` to convert
    to the desired effect size.

    References
    ----------
    * Lakens, D., 2013. Calculating and reporting effect sizes to
      facilitate cumulative science: a practical primer for t-tests and
      ANOVAs. Front. Psychol. 4, 863. https://doi.org/10.3389/fpsyg.2013.00863

    * Cumming, Geoff. Understanding the new statistics: Effect sizes,
      confidence intervals, and meta-analysis. Routledge, 2013.

    * https://osf.io/vbdah/

    Examples
    --------
    1. Cohen d from two independent samples.

    >>> import numpy as np
    >>> import pingouin as pg
    >>> x = [1, 2, 3, 4]
    >>> y = [3, 4, 5, 6, 7]
    >>> pg.compute_effsize(x, y, paired=False, eftype='cohen')
    -1.707825127659933

    >>> pg.compute_effsize(y, x, paired=False, eftype='cohen')
    1.707825127659933

    2. Hedges g from two paired samples.

    >>> x = [1, 2, 3, 4, 5, 6, 7]
    >>> y = [1, 3, 5, 7, 9, 11, 13]
    >>> pg.compute_effsize(x, y, paired=True, eftype='hedges')
    -0.8222477210374874

    3. Glass delta from two independent samples.

    >>> pg.compute_effsize(x, y, paired=False, eftype='glass')
    -1.3887301496588271

    4. Common Language Effect Size.

    >>> pg.compute_effsize(x, y, eftype='cles')
    0.2857142857142857

    >>> pg.compute_effsize(y, x, eftype='cles')
    0.7142857142857143
    """
    # Validate the requested effect size name.
    if not _check_eftype(eftype):
        raise ValueError("Could not interpret input '{}'".format(eftype))

    x, y = np.asarray(x), np.asarray(y)

    # A paired design requires equal sample sizes; otherwise fall back to
    # the independent-samples formulas.
    if paired and x.size != y.size:
        warnings.warn("x and y have unequal sizes. Switching to "
                      "paired == False.")
        paired = False

    # Listwise deletion of missing values (full row when paired).
    x, y = remove_na(x, y, paired=paired)
    nx, ny = x.size, y.size

    if ny == 1:
        # One-sample test: y holds the single null value.
        return (x.mean() - y) / x.std(ddof=1)

    kind = eftype.lower()
    if kind == 'glass':
        # Control group = the group with the smallest variance.
        control_sd = np.min([x.std(ddof=1), y.std(ddof=1)])
        return (x.mean() - y.mean()) / control_sd
    if kind == 'r':
        # Plain correlation coefficient (useful for CI bootstrapping).
        from scipy.stats import pearsonr
        return pearsonr(x, y)[0]
    if kind == 'cles':
        # Exact CLES via all pairwise comparisons, ties counted as 0.5
        # (see pingouin.wilcoxon).
        pairwise_diff = x[:, None] - y
        return np.where(pairwise_diff == 0, 0.5, pairwise_diff > 0).mean()

    # Every remaining effect size is derived from an unbiased Cohen d.
    if paired:
        # Cohen d-avg (Cumming 2012; Lakens 2013). Careful, the formula in
        # Lakens 2013 is wrong: Pingouin v0.3.4+ uses the Cumming 2012 form
        # (before that the denominator was just (SD1 + SD2) / 2).
        denom = np.sqrt((x.var(ddof=1) + y.var(ddof=1)) / 2)
    else:
        # Pooled standard deviation,
        # https://en.wikipedia.org/wiki/Effect_size
        dof = nx + ny - 2
        denom = np.sqrt(
            ((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / dof)
    d = (x.mean() - y.mean()) / denom
    return convert_effsize(d, 'cohen', eftype, nx=nx, ny=ny)
def corr(x, y, tail='two-sided', method='pearson'):
    """(Robust) correlation between two variables.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. x and y must be independent.
    tail : string
        Specify whether to return 'one-sided' or 'two-sided' p-value.
    method : string
        Specify which method to use for the computation of the correlation
        coefficient. Available methods are ::

        'pearson' : Pearson product-moment correlation
        'spearman' : Spearman rank-order correlation
        'kendall' : Kendall's tau (ordinal data)
        'percbend' : percentage bend correlation (robust)
        'shepherd' : Shepherd's pi correlation (robust Spearman)
        'skipped' : skipped correlation (robust Spearman, requires sklearn)

    Returns
    -------
    stats : pandas DataFrame
        Test summary ::

        'n' : Sample size (after NaN removal)
        'outliers' : number of outliers (only for 'shepherd' or 'skipped')
        'r' : Correlation coefficient
        'CI95' : 95% parametric confidence intervals
        'r2' : R-squared
        'adj_r2' : Adjusted R-squared
        'p-val' : one or two tailed p-value
        'BF10' : Bayes Factor of the alternative hypothesis (Pearson only)
        'power' : achieved power of the test (= 1 - type II error).

    See also
    --------
    pairwise_corr : Pairwise correlation between columns of a pandas DataFrame
    partial_corr : Partial correlation

    Notes
    -----
    Rows with NaN are automatically removed. The percentage bend
    correlation [1]_ is robust to univariate outliers; the Shepherd's pi
    [2]_ and skipped [3]_, [4]_ correlations are robust to bivariate
    outliers (both return a Spearman's rho after outlier removal). The
    skipped correlation requires scikit-learn. If ``method='pearson'``,
    the Bayes Factor is calculated using
    :py:func:`pingouin.bayesfactor_pearson`.

    References
    ----------
    .. [1] Wilcox, R.R., 1994. The percentage bend correlation coefficient.
       Psychometrika 59, 601-616. https://doi.org/10.1007/BF02294395

    .. [2] Schwarzkopf, D.S., De Haas, B., Rees, G., 2012. Better ways to
       improve standards in brain-behavior correlation analysis. Front.
       Hum. Neurosci. 6, 200. https://doi.org/10.3389/fnhum.2012.00200

    .. [3] Rousselet, G.A., Pernet, C.R., 2012. Improving standards in
       brain-behavior correlation analyses. Front. Hum. Neurosci. 6, 119.
       https://doi.org/10.3389/fnhum.2012.00119

    .. [4] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust
       correlation analyses: false positive and power validation using a
       new open source matlab toolbox. Front. Psychol. 3, 606.
       https://doi.org/10.3389/fpsyg.2012.00606

    Examples
    --------
    1. Pearson correlation

    >>> import numpy as np
    >>> np.random.seed(123)
    >>> mean, cov = [4, 6], [(1, .5), (.5, 1)]
    >>> x, y = np.random.multivariate_normal(mean, cov, 30).T
    >>> from pingouin import corr
    >>> corr(x, y)
              n      r         CI95%     r2  adj_r2     p-val  BF10  power
    pearson  30  0.491  [0.16, 0.72]  0.242   0.185  0.005813  8.55  0.809

    2. Spearman correlation

    >>> x[3], y[5] = 12, -8
    >>> corr(x, y, method="spearman")
               n      r         CI95%     r2  adj_r2     p-val  power
    spearman  30  0.401  [0.05, 0.67]  0.161   0.099  0.028034   0.61
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Both inputs must have the same number of observations.
    if x.size != y.size:
        raise ValueError('x and y must have the same length.')

    # Listwise deletion of missing values.
    x, y = remove_na(x, y, paired=True)
    n_obs = x.size

    # Estimate the correlation coefficient with the requested method.
    outliers = None
    if method == 'pearson':
        r, pval = pearsonr(x, y)
    elif method == 'spearman':
        r, pval = spearmanr(x, y)
    elif method == 'kendall':
        r, pval = kendalltau(x, y)
    elif method == 'percbend':
        r, pval = percbend(x, y)
    elif method == 'shepherd':
        r, pval, outliers = shepherd(x, y)
    elif method == 'skipped':
        r, pval, outliers = skipped(x, y, method='spearman')
    else:
        raise ValueError('Method not recognized.')

    assert not np.isnan(r), 'Correlation returned NaN. Check your data.'

    # Coefficient of determination and its adjusted version.
    r_squared = r ** 2
    adj_r2 = 1 - (((1 - r_squared) * (n_obs - 1)) / (n_obs - 3))

    # Parametric 95% CI and achieved power; both degenerate when |r| = 1.
    if r_squared < 1:
        ci = compute_esci(stat=r, nx=n_obs, ny=n_obs, eftype='r')
        achieved_power = round(
            power_corr(r=r, n=n_obs, power=None, alpha=0.05, tail=tail), 3)
    else:
        ci = [1., 1.]
        achieved_power = np.inf

    # Assemble the summary table.
    stats = {
        'n': n_obs,
        'r': round(r, 3),
        'r2': round(r_squared, 3),
        'adj_r2': round(adj_r2, 3),
        'CI95%': [ci],
        'p-val': pval if tail == 'two-sided' else .5 * pval,
        'power': achieved_power,
    }

    # Only the robust bivariate methods report an outlier count.
    if method in ['shepherd', 'skipped']:
        stats['outliers'] = sum(outliers)

    # Bayes Factor is defined for the Pearson correlation only.
    if method == 'pearson':
        if r_squared < 1:
            stats['BF10'] = bayesfactor_pearson(r, n_obs, tail=tail)
        else:
            stats['BF10'] = str(np.inf)

    stats = pd.DataFrame.from_records(stats, index=[method])

    # Fixed column order; missing columns are simply skipped.
    preferred = [
        'n', 'outliers', 'r', 'CI95%', 'r2', 'adj_r2', 'p-val', 'BF10',
        'power'
    ]
    return stats[[c for c in preferred if c in stats.keys().tolist()]]
def multivariate_ttest(X, Y=None, paired=False):
    """Hotelling T-squared test (= multivariate T-test).

    Parameters
    ----------
    X : np.array
        First data matrix of shape (n_samples, n_features).
    Y : np.array or None
        Second data matrix of shape (n_samples, n_features). If ``Y`` is a 1D
        array of shape (n_features), a one-sample test is performed where the
        null hypothesis is defined in ``Y``. If ``Y`` is None, a one-sample
        is performed against np.zeros(n_features).
    paired : boolean
        Specify whether the two observations are related (i.e. repeated
        measures) or independent. If ``paired`` is True, ``X`` and ``Y`` must
        have exactly the same shape.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'T2'``: T-squared value
        * ``'F'``: F-value
        * ``'df1'``: first degree of freedom
        * ``'df2'``: second degree of freedom
        * ``'p-val'``: p-value

    See Also
    --------
    multivariate_normality : Multivariate normality test.
    ttest : Univariate T-test.

    Notes
    -----
    The Hotelling's T-squared test [1]_ is the multivariate counterpart of
    the T-test. Rows with missing values are automatically removed using the
    :py:func:`remove_na` function.

    Tested against the `Hotelling
    <https://cran.r-project.org/web/packages/Hotelling/Hotelling.pdf>`_
    R package.

    References
    ----------
    .. [1] Hotelling, H. The Generalization of Student's Ratio. Ann. Math.
           Statist. 2 (1931), no. 3, 360--378.

    See also http://www.real-statistics.com/multivariate-statistics/

    Examples
    --------
    Two-sample independent Hotelling T-squared test

    >>> import pingouin as pg
    >>> data = pg.read_dataset('multivariate')
    >>> dvs = ['Fever', 'Pressure', 'Aches']
    >>> X = data[data['Condition'] == 'Drug'][dvs]
    >>> Y = data[data['Condition'] == 'Placebo'][dvs]
    >>> pg.multivariate_ttest(X, Y)
                     T2         F  df1  df2      pval
    hotelling  4.228679  1.326644    3   32  0.282898

    Two-sample paired Hotelling T-squared test

    >>> pg.multivariate_ttest(X, Y, paired=True)
                     T2         F  df1  df2      pval
    hotelling  4.468456  1.314252    3   15  0.306542

    One-sample Hotelling T-squared test with a specified null hypothesis

    >>> null_hypothesis_means = [37.5, 70, 5]
    >>> pg.multivariate_ttest(X, Y=null_hypothesis_means)
                       T2          F  df1  df2          pval
    hotelling  253.230991  74.479703    3   15  3.081281e-09
    """
    from scipy.stats import f
    x = np.asarray(X)
    assert x.ndim == 2, 'x must be of shape (n_samples, n_features)'

    if Y is None:
        # One-sample test against a null vector of zeros.
        y = np.zeros(x.shape[1])
        # Remove rows with missing values in x
        x = x[~np.isnan(x).any(axis=1)]
    else:
        nx, kx = x.shape
        y = np.asarray(Y)
        assert y.ndim in [1, 2], 'Y must be 1D or 2D.'
        if y.ndim == 1:
            # One sample with specified null
            assert y.size == kx
        else:
            # Two-sample
            err = 'X and Y must have the same number of features (= columns).'
            assert y.shape[1] == kx, err
            if paired:
                err = 'X and Y must have the same number of rows if paired.'
                assert y.shape[0] == nx, err
        # Remove rows with missing values in both x and y
        x, y = remove_na(x, y, paired=paired, axis='rows')

    # Shape of arrays (after missing-value removal).
    nx, k = x.shape
    ny = y.shape[0]
    assert nx >= 5, 'At least five samples are required.'

    if y.ndim == 1 or paired is True:
        # One-sample or paired designs: a single covariance matrix and an
        # effective sample size equal to the number of rows of x.
        n = nx
        if y.ndim == 1:
            # One sample test
            cov = np.cov(x, rowvar=False)
            diff = x.mean(0) - y
        else:
            # Paired two sample: covariance of the within-pair differences.
            cov = np.cov(x - y, rowvar=False)
            diff = x.mean(0) - y.mean(0)
        # Pseudo-inverse keeps the test defined when the covariance matrix
        # is singular (e.g. collinear features).
        inv_cov = np.linalg.pinv(cov)
        t2 = (diff @ inv_cov) @ diff * n
    else:
        # Independent two-sample design: pool the two covariance matrices.
        n = nx + ny - 1
        x_cov = np.cov(x, rowvar=False)
        y_cov = np.cov(y, rowvar=False)
        pooled_cov = ((nx - 1) * x_cov + (ny - 1) * y_cov) / (n - 1)
        inv_cov = np.linalg.pinv((1 / nx + 1 / ny) * pooled_cov)
        diff = x.mean(0) - y.mean(0)
        t2 = (diff @ inv_cov) @ diff

    # F-value, degrees of freedom and p-value
    fval = t2 * (n - k) / (k * (n - 1))
    df1 = k
    df2 = n - k
    pval = f.sf(fval, df1, df2)

    # Create output dictionnary
    stats = {'T2': t2, 'F': fval, 'df1': df1, 'df2': df2, 'pval': pval}
    stats = pd.DataFrame(stats, index=['hotelling'])
    return _postprocess_dataframe(stats)
def test_remove_na(self):
    """Test function remove_na."""
    a = [6.4, 3.2, 4.5, np.nan]
    b = [3.5, 7.2, 8.4, 3.2]
    c = [2.3, np.nan, 5.2, 4.6]
    # These 1D calls must simply not raise.
    remove_na(a, b, paired=True)
    remove_na(a, b, paired=False)
    remove_na(b, a, paired=False)
    # Paired removal drops the full row wherever either input has a NaN.
    a_clean, _ = remove_na(a, c, paired=True)
    assert np.allclose(a_clean, [6.4, 4.5])
    # y may be None or a scalar.
    remove_na(a, None)
    remove_na(a, 4)
    # 2D arrays.
    a = np.array([[4, 2], [4, np.nan], [7, 6]])
    b = np.array([[6, np.nan], [3, 2], [2, 2]])
    a_clean, b_clean = remove_na(a, b, paired=False)
    assert np.allclose(a_clean, [[4., 2.], [7., 6.]])
    assert np.allclose(b_clean, [[3., 2.], [2., 2.]])
    a_clean, b_clean = remove_na(a, b, paired=True)
    assert np.allclose(a_clean, [[7., 6.]])
    assert np.allclose(b_clean, [[2., 2.]])
    a_clean, b_clean = remove_na(a, b, paired=False, axis='columns')
    assert np.allclose(a_clean, [[4.], [4.], [7.]])
    assert np.allclose(b_clean, [[6.], [3.], [2.]])
    # y = None with a 2D x.
    remove_na(a, None, paired=False)
    # Empty inputs are rejected with an AssertionError.
    # See https://github.com/raphaelvallat/pingouin/issues/222
    with pytest.raises(AssertionError):
        remove_na(x=[], y=0)
    with pytest.raises(AssertionError):
        remove_na(a, y=[])
def test_remove_na(self):
    """Test function remove_na."""
    arr1 = [6.4, 3.2, 4.5, np.nan]
    arr2 = [3.5, 7.2, 8.4, 3.2]
    arr3 = [2.3, np.nan, 5.2, 4.6]
    # These 1D calls must simply not raise.
    remove_na(arr1, arr2, paired=True)
    remove_na(arr1, arr2, paired=False)
    remove_na(arr2, arr1, paired=False)
    # Paired removal drops the full row wherever either input has a NaN.
    kept, _ = remove_na(arr1, arr3, paired=True)
    assert np.allclose(kept, [6.4, 4.5])
    # y may be None or a scalar.
    remove_na(arr1, None)
    remove_na(arr1, 4)
    # 2D arrays.
    arr1 = np.array([[4, 2], [4, np.nan], [7, 6]])
    arr2 = np.array([[6, np.nan], [3, 2], [2, 2]])
    kept1, kept2 = remove_na(arr1, arr2, paired=False)
    assert np.allclose(kept1, [[4., 2.], [7., 6.]])
    assert np.allclose(kept2, [[3., 2.], [2., 2.]])
    kept1, kept2 = remove_na(arr1, arr2, paired=True)
    assert np.allclose(kept1, [[7., 6.]])
    assert np.allclose(kept2, [[2., 2.]])
    kept1, kept2 = remove_na(arr1, arr2, paired=False, axis='columns')
    assert np.allclose(kept1, [[4.], [4.], [7.]])
    assert np.allclose(kept2, [[6.], [3.], [2.]])
    # y = None with a 2D x.
    remove_na(arr1, None, paired=False)
def compute_effsize(x, y, paired=False, eftype='cohen'):
    """Calculate effect size between two set of observations.

    Parameters
    ----------
    x : np.array or list
        First set of observations.
    y : np.array or list
        Second set of observations.
    paired : boolean
        If True, uses Cohen d-avg formula to correct for repeated
        measurements (Cumming 2012)
    eftype : string
        Desired output effect size. Available methods are ::

        'none' : no effect size
        'cohen' : Unbiased Cohen d
        'hedges' : Hedges g
        'glass': Glass delta
        'r' : correlation coefficient
        'eta-square' : Eta-square
        'odds-ratio' : Odds ratio
        'AUC' : Area Under the Curve
        'CLES' : Common language effect size

    Returns
    -------
    ef : float
        Effect size

    See Also
    --------
    convert_effsize : Conversion between effect sizes.
    compute_effsize_from_t : Convert a T-statistic to an effect size.

    Notes
    -----
    Missing values are automatically removed from the data. If ``x`` and
    ``y`` are paired, the entire row is removed.

    If ``x`` and ``y`` are independent, the Cohen's d is:

    .. math::

        d = \\frac{\\overline{X} - \\overline{Y}}
        {\\sqrt{\\frac{(n_{1} - 1)\\sigma_{1}^{2} + (n_{2} - 1)
        \\sigma_{2}^{2}}{n1 + n2 - 2}}}

    If ``x`` and ``y`` are paired, the Cohen :math:`d_{avg}` is computed
    using the root mean square of the two standard deviations
    (Cumming 2012):

    .. math::

        d_{avg} = \\frac{\\overline{X} - \\overline{Y}}
        {\\sqrt{\\frac{(\\sigma_1^2 + \\sigma_2^2)}{2}}}

    The Cohen's d is a biased estimate of the population effect size,
    especially for small samples (n < 20). It is often preferable to use
    the corrected effect size, or Hedges'g, instead:

    .. math:: g = d * (1 - \\frac{3}{4(n_1 + n_2) - 9})

    If eftype = 'glass', the Glass :math:`\\delta` is reported, using the
    group with the lowest variance as the control group:

    .. math::

        \\delta = \\frac{\\overline{X} - \\overline{Y}}{\\sigma_{control}}

    If eftype = 'cles', the common language effect size is reported: the
    proportion of pairs where ``x`` is higher than ``y``, with ties
    counted as half:

    .. math:: \\text{CL} = P(X > Y) + .5 \\times P(X = Y)

    References
    ----------
    .. [1] Lakens, D., 2013. Calculating and reporting effect sizes to
       facilitate cumulative science: a practical primer for t-tests and
       ANOVAs. Front. Psychol. 4, 863.
       https://doi.org/10.3389/fpsyg.2013.00863

    .. [2] Cumming, Geoff. Understanding the new statistics: Effect sizes,
       confidence intervals, and meta-analysis. Routledge, 2013.

    Examples
    --------
    1. Compute Cohen d from two independent set of observations.

    >>> import numpy as np
    >>> from pingouin import compute_effsize
    >>> np.random.seed(123)
    >>> x = np.random.normal(2, size=100)
    >>> y = np.random.normal(2.3, size=95)
    >>> d = compute_effsize(x=x, y=y, eftype='cohen', paired=False)
    >>> print(d)
    -0.2835170152506578

    2. Compute Hedges g from two paired set of observations.

    >>> import numpy as np
    >>> from pingouin import compute_effsize
    >>> x = [1.62, 2.21, 3.79, 1.66, 1.86, 1.87, 4.51, 4.49, 3.3 , 2.69]
    >>> y = [0.91, 3., 2.28, 0.49, 1.42, 3.65, -0.43, 1.57, 3.27, 1.13]
    >>> g = compute_effsize(x=x, y=y, eftype='hedges', paired=True)

    3. Compute Glass delta from two independent set of observations. The
    group with the lowest variance will automatically be selected as the
    control.

    >>> import numpy as np
    >>> from pingouin import compute_effsize
    >>> np.random.seed(123)
    >>> x = np.random.normal(2, scale=1, size=50)
    >>> y = np.random.normal(2, scale=2, size=45)
    >>> d = compute_effsize(x=x, y=y, eftype='glass')
    >>> print(d)
    -0.1170721973604153
    """
    # Check arguments
    if not _check_eftype(eftype):
        err = "Could not interpret input '{}'".format(eftype)
        raise ValueError(err)

    x = np.asarray(x)
    y = np.asarray(y)

    # Paired formulas require equal sample sizes.
    if x.size != y.size and paired:
        warnings.warn("x and y have unequal sizes. Switching to "
                      "paired == False.")
        paired = False

    # Remove rows with missing values
    x, y = remove_na(x, y, paired=paired)
    nx, ny = x.size, y.size

    if ny == 1:
        # Case 1: One-sample Test (y holds the null value)
        d = (x.mean() - y) / x.std(ddof=1)
        return d

    if eftype.lower() == 'glass':
        # Find group with lowest variance
        sd_control = np.min([x.std(ddof=1), y.std(ddof=1)])
        d = (x.mean() - y.mean()) / sd_control
        return d
    elif eftype.lower() == 'r':
        # Return correlation coefficient (useful for CI bootstrapping)
        from scipy.stats import pearsonr
        r, _ = pearsonr(x, y)
        return r
    elif eftype.lower() == 'cles':
        # Compute exact CLES: P(X > Y) + .5 * P(X = Y). The previous
        # implementation returned max(P(x > y), P(x < y)) / 1, which
        # discarded the direction of the effect and ignored ties.
        diff = x[:, None] - y
        return np.where(diff == 0, 0.5, diff > 0).mean()
    else:
        # Compute unbiased Cohen's d effect size
        if not paired:
            # https://en.wikipedia.org/wiki/Effect_size
            dof = nx + ny - 2
            poolsd = np.sqrt(((nx - 1) * x.var(ddof=1)
                              + (ny - 1) * y.var(ddof=1)) / dof)
            d = (x.mean() - y.mean()) / poolsd
        else:
            # Report Cohen d-avg (Cumming 2012; Lakens 2013). Note that
            # the denominator is the root mean square of the two standard
            # deviations (Cumming 2012), not their arithmetic mean: the
            # (SD1 + SD2) / 2 form given in Lakens 2013 is wrong.
            d = (x.mean() - y.mean()) / np.sqrt(
                (x.var(ddof=1) + y.var(ddof=1)) / 2)
        return convert_effsize(d, 'cohen', eftype, nx=nx, ny=ny)
def corr(x, y, tail='two-sided', method='pearson'):
    """(Robust) correlation between two variables.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. ``x`` and ``y`` must be
        independent.
    tail : string
        Specify whether to return ``'one-sided'`` or ``'two-sided'``
        p-value. Note that the former are simply half the latter.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\\rho` rank-order correlation
        * ``'kendall'``: Kendall's :math:`\\tau` correlation
          (for ordinal data)
        * ``'bicor'``: Biweight midcorrelation (robust)
        * ``'percbend'``: Percentage bend correlation (robust)
        * ``'shepherd'``: Shepherd's pi correlation (robust)
        * ``'skipped'``: Skipped correlation (robust)

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'n'``: Sample size (after removal of missing values)
        * ``'outliers'``: number of outliers, only if a robust method was
          used
        * ``'r'``: Correlation coefficient
        * ``'CI95'``: 95% parametric confidence intervals around :math:`r`
        * ``'r2'``: R-squared (:math:`= r^2`)
        * ``'adj_r2'``: Adjusted R-squared
        * ``'p-val'``: tail of the test
        * ``'BF10'``: Bayes Factor of the alternative hypothesis
          (only for Pearson correlation)
        * ``'power'``: achieved power of the test (= 1 - type II error).

    See also
    --------
    pairwise_corr : Pairwise correlation between columns of a pandas
        DataFrame
    partial_corr : Partial correlation
    rm_corr : Repeated measures correlation

    Notes
    -----
    The Shepherd pi and skipped correlations are both robust methods that
    return the Spearman correlation coefficient after removing *bivariate*
    outliers (the skipped correlation requires scikit-learn), and are
    significantly slower than the other methods. The biweight
    midcorrelation and percentage bend correlation [1]_ are robust to
    *univariate* outliers.

    .. important:: Please note that rows with missing values (NaN) are
        automatically removed.

    If ``method='pearson'``, the Bayes Factor is calculated using the
    :py:func:`pingouin.bayesfactor_pearson` function.

    References
    ----------
    .. [1] Wilcox, R.R., 1994. The percentage bend correlation
       coefficient. Psychometrika 59, 601-616.
       https://doi.org/10.1007/BF02294395

    .. [2] Schwarzkopf, D.S., De Haas, B., Rees, G., 2012. Better ways to
       improve standards in brain-behavior correlation analysis. Front.
       Hum. Neurosci. 6, 200. https://doi.org/10.3389/fnhum.2012.00200

    .. [3] Rousselet, G.A., Pernet, C.R., 2012. Improving standards in
       brain-behavior correlation analyses. Front. Hum. Neurosci. 6, 119.
       https://doi.org/10.3389/fnhum.2012.00119

    .. [4] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust
       correlation analyses: false positive and power validation using a
       new open source matlab toolbox. Front. Psychol. 3, 606.
       https://doi.org/10.3389/fpsyg.2012.00606

    Examples
    --------
    1. Pearson correlation

    >>> import numpy as np
    >>> import pingouin as pg
    >>> np.random.seed(123)
    >>> mean, cov = [4, 6], [(1, .5), (.5, 1)]
    >>> x, y = np.random.multivariate_normal(mean, cov, 30).T
    >>> pg.corr(x, y).round(3)
              n      r         CI95%     r2  adj_r2  p-val  BF10  power
    pearson  30  0.491  [0.16, 0.72]  0.242   0.185  0.006  8.55  0.809

    2. Pearson correlation with two outliers

    >>> x[3], y[5] = 12, -8
    >>> pg.corr(x, y).round(3)
              n      r          CI95%     r2  adj_r2  p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439  0.302  0.121

    3. Using columns of a pandas dataframe

    >>> import pandas as pd
    >>> data = pd.DataFrame({'x': x, 'y': y})
    >>> pg.corr(data['x'], data['y']).round(3)
              n      r          CI95%     r2  adj_r2  p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439  0.302  0.121
    """
    # Safety check
    x = np.asarray(x)
    y = np.asarray(y)
    assert x.ndim == y.ndim == 1, 'x and y must be 1D array.'
    assert x.size == y.size, 'x and y must have the same length.'

    # Remove rows with missing values
    x, y = remove_na(x, y, paired=True)
    nx = x.size

    # Compute correlation coefficient
    if method == 'pearson':
        r, pval = pearsonr(x, y)
    elif method == 'spearman':
        r, pval = spearmanr(x, y)
    elif method == 'kendall':
        r, pval = kendalltau(x, y)
    elif method == 'bicor':
        r, pval = bicor(x, y)
    elif method == 'percbend':
        r, pval = percbend(x, y)
    elif method == 'shepherd':
        r, pval, outliers = shepherd(x, y)
    elif method == 'skipped':
        r, pval, outliers = skipped(x, y)
    else:
        raise ValueError('Method not recognized.')

    if np.isnan(r):
        # Correlation failed -- new in version v0.3.4, instead of raising an
        # error we just return a dataframe full of NaN (except sample size).
        # This avoid sudden stop in pingouin.pairwise_corr.
        return pd.DataFrame(
            {
                'n': nx,
                'r': np.nan,
                'CI95%': np.nan,
                'r2': np.nan,
                'adj_r2': np.nan,
                'p-val': np.nan,
                'BF10': np.nan,
                'power': np.nan
            },
            index=[method])

    # Compute r2 and adj_r2
    r2 = r**2
    adj_r2 = 1 - (((1 - r2) * (nx - 1)) / (nx - 3))

    # Compute the parametric 95% confidence interval and power.
    ci = compute_esci(stat=r, nx=nx, ny=nx, eftype='r')
    # BUG FIX: a stray trailing comma previously made ``pr`` a 1-element
    # tuple instead of a float, which corrupted the 'power' column (the
    # doctest outputs above show a scalar power, e.g. 0.809).
    pr = power_corr(r=r, n=nx, power=None, alpha=0.05, tail=tail)

    # Create dictionnary
    stats = {
        'n': nx,
        'r': r,
        'r2': r2,
        'adj_r2': adj_r2,
        'CI95%': [ci],
        'p-val': pval if tail == 'two-sided' else .5 * pval,
        'power': pr
    }

    if method in ['shepherd', 'skipped']:
        stats['outliers'] = sum(outliers)

    # Compute the BF10 for Pearson correlation only
    if method == 'pearson':
        stats['BF10'] = bayesfactor_pearson(r, nx, tail=tail)

    # Convert to DataFrame
    stats = pd.DataFrame.from_records(stats, index=[method])

    # Define order
    col_keep = [
        'n', 'outliers', 'r', 'CI95%', 'r2', 'adj_r2', 'p-val', 'BF10',
        'power'
    ]
    col_order = [k for k in col_keep if k in stats.keys().tolist()]
    return stats[col_order]
def normality(data, dv=None, group=None, method="shapiro", alpha=.05):
    """Univariate normality test.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`, series, list or 1D np.array
        Iterable. Can be either a single list, 1D numpy array,
        or a wide- or long-format pandas dataframe.
    dv : str
        Dependent variable (only when ``data`` is a long-format dataframe).
    group : str
        Grouping variable (only when ``data`` is a long-format dataframe).
    method : str
        Normality test. `'shapiro'` (default) performs the Shapiro-Wilk test
        using :py:func:`scipy.stats.shapiro`, and `'normaltest'` performs the
        omnibus test of normality using :py:func:`scipy.stats.normaltest`.
        The latter is more appropriate for large samples.
    alpha : float
        Significance level.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'W'``: Test statistic.
        * ``'pval'``: p-value.
        * ``'normal'``: True if ``data`` is normally distributed.

    See Also
    --------
    homoscedasticity : Test equality of variance.
    sphericity : Mauchly's test for sphericity.

    Notes
    -----
    The Shapiro-Wilk test calculates a :math:`W` statistic that tests whether
    a random sample :math:`x_1, x_2, ..., x_n` comes from a normal
    distribution.

    The :math:`W` statistic is calculated as follows:

    .. math::

        W = \\frac{(\\sum_{i=1}^n a_i x_{i})^2}
        {\\sum_{i=1}^n (x_i - \\overline{x})^2}

    where the :math:`x_i` are the ordered sample values (in ascending
    order) and the :math:`a_i` are constants generated from the means,
    variances and covariances of the order statistics of a sample of size
    :math:`n` from a standard normal distribution. Specifically:

    .. math:: (a_1, ..., a_n) = \\frac{m^TV^{-1}}{(m^TV^{-1}V^{-1}m)^{1/2}}

    with :math:`m = (m_1, ..., m_n)^T` and :math:`(m_1, ..., m_n)` are the
    expected values of the order statistics of independent and identically
    distributed random variables sampled from the standard normal
    distribution, and :math:`V` is the covariance matrix of those order
    statistics.

    The null-hypothesis of this test is that the population is normally
    distributed. Thus, if the p-value is less than the
    chosen alpha level (typically set at 0.05), then the null hypothesis is
    rejected and there is evidence that the data tested are not normally
    distributed.

    The result of the Shapiro-Wilk test should be interpreted with caution in
    the case of large sample sizes. Indeed, quoting from
    `Wikipedia <https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test>`_:

    *"Like most statistical significance tests, if the sample size is
    sufficiently large this test may detect even trivial departures from
    the null hypothesis (i.e., although there may be some statistically
    significant effect, it may be too small to be of any practical
    significance); thus, additional investigation of the effect size is
    typically advisable, e.g., a Q-Q plot in this case."*

    Note that missing values are automatically removed (casewise deletion).

    References
    ----------
    * Shapiro, S. S., & Wilk, M. B. (1965). An analysis of variance test
      for normality (complete samples). Biometrika, 52(3/4), 591-611.

    * https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm

    Examples
    --------
    1. Shapiro-Wilk test on a 1D array.

    >>> import numpy as np
    >>> import pingouin as pg
    >>> np.random.seed(123)
    >>> x = np.random.normal(size=100)
    >>> pg.normality(x)
              W      pval  normal
    0   0.98414  0.274886    True

    2. Omnibus test on a wide-format dataframe with missing values

    >>> data = pg.read_dataset('mediation')
    >>> data.loc[1, 'X'] = np.nan
    >>> pg.normality(data, method='normaltest').round(3)
                W   pval  normal
    X       1.792  0.408    True
    M       0.492  0.782    True
    Y       0.349  0.840    True
    Mbin  839.716  0.000   False
    Ybin  814.468  0.000   False
    W1     24.816  0.000   False
    W2     43.400  0.000   False

    3. Pandas Series

    >>> pg.normality(data['X'], method='normaltest')
              W      pval  normal
    X  1.791839  0.408232    True

    4. Long-format dataframe

    >>> data = pg.read_dataset('rm_anova2')
    >>> pg.normality(data, dv='Performance', group='Time')
                 W      pval  normal
    Pre   0.967718  0.478773    True
    Post  0.940728  0.095157    True
    """
    assert isinstance(data, (pd.DataFrame, pd.Series, list, np.ndarray))
    assert method in ['shapiro', 'normaltest']
    if isinstance(data, pd.Series):
        data = data.to_frame()
    col_names = ['W', 'pval']
    # Resolve the scipy.stats test function from its name.
    func = getattr(scipy.stats, method)
    if isinstance(data, (list, np.ndarray)):
        data = np.asarray(data)
        assert data.ndim == 1, 'Data must be 1D.'
        assert data.size > 3, 'Data must have more than 3 samples.'
        data = remove_na(data)
        stats = pd.DataFrame(func(data)).T
        stats.columns = col_names
        stats['normal'] = stats['pval'] > alpha
    else:
        # Data is a Pandas DataFrame
        if dv is None and group is None:
            # Wide-format
            # Get numeric data only
            numdata = data._get_numeric_data()
            stats = numdata.apply(lambda x: func(x.dropna()),
                                  result_type='expand', axis=0).T
            stats.columns = col_names
            stats['normal'] = stats['pval'] > alpha
        else:
            # Long-format: run the test once per level of ``group``.
            assert group in data.columns
            assert dv in data.columns
            grp = data.groupby(group, observed=True, sort=False)
            cols = grp.groups.keys()
            # Note: ``DataFrame.append`` was removed in pandas 2.0; build the
            # per-group results in a list and concatenate them in one pass.
            stats = pd.concat(
                [normality(tmp[dv].to_numpy(), method=method, alpha=alpha)
                 for _, tmp in grp], axis=0)
            stats.index = cols
    return _postprocess_dataframe(stats)