def friedman(data=None, dv=None, within=None, subject=None, method='chisq'):
    """Friedman test for repeated measurements.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Long-format DataFrame. The caller's object is left untouched.
    dv : string
        Name of column containing the dependent variable.
    within : string
        Name of column containing the within-subject factor.
    subject : string
        Name of column containing the subject identifier.
    method : string
        Statistical test to perform. Must be ``'chisq'`` (chi-square test)
        or ``'f'`` (F test). See notes below for explanation.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'Source'``: Name of the within-subject factor
        * ``'W'``: Kendall's coefficient of concordance, corrected for ties

        If ``method='chisq'``

        * ``'ddof1'``: degrees of freedom
        * ``'Q'``: The Friedman chi-square statistic, corrected for ties
        * ``'p-unc'``: Uncorrected p-value of the chi squared test

        If ``method='f'``

        * ``'ddof1'``: degrees of freedom of the numerator
        * ``'ddof2'``: degrees of freedom of the denominator
        * ``'F'``: The Friedman F statistic, corrected for ties
        * ``'p-unc'``: Uncorrected p-value of the F test

    Raises
    ------
    ValueError
        If ``method`` is neither ``'chisq'`` nor ``'f'``.

    Notes
    -----
    The Friedman test is used for one-way repeated measures ANOVA by ranks.

    Data are expected to be in long-format.

    Note that if the dataset contains one or more other within subject
    factors, an automatic collapsing to the mean is applied on the dependent
    variable (same behavior as the ezANOVA R package). As such, results can
    differ from those of JASP. If you can, always double-check the results.

    NaN values are automatically removed.

    The Friedman test is equivalent to the test of significance of Kendall's
    coefficient of concordance (Kendall's W). Most commonly a Q statistic,
    which has asymptotical chi-squared distribution, is computed and used for
    testing. However, in [1]_ they showed the chi-squared test to be overly
    conservative for small numbers of samples and repeated measures. Instead
    they recommend the F test, which has the correct size and behaves like a
    permutation test, but is computationally much easier.

    References
    ----------
    .. [1] Marozzi, M. (2014). Testing for concordance between several
           criteria. Journal of Statistical Computation and Simulation,
           84(9), 1843–1850. https://doi.org/10.1080/00949655.2013.766189

    Examples
    --------
    Compute the Friedman test for repeated measurements.

    >>> from pingouin import friedman, read_dataset
    >>> df = read_dataset('rm_anova')
    >>> friedman(data=df, dv='DesireToKill', within='Disgustingness',
    ...          subject='Subject')
                      Source         W  ddof1         Q     p-unc
    Friedman  Disgustingness  0.099224      1  9.227848  0.002384

    This time we will use the F test method.

    >>> from pingouin import friedman, read_dataset
    >>> df = read_dataset('rm_anova')
    >>> friedman(data=df, dv='DesireToKill', within='Disgustingness',
    ...          subject='Subject', method='f')
                      Source         W     ddof1      ddof2         F     p-unc
    Friedman  Disgustingness  0.099224  0.978495  90.021505  10.13418  0.002138

    We can see, compared to the previous example, that the p-value is slightly
    lower. This is expected, since the F test is more powerful (see Notes).
    """
    # Check data
    _check_dataframe(dv=dv, within=within, data=data, subject=subject,
                     effects='within')

    # Fail early with a clear message: without this check an unknown method
    # falls through both branches below and ends in an UnboundLocalError.
    if method not in ('chisq', 'f'):
        raise ValueError("method must be 'chisq' or 'f'.")

    # Convert Categorical columns to string.
    # This is important otherwise all the groupby will return different results
    # unless we specify .groupby(..., observed = True).
    # Work on a copy so that the caller's DataFrame is never modified.
    cat_cols = [c for c in (subject, within)
                if data[c].dtype.name == 'category']
    if cat_cols:
        data = data.copy()
        for c in cat_cols:
            data[c] = data[c].astype(str)

    # Collapse to the mean. Only the dependent variable is averaged, so
    # unrelated (possibly non-numeric) columns cannot break the aggregation.
    data = data.groupby([subject, within])[dv].mean().reset_index()

    # Remove NaN
    if data[dv].isnull().any():
        data = remove_rm_na(dv=dv, within=within, subject=subject,
                            data=data[[subject, within, dv]])

    # Build the (n subjects, k conditions) matrix of observations
    grp = data.groupby(within)[dv]
    rm = list(data[within].unique())
    k = len(rm)
    X = np.array([grp.get_group(r).to_numpy() for r in rm]).T
    n = X.shape[0]

    # Rank within each subject (row), then sum of squared rank totals
    ranked = np.zeros(X.shape)
    for i, row in enumerate(X):
        ranked[i] = scipy.stats.rankdata(row)
    ssbn = (ranked.sum(axis=0)**2).sum()

    # Correction for ties: sum of t * (t^2 - 1) over all tie groups of size t.
    # np.unique counts include singletons (t = 1), which contribute zero.
    ties = 0
    for row in X:
        _, counts = np.unique(row, return_counts=True)
        ties += np.sum(counts * (counts * counts - 1))

    # Compute Kendall's W corrected for ties
    W = (12 * ssbn - 3 * n * n * k * (k + 1) * (k + 1)) / \
        (n * n * k * (k - 1) * (k + 1) - n * ties)

    if method == 'chisq':
        # Compute the Q statistic
        Q = n * (k - 1) * W
        # Approximate the p-value
        ddof1 = k - 1
        p_unc = scipy.stats.chi2.sf(Q, ddof1)
        # Create output dataframe
        stats = pd.DataFrame({'Source': within,
                              'W': W,
                              'ddof1': ddof1,
                              'Q': Q,
                              'p-unc': p_unc,
                              }, index=['Friedman'])
    else:
        # method == 'f': compute the F statistic
        F = W * (n - 1) / (1 - W)
        # Approximate the p-value
        ddof1 = k - 1 - 2 / n
        ddof2 = (n - 1) * ddof1
        p_unc = scipy.stats.f.sf(F, ddof1, ddof2)
        # Create output dataframe
        stats = pd.DataFrame({'Source': within,
                              'W': W,
                              'ddof1': ddof1,
                              'ddof2': ddof2,
                              'F': F,
                              'p-unc': p_unc,
                              }, index=['Friedman'])

    return _postprocess_dataframe(stats)
def cochran(data=None, dv=None, within=None, subject=None):
    """Cochran Q test. A special case of the Friedman test when the dependent
    variable is binary.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Long-format DataFrame. The caller's object is left untouched.
    dv : string
        Name of column containing the binary dependent variable.
    within : string
        Name of column containing the within-subject factor.
    subject : string
        Name of column containing the subject identifier.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'Source'``: Name of the within-subject factor
        * ``'dof'``: degrees of freedom
        * ``'Q'``: The Cochran Q statistic
        * ``'p-unc'``: Uncorrected p-value

    Notes
    -----
    The Cochran Q test [1]_ is a non-parametric test for ANOVA with repeated
    measures where the dependent variable is binary.

    Data are expected to be in long-format. NaN are automatically removed
    from the data.

    The Q statistics is defined as:

    .. math:: Q = \\frac{(r-1)(r\\sum_j^rx_j^2-N^2)}{rN-\\sum_i^nx_i^2}

    where :math:`N` is the total sum of all observations, :math:`j=1,...,r`
    where :math:`r` is the number of repeated measures, :math:`i=1,...,n` where
    :math:`n` is the number of observations per condition.

    The p-value is then approximated using a chi-square distribution with
    :math:`r-1` degrees of freedom:

    .. math:: Q \\sim \\chi^2(r-1)

    References
    ----------
    .. [1] Cochran, W.G., 1950. The comparison of percentages in matched
       samples. Biometrika 37, 256–266.
       https://doi.org/10.1093/biomet/37.3-4.256

    Examples
    --------
    Compute the Cochran Q test for repeated measurements.

    >>> from pingouin import cochran, read_dataset
    >>> df = read_dataset('cochran')
    >>> cochran(data=df, dv='Energetic', within='Time', subject='Subject')
            Source  dof         Q     p-unc
    cochran   Time    2  6.705882  0.034981
    """
    # Check data
    _check_dataframe(dv=dv, within=within, data=data, subject=subject,
                     effects='within')

    # Convert Categorical columns to string.
    # This is important otherwise all the groupby will return different results
    # unless we specify .groupby(..., observed = True).
    # Work on a copy so that the caller's DataFrame is never modified.
    cat_cols = [c for c in (subject, within)
                if data[c].dtype.name == 'category']
    if cat_cols:
        data = data.copy()
        for c in cat_cols:
            data[c] = data[c].astype(str)

    # Remove NaN
    if data[dv].isnull().any():
        data = remove_rm_na(dv=dv, within=within, subject=subject,
                            data=data[[subject, within, dv]])

    # Per-condition and per-subject totals (each computed once)
    cond_sums = data.groupby(within)[dv].sum()
    subj_sums = data.groupby(subject)[dv].sum()
    total = cond_sums.sum()
    k = data[within].nunique()
    dof = k - 1

    # Q statistic and p-value
    q = (dof * (k * np.sum(cond_sums**2) - total**2)) / \
        (k * total - np.sum(subj_sums**2))
    p_unc = scipy.stats.chi2.sf(q, dof)

    # Create output dataframe
    stats = pd.DataFrame({'Source': within,
                          'dof': dof,
                          'Q': q,
                          'p-unc': p_unc,
                          }, index=['cochran'])

    return _postprocess_dataframe(stats)
def wilcoxon(x, y, tail='two-sided'):
    """Wilcoxon signed-rank test, the non-parametric counterpart of the
    paired T-test.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. ``x`` and ``y`` must be
        related (e.g repeated measures) and, therefore, have the same number
        of samples. Missing values are removed listwise before the test.
    tail : string
        One of ``'two-sided'``, ``'one-sided'``, ``'greater'`` or ``'less'``.
        With ``tail='one-sided'`` the direction is chosen from the sign of
        the median of the differences: ``'less'`` if
        ``np.median(x - y) < 0``, ``'greater'`` otherwise.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'W-val'``: W-value
        * ``'tail'``: tail of the test that was actually used
        * ``'p-val'``: p-value
        * ``'RBC'`` : matched pairs rank-biserial correlation (effect size)
        * ``'CLES'`` : common language effect size

    See also
    --------
    scipy.stats.wilcoxon, mwu

    Notes
    -----
    The Wilcoxon signed-rank test [1]_ tests the null hypothesis that two
    related paired samples come from the same distribution. In particular,
    it tests whether the distribution of the differences x - y is symmetric
    about zero. A continuity correction is applied by default
    (see :py:func:`scipy.stats.wilcoxon` for details).

    The matched pairs rank biserial correlation [2]_ is the simple difference
    between the proportion of favorable and unfavorable evidence; in the case
    of the Wilcoxon signed-rank test, the evidence consists of rank sums
    (Kerby 2014):

    .. math:: r = f - u

    The common language effect size is the proportion of pairs where ``x`` is
    higher than ``y``. It was first introduced by McGraw and Wong (1992) [3]_.
    Pingouin uses a brute-force version of the formula given by Vargha and
    Delaney 2000 [4]_:

    .. math:: \\text{CL} = P(X > Y) + .5 \\times P(X = Y)

    The advantages of this method are twofold. First, the brute-force
    approach pairs each observation of ``x`` to its ``y`` counterpart, and
    therefore does not require normally distributed data. Second, the formula
    takes ties into account and therefore works with ordinal data.

    When tail is ``'less'``, the CLES is then set to :math:`1 - \\text{CL}`,
    which gives the proportion of pairs where ``x`` is *lower* than ``y``.

    References
    ----------
    .. [1] Wilcoxon, F. (1945). Individual comparisons by ranking methods.
           Biometrics bulletin, 1(6), 80-83.

    .. [2] Kerby, D. S. (2014). The simple difference formula: An approach to
           teaching nonparametric correlation. Comprehensive Psychology,
           3, 11-IT.

    .. [3] McGraw, K. O., & Wong, S. P. (1992). A common language effect size
           statistic. Psychological bulletin, 111(2), 361.

    .. [4] Vargha, A., & Delaney, H. D. (2000). A Critique and Improvement of
           the “CL” Common Language Effect Size Statistics of McGraw and Wong.
           Journal of Educational and Behavioral Statistics: A Quarterly
           Publication Sponsored by the American Educational Research
           Association and the American Statistical Association, 25(2),
           101–132. https://doi.org/10.2307/1165329

    Examples
    --------
    Wilcoxon test on two related samples.

    >>> import numpy as np
    >>> import pingouin as pg
    >>> x = [20, 22, 19, 20, 22, 18, 24, 20, 19, 24, 26, 13]
    >>> y = [38, 37, 33, 29, 14, 12, 20, 22, 17, 25, 26, 16]
    >>> pg.wilcoxon(x, y, tail='two-sided')
              W-val       tail     p-val       RBC      CLES
    Wilcoxon   20.5  two-sided  0.285765 -0.378788  0.395833

    Compare with SciPy

    >>> import scipy
    >>> scipy.stats.wilcoxon(x, y, correction=True)
    WilcoxonResult(statistic=20.5, pvalue=0.2857652190231508)

    One-sided tail: one can either manually specify the alternative hypothesis

    >>> pg.wilcoxon(x, y, tail='greater')
              W-val     tail     p-val       RBC      CLES
    Wilcoxon   20.5  greater  0.876244 -0.378788  0.395833

    >>> pg.wilcoxon(x, y, tail='less')
              W-val  tail     p-val       RBC      CLES
    Wilcoxon   20.5  less  0.142883 -0.378788  0.604167

    Or simply leave it to Pingouin, using the `'one-sided'` argument, in which
    case Pingouin will look at the sign of the median of the differences
    between ``x`` and ``y`` and adjust the tail based on that:

    >>> np.median(np.array(x) - np.array(y))
    -1.5

    The median is negative, so Pingouin will test for the alternative
    hypothesis that the median of the differences is negative (= less than 0).

    >>> pg.wilcoxon(x, y, tail='one-sided')  # Equivalent to tail = 'less'
              W-val  tail     p-val       RBC      CLES
    Wilcoxon   20.5  less  0.142883 -0.378788  0.604167
    """
    # Coerce to arrays and drop pairs containing a missing value
    x, y = remove_na(np.asarray(x), np.asarray(y), paired=True)

    # Validate the requested tail
    possible_tails = ['two-sided', 'one-sided', 'greater', 'less']
    assert tail in possible_tails, 'Invalid tail argument.'
    if tail == 'one-sided':
        # Resolve the direction from the sign of the median difference
        if np.median(x - y) < 0:
            tail = 'less'
        else:
            tail = 'greater'

    # Signed-rank test with continuity correction
    w_stat, p_value = scipy.stats.wilcoxon(
        x, y, zero_method='wilcox', correction=True, alternative=tail)

    # Effect size 1: Common Language Effect Size (Vargha and Delaney 2000).
    # Every x is compared against every y; a tie counts for one half.
    # 'two-sided' and 'greater' share the same value, 'less' is its complement.
    pairwise = x[:, None] - y
    cles = np.where(pairwise == 0, 0.5, pairwise > 0).mean()
    if tail == 'less':
        cles = 1 - cles

    # Effect size 2: matched-pairs rank biserial correlation (Kerby 2014),
    # computed on the ranks of the non-zero absolute paired differences.
    nonzero = x - y
    nonzero = nonzero[nonzero != 0]
    ranks = scipy.stats.rankdata(abs(nonzero))
    rank_total = ranks.sum()
    favorable = np.sum((nonzero > 0) * ranks)
    unfavorable = np.sum((nonzero < 0) * ranks)
    rbc = favorable / rank_total - unfavorable / rank_total

    # Assemble the single-row output
    stats = pd.DataFrame({
        'W-val': w_stat,
        'tail': tail,
        'p-val': p_value,
        'RBC': rbc,
        'CLES': cles}, index=['Wilcoxon'])
    return _postprocess_dataframe(stats)
def kruskal(data=None, dv=None, between=None, detailed=False):
    """Kruskal-Wallis H-test for independent samples.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame
    dv : string
        Name of column containing the dependent variable.
    between : string
        Name of column containing the between factor.
    detailed : bool
        Accepted for backward compatibility; this implementation does not
        use it.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'Source'``: Name of the between factor
        * ``'ddof1'``: degrees of freedom
        * ``'H'``: The Kruskal-Wallis H statistic, corrected for ties
        * ``'p-unc'``: Uncorrected p-value

    Notes
    -----
    The Kruskal-Wallis H-test tests the null hypothesis that the population
    median of all of the groups are equal. It is a non-parametric version of
    ANOVA. The test works on 2 or more independent samples, which may have
    different sizes.

    Due to the assumption that H has a chi square distribution, the number of
    samples in each group must not be too small. A typical rule is that each
    sample must have at least 5 measurements.

    NaN values are automatically removed.

    Examples
    --------
    Compute the Kruskal-Wallis H-test for independent samples.

    >>> from pingouin import kruskal, read_dataset
    >>> df = read_dataset('anova')
    >>> kruskal(data=df, dv='Pain threshold', between='Hair color')
                 Source  ddof1         H     p-unc
    Kruskal  Hair color      3  10.58863  0.014172
    """
    # Check data
    _check_dataframe(dv=dv, between=between, data=data, effects='between')

    # Remove NaN values and reset index (avoid duplicate axis error)
    data = data[[dv, between]].dropna().reset_index(drop=True)

    # Extract number of groups and total sample size
    n_groups = data[between].nunique()
    n = data[dv].size

    # Rank data, dealing with ties appropriately. The ranks are kept in a
    # standalone Series rather than a new 'rank' column, so a user column that
    # happens to be called 'rank' (e.g. the between factor) is not overwritten.
    ranks = pd.Series(scipy.stats.rankdata(data[dv]), index=data.index)

    # Find the total of rank per groups
    grp = ranks.groupby(data[between], observed=True)
    sum_rk_grp = grp.sum().to_numpy()
    n_per_grp = grp.count().to_numpy()

    # Calculate chi-square statistic (H)
    H = (12 / (n * (n + 1)) * np.sum(sum_rk_grp**2 / n_per_grp)) - 3 * (n + 1)

    # Correct for ties
    H /= scipy.stats.tiecorrect(ranks.to_numpy())

    # Calculate DOF and p-value
    ddof1 = n_groups - 1
    p_unc = scipy.stats.chi2.sf(H, ddof1)

    # Create output dataframe
    stats = pd.DataFrame({'Source': between,
                          'ddof1': ddof1,
                          'H': H,
                          'p-unc': p_unc,
                          }, index=['Kruskal'])

    return _postprocess_dataframe(stats)
def cochran(data=None, dv=None, within=None, subject=None):
    """Cochran Q test. A special case of the Friedman test when the dependent
    variable is binary.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Wide-format or long-format DataFrame. The wide-format path is taken
        when ``dv``, ``within`` and ``subject`` are all left to None.
    dv : string
        Name of column containing the dependent variable (only required if
        ``data`` is in long format).
    within : string
        Name of column containing the within-subject factor (only required if
        ``data`` is in long format). Two or more within-factor are not
        currently supported.
    subject : string
        Name of column containing the subject/rater identifier (only required
        if ``data`` is in long format).

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'Source'``: Name of the within-subject factor
        * ``'dof'``: degrees of freedom
        * ``'Q'``: The Cochran Q statistic
        * ``'p-unc'``: Uncorrected p-value

    Notes
    -----
    The Cochran Q test [1]_ is a non-parametric test for ANOVA with repeated
    measures where the dependent variable is binary.

    The Q statistics is defined as:

    .. math:: Q = \\frac{(r-1)(r\\sum_j^rx_j^2-N^2)}{rN-\\sum_i^nx_i^2}

    where :math:`N` is the total sum of all observations, :math:`j=1,...,r`
    where :math:`r` is the number of repeated measures, :math:`i=1,...,n` where
    :math:`n` is the number of observations per condition.

    The p-value is then approximated using a chi-square distribution with
    :math:`r-1` degrees of freedom:

    .. math:: Q \\sim \\chi^2(r-1)

    Data can be in wide or long format. Missing values are automatically
    removed using a strict listwise approach (= complete-case analysis). In
    other words, any subject with one or more missing value(s) is completely
    removed from the dataframe prior to running the test.

    References
    ----------
    .. [1] Cochran, W.G., 1950. The comparison of percentages in matched
       samples. Biometrika 37, 256–266.
       https://doi.org/10.1093/biomet/37.3-4.256

    Examples
    --------
    Compute the Cochran Q test for repeated measurements.

    >>> from pingouin import cochran, read_dataset
    >>> df = read_dataset('cochran')
    >>> cochran(data=df, dv='Energetic', within='Time', subject='Subject')
            Source  dof         Q     p-unc
    cochran   Time    2  6.705882  0.034981

    Same but using a wide-format dataframe

    >>> df_wide = df.pivot_table(index="Subject", columns="Time",
    ...                          values="Energetic")
    >>> cochran(df_wide)
             Source  dof         Q     p-unc
    cochran  Within    2  6.705882  0.034981
    """
    # Wide-format input: no column names were given, so reshape to long format
    if dv is None and within is None and subject is None:
        assert isinstance(data, pd.DataFrame)
        # Keep numeric columns only; listwise deletion of missing values
        data = data._get_numeric_data().dropna()
        assert data.shape[0] > 2, "Data must have at least 3 non-missing rows."
        assert data.shape[1] > 1, "Data must contain at least two columns."
        data['Subj'] = np.arange(data.shape[0])
        data = data.melt(id_vars='Subj', var_name='Within', value_name='DV')
        subject, within, dv = 'Subj', 'Within', 'DV'

    # Validate the long-format dataframe
    _check_dataframe(dv=dv, within=within, data=data, subject=subject,
                     effects='within')
    assert not data[within].isnull().any(), "Cannot have missing values in `within`."
    assert not data[subject].isnull().any(), "Cannot have missing values in `subject`."

    # Round-trip through a subject x condition table. This
    # 1) makes missing cells explicit (NaN),
    # 2) collapses to the mean if multiple within factors are present,
    # 3) lets dropna perform listwise deletion (same behavior as JASP).
    wide = data.pivot_table(index=subject, columns=within, values=dv,
                            observed=True).dropna()
    data = wide.melt(ignore_index=False, value_name=dv).reset_index()

    # Totals per condition, per subject, and overall
    cond_totals = data.groupby(within, observed=True)[dv].sum()
    subj_totals = data.groupby(subject, observed=True)[dv].sum()
    grand_total = cond_totals.sum()
    k = data[within].nunique()
    dof = k - 1

    # Q statistic and its chi-square p-value
    numerator = dof * (k * np.sum(cond_totals**2) - grand_total**2)
    denominator = k * grand_total - np.sum(subj_totals**2)
    q = numerator / denominator
    p_unc = scipy.stats.chi2.sf(q, dof)

    # Single-row output
    stats = pd.DataFrame(
        {'Source': within, 'dof': dof, 'Q': q, 'p-unc': p_unc},
        index=['cochran'])
    return _postprocess_dataframe(stats)
def mwu(x, y, tail='two-sided'):
    """Mann-Whitney U Test (= Wilcoxon rank-sum test), the non-parametric
    counterpart of the independent T-test.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. ``x`` and ``y`` must be
        independent. Missing values are removed from each sample.
    tail : string
        One of ``'two-sided'``, ``'one-sided'``, ``'greater'`` or ``'less'``.
        With ``tail='one-sided'`` the direction is chosen by comparing the
        medians: ``'less'`` if median(``x``) < median(``y``), ``'greater'``
        otherwise.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'U-val'``: U-value
        * ``'tail'``: tail of the test that was actually used
        * ``'p-val'``: p-value
        * ``'RBC'`` : rank-biserial correlation
        * ``'CLES'`` : common language effect size

    See also
    --------
    scipy.stats.mannwhitneyu, wilcoxon, ttest

    Notes
    -----
    The Mann–Whitney U test [1]_ (also called Wilcoxon rank-sum test) is a
    non-parametric test of the null hypothesis that it is equally likely that
    a randomly selected value from one sample will be less than or greater
    than a randomly selected value from a second sample. The test assumes
    that the two samples are independent. This test corrects for ties and by
    default uses a continuity correction
    (see :py:func:`scipy.stats.mannwhitneyu` for details).

    The rank biserial correlation [2]_ is the difference between
    the proportion of favorable evidence minus the proportion of unfavorable
    evidence.

    The common language effect size is the proportion of pairs where ``x`` is
    higher than ``y``. It was first introduced by McGraw and Wong (1992) [3]_.
    Pingouin uses a brute-force version of the formula given by Vargha and
    Delaney 2000 [4]_:

    .. math:: \\text{CL} = P(X > Y) + .5 \\times P(X = Y)

    The advantages of this method are twofold. First, the brute-force
    approach pairs each observation of ``x`` to its ``y`` counterpart, and
    therefore does not require normally distributed data. Second, the formula
    takes ties into account and therefore works with ordinal data.

    When tail is ``'less'``, the CLES is then set to :math:`1 - \\text{CL}`,
    which gives the proportion of pairs where ``x`` is *lower* than ``y``.

    References
    ----------
    .. [1] Mann, H. B., & Whitney, D. R. (1947). On a test of whether one of
           two random variables is stochastically larger than the other.
           The annals of mathematical statistics, 50-60.

    .. [2] Kerby, D. S. (2014). The simple difference formula: An approach to
           teaching nonparametric correlation. Comprehensive Psychology,
           3, 11-IT.

    .. [3] McGraw, K. O., & Wong, S. P. (1992). A common language effect size
           statistic. Psychological bulletin, 111(2), 361.

    .. [4] Vargha, A., & Delaney, H. D. (2000). A Critique and Improvement of
           the “CL” Common Language Effect Size Statistics of McGraw and Wong.
           Journal of Educational and Behavioral Statistics: A Quarterly
           Publication Sponsored by the American Educational Research
           Association and the American Statistical Association, 25(2),
           101–132. https://doi.org/10.2307/1165329

    Examples
    --------
    >>> import numpy as np
    >>> import pingouin as pg
    >>> np.random.seed(123)
    >>> x = np.random.uniform(low=0, high=1, size=20)
    >>> y = np.random.uniform(low=0.2, high=1.2, size=20)
    >>> pg.mwu(x, y, tail='two-sided')
         U-val       tail    p-val    RBC    CLES
    MWU   97.0  two-sided  0.00556  0.515  0.2425

    Compare with SciPy

    >>> import scipy
    >>> scipy.stats.mannwhitneyu(x, y, use_continuity=True,
    ...                          alternative='two-sided')
    MannwhitneyuResult(statistic=97.0, pvalue=0.0055604599321374135)

    One-sided tail: one can either manually specify the alternative hypothesis

    >>> pg.mwu(x, y, tail='greater')
         U-val     tail     p-val    RBC    CLES
    MWU   97.0  greater  0.997442  0.515  0.2425

    >>> pg.mwu(x, y, tail='less')
         U-val  tail    p-val    RBC    CLES
    MWU   97.0  less  0.00278  0.515  0.7575

    Or simply leave it to Pingouin, using the `'one-sided'` argument, in which
    case Pingouin will compare the medians of ``x`` and ``y`` and select the
    most appropriate tail based on that:

    >>> # Since np.median(x) < np.median(y), this is equivalent to tail='less'
    >>> pg.mwu(x, y, tail='one-sided')
         U-val  tail    p-val    RBC    CLES
    MWU   97.0  less  0.00278  0.515  0.7575
    """
    # Coerce to arrays and drop missing values independently in each sample
    x, y = remove_na(np.asarray(x), np.asarray(y), paired=False)

    # Validate the requested tail
    possible_tails = ['two-sided', 'one-sided', 'greater', 'less']
    assert tail in possible_tails, 'Invalid tail argument.'
    if tail == 'one-sided':
        # Resolve the direction by comparing the two medians
        if np.median(x) < np.median(y):
            tail = 'less'
        else:
            tail = 'greater'

    # Rank-sum test with continuity correction
    u_stat, p_value = scipy.stats.mannwhitneyu(
        x, y, use_continuity=True, alternative=tail)

    # Effect size 1: Common Language Effect Size (Vargha and Delaney 2000).
    # Every x is compared against every y; a tie counts for one half.
    # 'two-sided' and 'greater' share the same value, 'less' is its complement.
    pairwise = x[:, None] - y
    cles = np.where(pairwise == 0, 0.5, pairwise > 0).mean()
    if tail == 'less':
        cles = 1 - cles

    # Effect size 2: rank biserial correlation (Wendt 1972).
    # The number of (x, y) pairs equals x.size * y.size.
    n_pairs = pairwise.size
    rbc = 1 - (2 * u_stat) / n_pairs

    # Assemble the single-row output
    stats = pd.DataFrame({
        'U-val': u_stat,
        'tail': tail,
        'p-val': p_value,
        'RBC': rbc,
        'CLES': cles}, index=['MWU'])
    return _postprocess_dataframe(stats)
def friedman(data=None, dv=None, within=None, subject=None, method='chisq'):
    """Friedman test for repeated measurements.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame. Both wide and long-format dataframe are supported for this
        test. The wide-format path is taken when ``dv``, ``within`` and
        ``subject`` are all left to None.
    dv : string
        Name of column containing the dependent variable (only required if
        ``data`` is in long format).
    within : string
        Name of column containing the within-subject factor (only required if
        ``data`` is in long format). Two or more within-factor are not
        currently supported.
    subject : string
        Name of column containing the subject/rater identifier (only required
        if ``data`` is in long format).
    method : string
        Statistical test to perform. Must be ``'chisq'`` (chi-square test)
        or ``'f'`` (F test). See notes below for explanation.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'Source'``: Name of the within-subject factor
        * ``'W'``: Kendall's coefficient of concordance, corrected for ties

        If ``method='chisq'``

        * ``'ddof1'``: degrees of freedom
        * ``'Q'``: The Friedman chi-square statistic, corrected for ties
        * ``'p-unc'``: Uncorrected p-value of the chi squared test

        If ``method='f'``

        * ``'ddof1'``: degrees of freedom of the numerator
        * ``'ddof2'``: degrees of freedom of the denominator
        * ``'F'``: The Friedman F statistic, corrected for ties
        * ``'p-unc'``: Uncorrected p-value of the F test

    Raises
    ------
    ValueError
        If ``method`` is neither ``'chisq'`` nor ``'f'``.

    Notes
    -----
    The Friedman test is used for non-parametric (rank-based) one-way
    repeated measures ANOVA. It is equivalent to the test of significance of
    Kendall's coefficient of concordance (Kendall's W). Most commonly a Q
    statistic, which has asymptotical chi-squared distribution, is computed
    and used for testing.

    However, the chi-squared test tends to be overly conservative for small
    numbers of samples and/or repeated measures, in which case a F-test is
    more adequate [1]_.

    Data can be in wide or long format. Missing values are automatically
    removed using a strict listwise approach (= complete-case analysis). In
    other words, any subject with one or more missing value(s) is completely
    removed from the dataframe prior to running the test.

    References
    ----------
    .. [1] Marozzi, M. (2014). Testing for concordance between several
           criteria. Journal of Statistical Computation and Simulation,
           84(9), 1843–1850. https://doi.org/10.1080/00949655.2013.766189

    .. [2] https://www.real-statistics.com/anova-repeated-measures/friedman-test/

    Examples
    --------
    Compute the Friedman test for repeated measurements, using a wide-format
    dataframe

    >>> import pandas as pd
    >>> import pingouin as pg
    >>> df = pd.DataFrame({
    ...    'white': {0: 10, 1: 8, 2: 7, 3: 9, 4: 7, 5: 4, 6: 5, 7: 6, 8: 5,
    ...              9: 10, 10: 4, 11: 7},
    ...    'red': {0: 7, 1: 5, 2: 8, 3: 6, 4: 5, 5: 7, 6: 9, 7: 6, 8: 4,
    ...            9: 6, 10: 7, 11: 3},
    ...    'rose': {0: 8, 1: 5, 2: 6, 3: 4, 4: 7, 5: 5, 6: 3, 7: 7, 8: 6,
    ...             9: 4, 10: 4, 11: 3}})
    >>> pg.friedman(df)
              Source         W  ddof1    Q     p-unc
    Friedman  Within  0.083333      2  2.0  0.367879

    Compare with SciPy

    >>> from scipy.stats import friedmanchisquare
    >>> friedmanchisquare(*df.to_numpy().T)
    FriedmanchisquareResult(statistic=1.9999999999999893, pvalue=0.3678794411714444)

    Using a long-format dataframe

    >>> df_long = df.melt(ignore_index=False).reset_index()
    >>> pg.friedman(data=df_long, dv="value", within="variable",
    ...             subject="index")
                Source         W  ddof1    Q     p-unc
    Friedman  variable  0.083333      2  2.0  0.367879

    Using the F-test method

    >>> pg.friedman(df, method="f")
              Source         W     ddof1      ddof2    F     p-unc
    Friedman  Within  0.083333  1.833333  20.166667  1.0  0.378959
    """
    # Convert from wide to long-format, if needed
    if all(v is None for v in (dv, within, subject)):
        assert isinstance(data, pd.DataFrame)
        # Keep numeric columns only; listwise deletion of missing values
        data = data._get_numeric_data().dropna()
        assert data.shape[0] > 2, "Data must have at least 3 non-missing rows."
        assert data.shape[1] > 1, "Data must contain at least two columns."
        data['Subj'] = np.arange(data.shape[0])
        data = data.melt(id_vars='Subj', var_name='Within', value_name='DV')
        subject, within, dv = 'Subj', 'Within', 'DV'

    # Check dataframe
    _check_dataframe(dv=dv, within=within, data=data, subject=subject,
                     effects='within')
    assert not data[within].isnull().any(), "Cannot have missing values in `within`."
    assert not data[subject].isnull().any(), "Cannot have missing values in `subject`."

    # Fail early with a clear message: without this check an unknown method
    # falls through both branches below and ends in an UnboundLocalError.
    if method not in ('chisq', 'f'):
        raise ValueError("method must be 'chisq' or 'f'.")

    # Pivot the table to a wide-format dataframe. This has several effects:
    # 1) Force missing values to be explicit (a NaN cell is created)
    # 2) Automatic collapsing to the mean if multiple within factors are present
    # 3) If using dropna, remove rows with missing values (listwise deletion).
    # The latter is the same behavior as JASP (= strict complete-case analysis).
    data_piv = data.pivot_table(index=subject, columns=within, values=dv,
                                observed=True)
    data_piv = data_piv.dropna()

    # Extract data in numpy array and calculate ranks within each subject
    X = data_piv.to_numpy()
    n, k = X.shape
    ranked = scipy.stats.rankdata(X, axis=1)
    ssbn = (ranked.sum(axis=0)**2).sum()

    # Correction for ties: sum of t * (t^2 - 1) over all tie groups of size t.
    # np.unique counts include singletons (t = 1), which contribute zero.
    ties = 0
    for row in X:
        _, counts = np.unique(row, return_counts=True)
        ties += np.sum(counts * (counts * counts - 1))

    # Compute Kendall's W corrected for ties
    W = (12 * ssbn - 3 * n**2 * k * (k + 1)**2) / \
        (n**2 * k * (k - 1) * (k + 1) - n * ties)

    if method == 'chisq':
        # Compute the Q statistic
        Q = n * (k - 1) * W
        # Approximate the p-value
        ddof1 = k - 1
        p_unc = scipy.stats.chi2.sf(Q, ddof1)
        # Create output dataframe
        stats = pd.DataFrame({
            'Source': within,
            'W': W,
            'ddof1': ddof1,
            'Q': Q,
            'p-unc': p_unc}, index=['Friedman'])
    else:
        # method == 'f': compute the F statistic
        F = W * (n - 1) / (1 - W)
        # Approximate the p-value
        ddof1 = k - 1 - 2 / n
        ddof2 = (n - 1) * ddof1
        p_unc = scipy.stats.f.sf(F, ddof1, ddof2)
        # Create output dataframe
        stats = pd.DataFrame({
            'Source': within,
            'W': W,
            'ddof1': ddof1,
            'ddof2': ddof2,
            'F': F,
            'p-unc': p_unc}, index=['Friedman'])

    return _postprocess_dataframe(stats)
def wilcoxon(x, y=None, alternative='two-sided', **kwargs):
    """
    Wilcoxon signed-rank test (non-parametric counterpart of the paired T-test).

    Parameters
    ----------
    x : array_like
        One-dimensional array holding either the first set of measurements
        (``y`` is then the second set) or the pre-computed paired differences
        (``y`` must then be left unspecified).
    y : array_like
        One-dimensional array holding the second set of measurements. Leave it
        to None when ``x`` already contains the differences.
    alternative : string
        Alternative hypothesis (tail of the test). One of "two-sided"
        (default), "greater" or "less".
        See :py:func:`scipy.stats.wilcoxon` for more details.
    **kwargs : dict
        Extra keyword arguments forwarded to :py:func:`scipy.stats.wilcoxon`.
        The legacy ``tail`` keyword is rejected with a ``ValueError``.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'W-val'``: W-value
        * ``'alternative'``: tail of the test
        * ``'p-val'``: p-value
        * ``'RBC'`` : matched pairs rank-biserial correlation (effect size)
        * ``'CLES'`` : common language effect size

    See also
    --------
    scipy.stats.wilcoxon, mwu

    Notes
    -----
    The Wilcoxon signed-rank test [1]_ tests the null hypothesis that two
    related paired samples come from the same distribution, i.e. that the
    distribution of the differences ``x - y`` is symmetric about zero.

    Missing values are removed before the test is run (pairwise when ``y`` is
    given).

    .. important:: Unless ``correction`` is passed explicitly, this function
        calls :py:func:`scipy.stats.wilcoxon` with ``correction=True``
        (continuity correction). The p-values may therefore differ slightly
        from a plain call to :py:func:`scipy.stats.wilcoxon`.

    Two effect sizes are reported along with the test statistic and p-value.

    The matched pairs rank biserial correlation [2]_ is the simple difference
    between the proportion of favorable and unfavorable evidence; for the
    signed-rank test the evidence consists of rank sums (Kerby 2014):

    .. math:: r = f - u

    Zero differences are discarded before ranking.

    The common language effect size (McGraw and Wong 1992 [3]_) is computed
    with a brute-force version of the formula of Vargha and Delaney 2000 [4]_:

    .. math:: \\text{CL} = P(X > Y) + .5 \\times P(X = Y)

    The implementation evaluates every combination of one value of ``x`` with
    one value of ``y``; ties count for one half, so the formula also works
    with ordinal data and does not require normally distributed data. When
    ``alternative='less'``, the CLES is set to :math:`1 - \\text{CL}`. The CLES
    is NaN when ``y`` is not provided, because it requires the raw data.

    References
    ----------
    .. [1] Wilcoxon, F. (1945). Individual comparisons by ranking methods.
           Biometrics bulletin, 1(6), 80-83.

    .. [2] Kerby, D. S. (2014). The simple difference formula: An approach to
           teaching nonparametric correlation. Comprehensive Psychology,
           3, 11-IT.

    .. [3] McGraw, K. O., & Wong, S. P. (1992). A common language effect size
           statistic. Psychological bulletin, 111(2), 361.

    .. [4] Vargha, A., & Delaney, H. D. (2000). A Critique and Improvement of
           the “CL” Common Language Effect Size Statistics of McGraw and Wong.
           Journal of Educational and Behavioral Statistics: A Quarterly
           Publication Sponsored by the American Educational Research
           Association and the American Statistical Association, 25(2),
           101–132. https://doi.org/10.2307/1165329

    Examples
    --------
    Wilcoxon test on two related samples.

    >>> import numpy as np
    >>> import pingouin as pg
    >>> x = np.array([20, 22, 19, 20, 22, 18, 24, 20, 19, 24, 26, 13])
    >>> y = np.array([38, 37, 33, 29, 14, 12, 20, 22, 17, 25, 26, 16])
    >>> pg.wilcoxon(x, y, alternative='two-sided')
              W-val alternative     p-val       RBC      CLES
    Wilcoxon   20.5   two-sided  0.285765 -0.378788  0.395833

    Same but using pre-computed differences. However, the CLES effect size
    cannot be computed as it requires the raw data.

    >>> pg.wilcoxon(x - y)
              W-val alternative     p-val       RBC  CLES
    Wilcoxon   20.5   two-sided  0.285765 -0.378788   NaN

    Compare with SciPy

    >>> import scipy
    >>> scipy.stats.wilcoxon(x, y)
    WilcoxonResult(statistic=20.5, pvalue=0.2661660677806492)

    The p-value is not exactly similar to Pingouin. This is because Pingouin
    automatically applies a continuity correction. Disabling it gives the same
    p-value as scipy:

    >>> pg.wilcoxon(x, y, alternative='two-sided', correction=False)
              W-val alternative     p-val       RBC      CLES
    Wilcoxon   20.5   two-sided  0.266166 -0.378788  0.395833

    One-sided test

    >>> pg.wilcoxon(x, y, alternative='greater')
              W-val alternative     p-val       RBC      CLES
    Wilcoxon   20.5     greater  0.876244 -0.378788  0.395833

    >>> pg.wilcoxon(x, y, alternative='less')
              W-val alternative     p-val       RBC      CLES
    Wilcoxon   20.5        less  0.142883 -0.378788  0.604167
    """
    x = np.asarray(x)
    has_y = y is not None

    # Drop missing values: pairwise when both samples are given
    if has_y:
        y = np.asarray(y)
        x, y = remove_na(x, y, paired=True)
    else:
        x = x[~np.isnan(x)]

    # Validate the requested tail and reject the pre-0.4.0 keyword
    assert alternative in ['two-sided', 'greater', 'less'], (
        "Alternative must be one of 'two-sided' (default), 'greater' or 'less'.")
    if "tail" in kwargs:
        raise ValueError(
            "Since Pingouin 0.4.0, the 'tail' argument has been renamed to 'alternative'.")

    # The continuity correction is on unless the caller decides otherwise
    kwargs.setdefault("correction", True)
    wval, pval = scipy.stats.wilcoxon(x=x, y=y, alternative=alternative, **kwargs)

    if has_y:
        # Common language effect size (Vargha and Delaney 2000), computed on
        # all (x_i, y_j) combinations with ties weighted 0.5.
        # 'two-sided' yields the same value as 'greater'.
        pairwise = x[:, None] - y
        cles = np.where(pairwise == 0, 0.5, pairwise > 0).mean()
        if alternative == 'less':
            cles = 1 - cles
        deltas = x - y
        deltas = deltas[deltas != 0]
    else:
        # Without the raw samples the CLES is undefined
        cles = np.nan
        deltas = x[x != 0]

    # Matched-pairs rank biserial correlation (Kerby 2014)
    ranks = scipy.stats.rankdata(np.abs(deltas))
    rank_total = ranks.sum()
    favorable = np.sum((deltas > 0) * ranks)
    unfavorable = np.sum((deltas < 0) * ranks)
    rbc = favorable / rank_total - unfavorable / rank_total

    # Assemble the one-row summary
    summary = {
        'W-val': wval,
        'alternative': alternative,
        'p-val': pval,
        'RBC': rbc,
        'CLES': cles,
    }
    stats = pd.DataFrame(summary, index=['Wilcoxon'])
    return _postprocess_dataframe(stats)
def friedman(data=None, dv=None, within=None, subject=None):
    """Friedman test for repeated measurements.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame in long-format. It is not modified by this function.
    dv : string
        Name of column containing the dependent variable.
    within : string
        Name of column containing the within-subject factor.
    subject : string
        Name of column containing the subject identifier.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'Source'``: Name of the within-subject factor
        * ``'ddof1'``: degrees of freedom (number of levels minus one)
        * ``'Q'``: The Friedman Q statistic, corrected for ties
        * ``'p-unc'``: Uncorrected p-value

    Notes
    -----
    The Friedman test is used for one-way repeated measures ANOVA by ranks.

    Data are expected to be in long-format.

    Only the ``subject``, ``within`` and ``dv`` columns are used. If several
    rows share the same subject and within-level (for instance because the
    dataset contains other within-subject factors), the dependent variable is
    automatically collapsed to the mean (same behavior as the ezANOVA
    R package). As such, results can differ from those of JASP. If you can,
    always double-check the results.

    Due to the assumption that the test statistic has a chi squared
    distribution, the p-value is only reliable for n > 10 and more than 6
    repeated measurements.

    NaN values are automatically removed.

    Examples
    --------
    Compute the Friedman test for repeated measurements.

    >>> from pingouin import friedman, read_dataset
    >>> df = read_dataset('rm_anova')
    >>> friedman(data=df, dv='DesireToKill', within='Disgustingness',
    ...          subject='Subject')
                      Source  ddof1         Q     p-unc
    Friedman  Disgustingness      1  9.227848  0.002384
    """
    # Check data
    _check_dataframe(dv=dv, within=within, data=data, subject=subject,
                     effects='within')

    # Work on a private copy restricted to the columns of interest. This
    # (1) keeps the caller's dataframe untouched by the dtype conversion below
    # and (2) prevents unrelated non-numeric columns from reaching the mean
    # aggregation, which raises a TypeError on recent pandas versions.
    data = data[[subject, within, dv]].copy()

    # Convert Categorical columns to string, otherwise the groupby calls
    # would create every category combination unless observed=True is used.
    for col in (subject, within):
        if data[col].dtype.name == 'category':
            data[col] = data[col].astype(str)

    # Collapse to the mean of the dependent variable
    data = data.groupby([subject, within])[dv].mean().reset_index()

    # Remove NaN
    if data[dv].isnull().any():
        data = remove_rm_na(dv=dv, within=within, subject=subject, data=data)

    # Build the (n subjects, k conditions) matrix of observations
    grp = data.groupby(within)[dv]
    rm = list(data[within].unique())
    k = len(rm)
    X = np.array([grp.get_group(r).to_numpy() for r in rm]).T
    n = X.shape[0]

    # Rank the conditions within each subject (row-wise)
    ranked = scipy.stats.rankdata(X, axis=1)
    ssbn = (ranked.sum(axis=0)**2).sum()

    # Compute the test statistic
    Q = (12 / (n * k * (k + 1))) * ssbn - 3 * n * (k + 1)

    # Correct for ties: each group of t identical values within a subject
    # contributes t * (t^2 - 1). Values that are not repeated have t = 1 and
    # therefore contribute nothing.
    ties = 0
    for row in X:
        _, counts = np.unique(row, return_counts=True)
        ties += int((counts * (counts * counts - 1)).sum())
    c = 1 - ties / float(k * (k * k - 1) * n)
    Q /= c

    # Approximate the p-value
    ddof1 = k - 1
    p_unc = scipy.stats.chi2.sf(Q, ddof1)

    # Create output dataframe
    stats = pd.DataFrame(
        {
            'Source': within,
            'ddof1': ddof1,
            'Q': Q,
            'p-unc': p_unc,
        },
        index=['Friedman'])
    return _postprocess_dataframe(stats)
def chi2_mcnemar(data, x, y, correction=True):
    """
    Exact and approximated versions of McNemar's test.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        The dataframe containing the occurrences for the test. Each row must
        represent either a subject or a pair of subjects.
    x, y : string
        The variables names for the McNemar's test. Must be names of columns
        in ``data``.

        If each row of ``data`` represents a subject, then ``x`` and ``y`` must
        be columns containing dichotomous measurements in two different
        contexts. For instance: the presence of pain before and after a certain
        treatment.

        If each row of ``data`` represents a pair of subjects, then ``x`` and
        ``y`` must be columns containing dichotomous measurements for each of
        the subjects. For instance: a positive response to a certain drug in
        the control group and in the test group, supposing that each pair
        contains a subject in each group.

        The 2x2 crosstab is created using the
        :py:func:`pingouin.dichotomous_crosstab` function.

        .. warning:: Missing values are not allowed.

    correction : bool
        Whether to apply the correction for continuity (Edwards, A. 1948).

    Returns
    -------
    observed : :py:class:`pandas.DataFrame`
        The observed contingency table of frequencies.
    stats : :py:class:`pandas.DataFrame`
        The test summary:

        * ``'chi2'``: The test statistic
        * ``'dof'``: The degree of freedom
        * ``'p-approx'``: The approximated p-value
        * ``'p-exact'``: The exact p-value

    Raises
    ------
    ValueError
        If ``x`` or ``y`` contain missing values, or if both discordant cells
        of the contingency table are zero.

    Notes
    -----
    The McNemar's test is compatible with dichotomous paired data, generally
    used to assert the effectiveness of a certain procedure, such as a
    treatment or the use of a drug. "Dichotomous" means that the values of the
    measurements are binary. "Paired data" means that each measurement is done
    twice, either on the same subject in two different moments or in two
    similar (paired) subjects from different groups (e.g.: control/test).

    To illustrate the idea behind the test, suppose that we wanted to compare
    the effectiveness of two different treatments (X and Y) for athlete's foot
    on a certain group of `n` people. To achieve this, we measured their
    responses to such treatments on each foot. The observed data summary was:

    * Number of people with good responses to X and Y: `a`
    * Number of people with good response to X and bad response to Y: `b`
    * Number of people with bad response to X and good response to Y: `c`
    * Number of people with bad responses to X and Y: `d`

    Now consider the two groups:

    1. The group of people who had good response to X (`a` + `b` subjects)
    2. The group of people who had good response to Y (`a` + `c` subjects)

    If the treatments have the same effectiveness, we should expect the
    probabilities of having good responses to be the same, regardless of the
    treatment. Mathematically, such statement can be translated into the
    following equation:

    .. math::

        \\frac{a+b}{n} = \\frac{a+c}{n} \\Rightarrow b = c

    Thus, this test should indicate higher statistical significances for higher
    distances between `b` and `c` (McNemar, Q. 1947):

    .. math::

        \\chi^2 = \\frac{(b - c)^2}{b + c}

    When ``correction`` is True, 1 is subtracted from :math:`|b - c|` before
    squaring. The exact p-value is twice the binomial cumulative probability
    of the smaller discordant count (with probability 0.5), capped at 1.

    References
    ----------
    * Edwards, A. L. (1948). Note on the "correction for continuity" in
      testing the significance of the difference between correlated
      proportions. Psychometrika, 13(3), 185-187.

    * McNemar, Q. (1947). Note on the sampling error of the difference
      between correlated proportions or percentages. Psychometrika, 12(2),
      153-157.

    Examples
    --------
    >>> import pingouin as pg
    >>> data = pg.read_dataset('chi2_mcnemar')
    >>> observed, stats = pg.chi2_mcnemar(data, 'treatment_X', 'treatment_Y')
    >>> observed
    treatment_Y   0   1
    treatment_X
    0            20  40
    1             8  12

    In this case, `c` (40) seems to be significantly greater than `b` (8).
    The McNemar test should be sensitive to this.

    >>> stats
                  chi2  dof  p-approx   p-exact
    mcnemar  20.020833    1  0.000008  0.000003
    """
    # Python code initially inspired by statsmodel's mcnemar
    columns = (x, y)
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert all(isinstance(name, str) for name in columns), \
        'procedures must contain strings, only.'
    assert all(name in data.columns for name in columns), \
        'columns are not in dataframe.'
    if any(data[name].isna().any() for name in columns):
        raise ValueError('Null values are not allowed.')

    observed = dichotomous_crosstab(data, x, y)

    # Careful, the order of b and c is inverted compared to wikipedia
    # because the columns / rows of the crosstab is [0, 1] and not [1, 0].
    b = observed.at[1, 0]
    c = observed.at[0, 1]
    n_discordants = b + c

    if b == 0 and c == 0:
        raise ValueError(
            'McNemar\'s test does not work if the secondary '
            'diagonal of the observed data summary does not '
            'have values different from 0.')

    # Approximated (chi-squared) statistic, optionally continuity-corrected
    chi2 = (abs(b - c) - int(correction))**2 / n_discordants
    # Exact two-sided binomial p-value, capped at 1
    pexact = min(1, 2 * binom.cdf(min(b, c), n_discordants, 0.5))

    stats = pd.DataFrame(
        {
            'chi2': chi2,
            'dof': 1,
            'p-approx': sp_chi2.sf(chi2, 1),
            'p-exact': pexact,
        },
        index=['mcnemar'])
    return observed, _postprocess_dataframe(stats)
def chi2_independence(data, x, y, correction=True):
    """
    Chi-squared independence tests between two categorical variables.

    The test is computed for different values of :math:`\\lambda`: 1, 2/3, 0,
    -1/2, -1 and -2 (Cressie and Read, 1984).

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        The dataframe containing the occurrences for the test.
    x, y : string
        The variables names for the Chi-squared test. Must be names of columns
        in ``data``.
    correction : bool
        Whether to apply Yates' correction when the degree of freedom of the
        observed contingency table is 1 (Yates 1934).

    Returns
    -------
    expected : :py:class:`pandas.DataFrame`
        The expected contingency table of frequencies.
    observed : :py:class:`pandas.DataFrame`
        The (corrected or not) observed contingency table of frequencies.
    stats : :py:class:`pandas.DataFrame`
        The test summary, with one row per statistic and the following
        columns:

        * ``'test'``: The statistic name
        * ``'lambda'``: The :math:`\\lambda` value used for the power
          divergence statistic
        * ``'chi2'``: The test statistic
        * ``'dof'``: The degrees of freedom of the contingency table
        * ``'pval'``: The p-value of the test
        * ``'cramer'``: The Cramer's V effect size
        * ``'power'``: The statistical power of the test (alpha = 0.05)

    Raises
    ------
    ValueError
        If the observed contingency table is empty.

    Notes
    -----
    From Wikipedia:

    *The chi-squared test is used to determine whether there is a significant
    difference between the expected frequencies and the observed frequencies
    in one or more categories.*

    As application examples, this test can be used to *i*) evaluate the
    quality of a categorical variable in a classification problem or to *ii*)
    check the similarity between two categorical variables. In the first
    example, a good categorical predictor and the class column should present
    high :math:`\\chi^2` and low p-value. In the second example, similar
    categorical variables should present low :math:`\\chi^2` and high p-value.

    This function is a wrapper around the
    :py:func:`scipy.stats.power_divergence` function.

    The sample size used for Cramer's V and for the power is the grand total
    of the observed contingency table, i.e. the number of rows of ``data``
    that are actually counted in the table.

    When the table has zero degrees of freedom, the statistic is set to 0, the
    p-value to 1, and the effect size and power to NaN.

    .. warning :: As a general guideline for the consistency of this test, the
        observed and the expected contingency tables should not have cells
        with frequencies lower than 5. A warning is issued when this is not
        the case.

    References
    ----------
    * Cressie, N., & Read, T. R. (1984). Multinomial goodness‐of‐fit
      tests. Journal of the Royal Statistical Society: Series B
      (Methodological), 46(3), 440-464.

    * Yates, F. (1934). Contingency Tables Involving Small Numbers and the
      :math:`\\chi^2` Test. Supplement to the Journal of the Royal
      Statistical Society, 1, 217-235.

    Examples
    --------
    Let's see if gender is a good categorical predictor for the presence of
    heart disease.

    >>> import pingouin as pg
    >>> data = pg.read_dataset('chi2_independence')
    >>> data['sex'].value_counts(ascending=True)
    0     96
    1    207
    Name: sex, dtype: int64

    If gender is not a good predictor for heart disease, we should expect the
    same 96:207 ratio across the target classes.

    >>> expected, observed, stats = pg.chi2_independence(data, x='sex',
    ...                                                  y='target')
    >>> expected
    target          0           1
    sex
    0       43.722772   52.277228
    1       94.277228  112.722772

    Let's see what the data tells us.

    >>> observed
    target      0     1
    sex
    0        24.5  71.5
    1       113.5  93.5

    The proportion is lower on the class 0 and higher on the class 1. The
    tests should be sensitive to this difference.

    >>> stats.round(3)
                     test  lambda    chi2  dof  pval  cramer  power
    0             pearson   1.000  22.717  1.0   0.0   0.274  0.997
    1        cressie-read   0.667  22.931  1.0   0.0   0.275  0.998
    2      log-likelihood   0.000  23.557  1.0   0.0   0.279  0.998
    3       freeman-tukey  -0.500  24.220  1.0   0.0   0.283  0.998
    4  mod-log-likelihood  -1.000  25.071  1.0   0.0   0.288  0.999
    5              neyman  -2.000  27.458  1.0   0.0   0.301  0.999

    Very low p-values indeed. The gender qualifies as a good predictor for the
    presence of heart disease on this dataset.
    """
    # Python code inspired by SciPy's chi2_contingency
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert isinstance(x, str), 'x must be a string.'
    assert isinstance(y, str), 'y must be a string.'
    assert all(col in data.columns for col in (x, y)),\
        'columns are not in dataframe.'
    assert isinstance(correction, bool), 'correction must be a boolean.'

    observed = pd.crosstab(data[x], data[y])

    if observed.size == 0:
        raise ValueError('No data; observed has size 0.')

    expected = pd.DataFrame(expected_freq(observed), index=observed.index,
                            columns=observed.columns)

    # All count frequencies should be at least 5
    for table, name in zip([observed, expected], ['observed', 'expected']):
        if (table < 5).any(axis=None):
            warnings.warn('Low count on {} frequencies.'.format(name))

    # Sample size = number of observations that made it into the table.
    # Using data.shape[0] would overcount whenever x or y contain missing
    # values, since those rows are not tabulated. It is taken before the
    # Yates adjustment, from the raw integer counts.
    n = int(observed.to_numpy().sum())

    dof = float(expected.size - sum(expected.shape) + expected.ndim - 1)

    if dof == 1 and correction:
        # Adjust `observed` according to Yates' correction for continuity.
        observed = observed + 0.5 * np.sign(expected - observed)

    ddof = observed.size - 1 - dof
    # Normalizing factor of Cramer's V: min(n_rows, n_cols) - 1
    dof_cramer = min(expected.shape) - 1

    names = [
        "pearson", "cressie-read", "log-likelihood", "freeman-tukey",
        "mod-log-likelihood", "neyman"
    ]
    lambdas = [1.0, 2 / 3, 0.0, -1 / 2, -1.0, -2.0]

    stats = []
    for name, lambda_ in zip(names, lambdas):
        if dof == 0:
            # Degenerate table (a single row or column): nothing to test
            chi2, p, cramer, power = 0.0, 1.0, np.nan, np.nan
        else:
            chi2, p = power_divergence(observed, expected, ddof=ddof,
                                       axis=None, lambda_=lambda_)
            cramer = np.sqrt(chi2 / (n * dof_cramer))
            power = power_chi2(dof=dof, w=cramer, n=n, alpha=0.05)

        stats.append({
            'test': name,
            'lambda': lambda_,
            'chi2': chi2,
            'dof': dof,
            'pval': p,
            'cramer': cramer,
            'power': power
        })

    stats = pd.DataFrame(stats)[[
        'test', 'lambda', 'chi2', 'dof', 'pval', 'cramer', 'power'
    ]]
    return expected, observed, _postprocess_dataframe(stats)