예제 #1
0
def friedman(data=None, dv=None, within=None, subject=None):
    """Friedman test for repeated measurements.

    Parameters
    ----------
    data : pandas DataFrame
        DataFrame in long-format.
    dv : string
        Name of column containing the dependent variable.
    within : string
        Name of column containing the within-subject factor.
    subject : string
        Name of column containing the subject identifier.

    Returns
    -------
    stats : DataFrame
        One-row test summary (index ``'Friedman'``) ::

        'Source' : Name of the within-subject factor
        'ddof1' : degrees of freedom (number of repeated measures - 1)
        'Q' : The Friedman Q statistic, corrected for ties (3 decimals)
        'p-unc' : Uncorrected p-value

        Columns whose value is NaN are dropped from the summary.

    Notes
    -----
    The Friedman test is used for one-way repeated measures ANOVA by ranks.

    Data are expected to be in long-format.

    Note that if the dataset contains one or more other within subject
    factors, an automatic collapsing to the mean is applied on the dependent
    variable (same behavior as the ezANOVA R package). As such, results can
    differ from those of JASP. If you can, always double-check the results.

    Due to the assumption that the test statistic has a chi squared
    distribution, the p-value is only reliable for n > 10 and more than 6
    repeated measurements.

    NaN values are automatically removed.

    Examples
    --------
    Compute the Friedman test for repeated measurements.

    >>> from pingouin import friedman, read_dataset
    >>> df = read_dataset('rm_anova')
    >>> friedman(data=df, dv='DesireToKill', within='Disgustingness',
    ...          subject='Subject')
                      Source  ddof1      Q     p-unc
    Friedman  Disgustingness      1  9.228  0.002384
    """
    # Check data
    _check_dataframe(dv=dv,
                     within=within,
                     data=data,
                     subject=subject,
                     effects='within')

    # Collapse to the mean. Only the dependent variable is averaged: taking
    # the mean of every remaining column raises on non-numeric columns with
    # recent pandas versions, and no other column is used afterwards.
    data = data.groupby([subject, within])[dv].mean().reset_index()

    # Remove NaN
    if data[dv].isnull().any():
        data = remove_rm_na(dv=dv,
                            within=within,
                            subject=subject,
                            data=data[[subject, within, dv]])

    # Build the (n subjects, k repeated measures) matrix of observations
    grp = data.groupby(within)[dv]
    rm = list(data[within].unique())
    k = len(rm)
    X = np.array([grp.get_group(r).values for r in rm]).T
    n = X.shape[0]

    # Rank the repeated measures within each subject (row)
    ranked = scipy.stats.rankdata(X, axis=1)

    # Sum of the squared rank totals of each repeated measure
    ssbn = (ranked.sum(axis=0)**2).sum()

    # Compute the test statistic
    Q = (12 / (n * k * (k + 1))) * ssbn - 3 * n * (k + 1)

    # Correct for ties: every group of t equal values within a subject
    # contributes t * (t^2 - 1). Untied values (t == 1) contribute 0, so the
    # counts of all unique values can be summed directly.
    ties = 0
    for row in X:
        counts = np.unique(row, return_counts=True)[1]
        ties += int((counts * (counts * counts - 1)).sum())

    c = 1 - ties / float(k * (k * k - 1) * n)
    Q /= c

    # Approximate the p-value
    ddof1 = k - 1
    p_unc = scipy.stats.chi2.sf(Q, ddof1)

    # Create output dataframe with a fixed column order
    col_order = ['Source', 'ddof1', 'Q', 'p-unc']
    stats = pd.DataFrame(
        {
            'Source': within,
            'ddof1': ddof1,
            'Q': np.round(Q, 3),
            'p-unc': p_unc,
        },
        index=['Friedman'],
        columns=col_order)

    # Drop columns that only contain NaN
    stats = stats.dropna(how='all', axis=1)

    return stats
예제 #2
0
def cochran(data=None, dv=None, within=None, subject=None):
    """Cochran Q test for a binary dependent variable.

    This is a special case of the Friedman test when the dependent variable
    is binary.

    Parameters
    ----------
    data : pandas DataFrame
        DataFrame in long-format.
    dv : string
        Name of column containing the binary dependent variable.
    within : string
        Name of column containing the within-subject factor.
    subject : string
        Name of column containing the subject identifier.

    Returns
    -------
    stats : DataFrame
        One-row test summary (index ``'cochran'``) ::

        'Source' : Name of the within-subject factor
        'dof' : degrees of freedom
        'Q' : The Cochran Q statistic (rounded to 3 decimals)
        'p-unc' : Uncorrected p-value

    Notes
    -----
    The Cochran Q Test is a non-parametric test for ANOVA with repeated
    measures where the dependent variable is binary.

    Data are expected to be in long-format. NaN are automatically removed
    from the data.

    The Q statistic is defined as:

    .. math:: Q = \\frac{(r-1)(r\\sum_j^rx_j^2-N^2)}{rN-\\sum_i^nx_i^2}

    where :math:`N` is the total sum of all observations, :math:`j=1,...,r`
    where :math:`r` is the number of repeated measures, :math:`i=1,...,n` where
    :math:`n` is the number of observations per condition.

    The p-value is then approximated using a chi-square distribution with
    :math:`r-1` degrees of freedom:

    .. math:: Q \\sim \\chi^2(r-1)

    References
    ----------
    .. [1] Cochran, W.G., 1950. The comparison of percentages in matched
       samples. Biometrika 37, 256–266.
       https://doi.org/10.1093/biomet/37.3-4.256

    Examples
    --------
    Compute the Cochran Q test for repeated measurements.

    >>> from pingouin import cochran, read_dataset
    >>> df = read_dataset('cochran')
    >>> cochran(data=df, dv='Energetic', within='Time', subject='Subject')
            Source  dof      Q     p-unc
    cochran   Time    2  6.706  0.034981
    """
    # Validate the input arguments
    _check_dataframe(data=data, dv=dv, within=within, subject=subject,
                     effects='within')

    # Missing values are removed only when the dependent variable has some
    has_missing = data[dv].isnull().any()
    if has_missing:
        data = remove_rm_na(data=data[[subject, within, dv]], dv=dv,
                            within=within, subject=subject)

    # Number of repeated measures and degrees of freedom
    n_levels = data[within].nunique()
    dof = n_levels - 1

    # Totals per repeated measure, per subject, and overall
    level_totals = data.groupby(within)[dv].sum()
    subject_totals = data.groupby(subject)[dv].sum()
    grand_total = level_totals.sum()

    # Q statistic and its chi-square p-value
    numerator = dof * (n_levels * np.sum(level_totals**2) - grand_total**2)
    denominator = n_levels * grand_total - np.sum(subject_totals**2)
    q = numerator / denominator
    p_unc = scipy.stats.chi2.sf(q, dof)

    # Assemble the one-row summary table
    summary = {
        'Source': within,
        'dof': dof,
        'Q': np.round(q, 3),
        'p-unc': p_unc,
    }
    return pd.DataFrame(summary, index=['cochran'])
예제 #3
0
def friedman(data=None, dv=None, within=None, subject=None, method='chisq'):
    """Friedman test for repeated measurements.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame in long-format. It is not modified by this function.
    dv : string
        Name of column containing the dependent variable.
    within : string
        Name of column containing the within-subject factor.
    subject : string
        Name of column containing the subject identifier.
    method : string
        Statistical test to perform. Must be ``'chisq'`` (chi-square test) or
        ``'f'`` (F test). See notes below for explanation.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'Source'``: Name of the within-subject factor
        * ``'W'``: Kendall's coefficient of concordance, corrected for ties

        If ``method='chisq'``

            * ``'ddof1'``: degrees of freedom
            * ``'Q'``: The Friedman chi-square statistic, corrected for ties
            * ``'p-unc'``: Uncorrected p-value of the chi squared test


        If ``method='f'``

            * ``'ddof1'``: degrees of freedom of the numerator
            * ``'ddof2'``: degrees of freedom of the denominator
            * ``'F'``: The Friedman F statistic, corrected for ties
            * ``'p-unc'``: Uncorrected p-value of the F test

    Raises
    ------
    ValueError
        If ``method`` is neither ``'chisq'`` nor ``'f'``.

    Notes
    -----
    The Friedman test is used for one-way repeated measures ANOVA by ranks.

    Data are expected to be in long-format.

    Note that if the dataset contains one or more other within subject
    factors, an automatic collapsing to the mean is applied on the dependent
    variable (same behavior as the ezANOVA R package). As such, results can
    differ from those of JASP. If you can, always double-check the results.

    NaN values are automatically removed.

    The Friedman test is equivalent to the test of significance of Kendall's
    coefficient of concordance (Kendall's W). Most commonly a Q statistic,
    which has asymptotical chi-squared distribution, is computed and used for
    testing. However, in [1]_ they showed the chi-squared test to be overly
    conservative for small numbers of samples and repeated measures. Instead
    they recommend the F test, which has the correct size and behaves like a
    permutation test, but is computationally much easier.

    References
    ----------
    .. [1] Marozzi, M. (2014). Testing for concordance between several
           criteria. Journal of Statistical Computation and Simulation,
           84(9), 1843–1850. https://doi.org/10.1080/00949655.2013.766189

    Examples
    --------
    Compute the Friedman test for repeated measurements.

    >>> from pingouin import friedman, read_dataset
    >>> df = read_dataset('rm_anova')
    >>> friedman(data=df, dv='DesireToKill', within='Disgustingness',
    ...          subject='Subject')
                      Source         W  ddof1         Q     p-unc
    Friedman  Disgustingness  0.099224      1  9.227848  0.002384


    This time we will use the F test method.

    >>> from pingouin import friedman, read_dataset
    >>> df = read_dataset('rm_anova')
    >>> friedman(data=df, dv='DesireToKill', within='Disgustingness',
    ...          subject='Subject', method='f')
                      Source         W     ddof1      ddof2         F     p-unc
    Friedman  Disgustingness  0.099224  0.978495  90.021505  10.13418  0.002138

    We can see, compared to the previous example, that the p-value is slightly
    lower. This is expected, since the F test is more powerful (see Notes).
    """
    # Fail early on an unsupported method instead of falling through both
    # branches below and hitting an unbound ``stats`` variable.
    if method not in ('chisq', 'f'):
        raise ValueError("method must be 'chisq' or 'f'.")

    # Check data
    _check_dataframe(dv=dv, within=within, data=data, subject=subject,
                     effects='within')

    # Convert Categorical columns to string
    # This is important otherwise all the groupby will return different results
    # unless we specify .groupby(..., observed = True).
    # The conversion is done on a copy so that the caller's DataFrame keeps
    # its categorical columns.
    cat_cols = [c for c in (subject, within)
                if data[c].dtype.name == 'category']
    if cat_cols:
        data = data.copy()
        for c in cat_cols:
            data[c] = data[c].astype(str)

    # Collapse to the mean. Only the dependent variable is averaged: taking
    # the mean of every remaining column raises on non-numeric columns with
    # recent pandas versions, and no other column is used afterwards.
    data = data.groupby([subject, within])[dv].mean().reset_index()

    # Remove NaN
    if data[dv].isnull().any():
        data = remove_rm_na(dv=dv, within=within, subject=subject,
                            data=data[[subject, within, dv]])

    # Build the (n subjects, k repeated measures) matrix of observations
    grp = data.groupby(within)[dv]
    rm = list(data[within].unique())
    k = len(rm)
    X = np.array([grp.get_group(r).to_numpy() for r in rm]).T
    n = X.shape[0]

    # Rank the repeated measures within each subject (row)
    ranked = scipy.stats.rankdata(X, axis=1)

    # Sum of the squared rank totals of each repeated measure
    ssbn = (ranked.sum(axis=0)**2).sum()

    # Correction for ties: every group of t equal values within a subject
    # contributes t * (t^2 - 1). Untied values (t == 1) contribute 0, so the
    # counts of all unique values can be summed directly.
    ties = 0
    for row in X:
        counts = np.unique(row, return_counts=True)[1]
        ties += int((counts * (counts * counts - 1)).sum())

    # Compute Kendall's W corrected for ties
    W = ((12 * ssbn - 3 * n * n * k * (k + 1) * (k + 1))
         / (n * n * k * (k - 1) * (k + 1) - n * ties))

    if method == 'chisq':
        # Compute the Q statistic
        Q = n * (k - 1) * W

        # Approximate the p-value
        ddof1 = k - 1
        p_unc = scipy.stats.chi2.sf(Q, ddof1)

        # Create output dataframe
        stats = pd.DataFrame({'Source': within,
                              'W': W,
                              'ddof1': ddof1,
                              'Q': Q,
                              'p-unc': p_unc,
                              }, index=['Friedman'])
    else:
        # method == 'f': compute the F statistic
        F = W * (n - 1) / (1 - W)

        # Approximate the p-value
        ddof1 = k - 1 - 2 / n
        ddof2 = (n - 1) * ddof1
        p_unc = scipy.stats.f.sf(F, ddof1, ddof2)

        # Create output dataframe
        stats = pd.DataFrame({'Source': within,
                              'W': W,
                              'ddof1': ddof1,
                              'ddof2': ddof2,
                              'F': F,
                              'p-unc': p_unc,
                              }, index=['Friedman'])

    return _postprocess_dataframe(stats)