Example #1
    def test_compute_esci(self):
        """Test function compute_esci.

        Note that since Pingouin v0.3.5, CIs around a Cohen d are calculated
        using a T (and not Z) distribution. This is the same behavior as the
        cohen.d function of the effsize R package.

        However, note that the cohen.d function does not use the Cohen d-avg
        for paired samples, and therefore we cannot directly compare the CIs
        for paired samples. Similarly, R uses a slightly different formula to
        estimate the SE of the one-sample Cohen d.
        """
        # Pearson correlation
        r = 0.5543563
        ci = compute_esci(stat=r, nx=6, eftype='r')
        assert np.allclose(ci, [-0.47, 0.94])
        # Cohen d
        # .. One sample and paired
        # Cannot compare to R because cohen.d uses different formulas for
        # Cohen d and SE.
        d = compute_effsize(np.r_[x, y], y=0)
        assert round(d, 6) == 2.086694  # Same as cohen.d
        ci = compute_esci(d, nx + ny, 1, decimals=6)
        d = compute_effsize(x, y, paired=True)
        ci = compute_esci(d, nx, ny, paired=True, decimals=6)
        # .. Independent (compare with cohen.d function)
        d = compute_effsize(x, y)
        ci = compute_esci(d, nx, ny, decimals=6)
        np.testing.assert_equal(ci, [-1.067645, 0.226762])
        # Same but with different n
        d = compute_effsize(x, y[:-5])
        ci = compute_esci(d, nx, len(y[:-5]), decimals=8)
        np.testing.assert_equal(ci, [-1.33603010, 0.08662825])
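
The T-distribution CIs around a Cohen d mentioned in the docstring can be sketched by hand. The snippet below is a minimal illustration, not compute_esci itself: it assumes numpy and scipy are installed and uses one common large-sample SE approximation for an independent-samples d, which may differ in detail from the formula Pingouin applies.

# Hedged sketch: a t-based CI around an independent-samples Cohen d.
# NOT compute_esci's exact formula; the SE below is one common
# large-sample approximation, used here for illustration only.
import numpy as np
from scipy.stats import t

def cohen_d_ci(d, nx, ny, confidence=0.95):
    se = np.sqrt((nx + ny) / (nx * ny) + d**2 / (2 * (nx + ny)))
    crit = t.ppf(1 - (1 - confidence) / 2, nx + ny - 2)  # T, not Z, critical value
    return d - crit * se, d + crit * se

print(cohen_d_ci(d=-0.5, nx=30, ny=30))  # illustrative values only
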
Example #2
    def test_compute_esci(self):
        """Test function compute_esci.

        Note that since Pingouin v0.3.5, CIs around a Cohen d are calculated
        using a T (and not Z) distribution. This is the same behavior as the
        cohen.d function of the effsize R package.

        However, note that the cohen.d function does not use the Cohen d-avg
        for paired samples, and therefore we cannot directly compare the CIs
        for paired samples. Similarly, R uses a slightly different formula to
        estimate the SE of the one-sample Cohen d.
        """
        # Pearson correlation
        # https://github.com/SurajGupta/r-source/blob/master/src/library/stats/R/cor.test.R
        ci = compute_esci(stat=0.5543563, nx=6, eftype='r', decimals=6)
        assert np.allclose(ci, [-0.4675554, 0.9420809])
        # Alternative == "greater"
        ci = compute_esci(stat=0.8,
                          nx=20,
                          eftype='r',
                          alternative="greater",
                          decimals=6)
        assert np.allclose(ci, [0.6041625, 1])
        ci = compute_esci(stat=-0.2,
                          nx=30,
                          eftype='r',
                          alternative="greater",
                          decimals=6)
        assert np.allclose(ci, [-0.4771478, 1])
        # Alternative == "less"
        ci = compute_esci(stat=-0.8,
                          nx=20,
                          eftype='r',
                          alternative="less",
                          decimals=6)
        assert np.allclose(ci, [-1, -0.6041625])
        ci = compute_esci(stat=0.2,
                          nx=30,
                          eftype='r',
                          alternative="less",
                          decimals=6)
        assert np.allclose(ci, [-1, 0.4771478])

        # Cohen d
        # .. One sample and paired
        # Cannot compare to R because cohen.d uses different formulas for
        # Cohen d and SE.
        d = compute_effsize(np.r_[x, y], y=0)
        assert round(d, 6) == 2.086694  # Same as cohen.d
        ci = compute_esci(d, nx + ny, 1, decimals=6)
        d = compute_effsize(x, y, paired=True)
        ci = compute_esci(d, nx, ny, paired=True, decimals=6)
        # .. Independent (compare with cohen.d function)
        d = compute_effsize(x, y)
        ci = compute_esci(d, nx, ny, decimals=6)
        np.testing.assert_equal(ci, [-1.067645, 0.226762])
        # Same but with different n
        d = compute_effsize(x, y[:-5])
        ci = compute_esci(d, nx, len(y[:-5]), decimals=8)
        np.testing.assert_equal(ci, [-1.33603010, 0.08662825])
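
The Pearson r intervals asserted above follow from the standard Fisher z-transform, the same construction used by R's cor.test linked in the comment. A minimal sketch of the two-sided case, assuming numpy and scipy are available:

# Minimal sketch: parametric CI for a Pearson r via the Fisher z-transform.
import numpy as np
from scipy.stats import norm

def pearson_r_ci(r, n, confidence=0.95):
    z = np.arctanh(r)                # Fisher z of the observed correlation
    se = 1 / np.sqrt(n - 3)          # standard error of z
    crit = norm.ppf(1 - (1 - confidence) / 2)
    return np.tanh([z - crit * se, z + crit * se])

# Should closely match the [-0.4675554, 0.9420809] interval asserted above
print(np.round(pearson_r_ci(0.5543563, 6), 6))
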
Example #3
    def test_compute_esci(self):
        """Test function compute_esci"""
        compute_esci(stat=.6, nx=30, eftype='r')
        compute_esci(stat=.4, nx=len(x), ny=len(y), confidence=.99, decimals=4)
        compute_esci(stat=.6, nx=30, ny=30, eftype='cohen')
        # Compare with R
        r, nx = 0.5543563, 6
        ci = compute_esci(stat=r, nx=nx, eftype='r')
        assert np.allclose(ci, [-0.47, 0.94])
        # One sample / paired T-test
        ci = compute_esci(-0.57932, nx=40, ny=1)
        ci_p = compute_esci(-0.57932, nx=40, ny=1, paired=True)
        assert np.allclose(ci, ci_p)
        assert np.allclose(ci, [-0.91, -0.24])
Example #4
    def test_compute_esci(self):
        """Test function compute_esci"""
        compute_esci(stat=.6, nx=30, ny=30, eftype='r')
        compute_esci(stat=.4, nx=len(x), ny=len(y), confidence=.99, decimals=4)
        compute_esci(stat=.6, nx=30, ny=30, eftype='cohen')
        # Compare with R
        r, nx, ny = 0.5543563, 6, 6
        ci = compute_esci(stat=r, nx=nx, ny=ny, eftype='r')
        assert np.allclose(ci, [-0.47, 0.94])
Example #5
def corr(x, y, tail='two-sided', method='pearson'):
    """(Robust) correlation between two variables.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. x and y must be independent.
    tail : string
        Specify whether to return 'one-sided' or 'two-sided' p-value.
    method : string
        Specify which method to use for the computation of the correlation
        coefficient. Available methods are ::

        'pearson' : Pearson product-moment correlation
        'spearman' : Spearman rank-order correlation
        'kendall' : Kendall’s tau (ordinal data)
        'percbend' : percentage bend correlation (robust)
        'shepherd' : Shepherd's pi correlation (robust Spearman)
        'skipped' : skipped correlation (robust Spearman, requires sklearn)

    Returns
    -------
    stats : pandas DataFrame
        Test summary ::

        'n' : Sample size (after NaN removal)
        'outliers' : number of outliers (only for 'shepherd' or 'skipped')
        'r' : Correlation coefficient
        'CI95' : 95% parametric confidence intervals
        'r2' : R-squared
        'adj_r2' : Adjusted R-squared
        'p-val' : one or two tailed p-value
        'BF10' : Bayes Factor of the alternative hypothesis (Pearson only)
        'power' : achieved power of the test (= 1 - type II error).

    See also
    --------
    pairwise_corr : Pairwise correlation between columns of a pandas DataFrame
    partial_corr : Partial correlation

    Notes
    -----
    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed. Correlations of -1 or +1 imply
    an exact linear relationship.

    The Spearman correlation is a nonparametric measure of the monotonicity of
    the relationship between two datasets. Unlike the Pearson correlation,
    the Spearman correlation does not assume that both datasets are normally
    distributed. Correlations of -1 or +1 imply an exact monotonic
    relationship.

    Kendall’s tau is a measure of the correspondence between two rankings.
    Values close to 1 indicate strong agreement, values close to -1 indicate
    strong disagreement.

    The percentage bend correlation [1]_ is a robust method that
    protects against univariate outliers.

    The Shepherd's pi [2]_ and skipped [3]_, [4]_ correlations are both robust
    methods that return Spearman's rho after removal of bivariate outliers.
    Note that the skipped correlation requires that the scikit-learn
    package is installed (for computing the minimum covariance determinant).

    Please note that rows with NaN are automatically removed.

    If ``method='pearson'``, the Bayes Factor is calculated using the
    :py:func:`pingouin.bayesfactor_pearson` function.

    References
    ----------
    .. [1] Wilcox, R.R., 1994. The percentage bend correlation coefficient.
       Psychometrika 59, 601–616. https://doi.org/10.1007/BF02294395

    .. [2] Schwarzkopf, D.S., De Haas, B., Rees, G., 2012. Better ways to
       improve standards in brain-behavior correlation analysis. Front.
       Hum. Neurosci. 6, 200. https://doi.org/10.3389/fnhum.2012.00200

    .. [3] Rousselet, G.A., Pernet, C.R., 2012. Improving standards in
       brain-behavior correlation analyses. Front. Hum. Neurosci. 6, 119.
       https://doi.org/10.3389/fnhum.2012.00119

    .. [4] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust correlation
       analyses: false positive and power validation using a new open
       source matlab toolbox. Front. Psychol. 3, 606.
       https://doi.org/10.3389/fpsyg.2012.00606

    Examples
    --------
    1. Pearson correlation

    >>> import numpy as np
    >>> # Generate random correlated samples
    >>> np.random.seed(123)
    >>> mean, cov = [4, 6], [(1, .5), (.5, 1)]
    >>> x, y = np.random.multivariate_normal(mean, cov, 30).T
    >>> # Compute Pearson correlation
    >>> from pingouin import corr
    >>> corr(x, y)
              n      r         CI95%     r2  adj_r2     p-val  BF10  power
    pearson  30  0.491  [0.16, 0.72]  0.242   0.185  0.005813  8.55  0.809

    2. Pearson correlation with two outliers

    >>> x[3], y[5] = 12, -8
    >>> corr(x, y)
              n      r          CI95%     r2  adj_r2     p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439148  0.302  0.121

    3. Spearman correlation

    >>> corr(x, y, method="spearman")
               n      r         CI95%     r2  adj_r2     p-val  power
    spearman  30  0.401  [0.05, 0.67]  0.161   0.099  0.028034   0.61

    4. Percentage bend correlation (robust)

    >>> corr(x, y, method='percbend')
               n      r         CI95%     r2  adj_r2     p-val  power
    percbend  30  0.389  [0.03, 0.66]  0.151   0.089  0.033508  0.581

    5. Shepherd's pi correlation (robust)

    >>> corr(x, y, method='shepherd')
               n  outliers      r         CI95%     r2  adj_r2     p-val  power
    shepherd  30         2  0.437  [0.09, 0.69]  0.191   0.131  0.020128  0.694

    6. Skipped spearman correlation (robust)

    >>> corr(x, y, method='skipped')
              n  outliers      r         CI95%     r2  adj_r2     p-val  power
    skipped  30         2  0.437  [0.09, 0.69]  0.191   0.131  0.020128  0.694

    7. One-tailed Pearson correlation

    >>> corr(x, y, tail="one-sided", method='pearson')
              n      r          CI95%     r2  adj_r2     p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.219574  0.467  0.194

    8. Using columns of a pandas dataframe

    >>> import pandas as pd
    >>> data = pd.DataFrame({'x': x, 'y': y})
    >>> corr(data['x'], data['y'])
              n      r          CI95%     r2  adj_r2     p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439148  0.302  0.121
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Check size
    if x.size != y.size:
        raise ValueError('x and y must have the same length.')

    # Remove NA
    x, y = remove_na(x, y, paired=True)
    nx = x.size

    # Compute correlation coefficient
    if method == 'pearson':
        r, pval = pearsonr(x, y)
    elif method == 'spearman':
        r, pval = spearmanr(x, y)
    elif method == 'kendall':
        r, pval = kendalltau(x, y)
    elif method == 'percbend':
        r, pval = percbend(x, y)
    elif method == 'shepherd':
        r, pval, outliers = shepherd(x, y)
    elif method == 'skipped':
        r, pval, outliers = skipped(x, y, method='spearman')
    else:
        raise ValueError('Method not recognized.')

    assert not np.isnan(r), 'Correlation returned NaN. Check your data.'

    # Compute r2 and adj_r2
    r2 = r**2
    adj_r2 = 1 - (((1 - r2) * (nx - 1)) / (nx - 3))

    # Compute the parametric 95% confidence interval and power
    if r2 < 1:
        ci = compute_esci(stat=r, nx=nx, ny=nx, eftype='r')
        pr = round(power_corr(r=r, n=nx, power=None, alpha=0.05, tail=tail), 3)
    else:
        ci = [1., 1.]
        pr = np.inf

    # Create dictionary
    stats = {
        'n': nx,
        'r': round(r, 3),
        'r2': round(r2, 3),
        'adj_r2': round(adj_r2, 3),
        'CI95%': [ci],
        'p-val': pval if tail == 'two-sided' else .5 * pval,
        'power': pr
    }

    if method in ['shepherd', 'skipped']:
        stats['outliers'] = sum(outliers)

    # Compute the BF10 for Pearson correlation only
    if method == 'pearson':
        if r2 < 1:
            stats['BF10'] = bayesfactor_pearson(r, nx, tail=tail)
        else:
            stats['BF10'] = str(np.inf)

    # Convert to DataFrame
    stats = pd.DataFrame.from_records(stats, index=[method])

    # Define order
    col_keep = [
        'n', 'outliers', 'r', 'CI95%', 'r2', 'adj_r2', 'p-val', 'BF10', 'power'
    ]
    col_order = [k for k in col_keep if k in stats.keys().tolist()]
    return stats[col_order]
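
The r², adjusted R² and CI steps inside corr() can be reproduced on their own. A minimal sketch, assuming numpy, scipy and pingouin are installed (the compute_esci call mirrors the one in the function body):

# Standalone sketch of the r -> r2 -> adjusted R2 -> 95% CI pipeline above.
import numpy as np
from scipy.stats import pearsonr
from pingouin import compute_esci

np.random.seed(123)
mean, cov = [4, 6], [(1, .5), (.5, 1)]
x, y = np.random.multivariate_normal(mean, cov, 30).T

r, pval = pearsonr(x, y)
n = x.size
r2 = r**2
adj_r2 = 1 - (((1 - r2) * (n - 1)) / (n - 3))      # same formula as in corr()
ci = compute_esci(stat=r, nx=n, ny=n, eftype='r')  # parametric 95% CI around r
print(round(r, 3), round(r2, 3), round(adj_r2, 3), ci)
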
Example #6
def rm_corr(data=None, x=None, y=None, subject=None, tail='two-sided'):
    """Repeated measures correlation.

    Parameters
    ----------
    data : pd.DataFrame
        Dataframe.
    x, y : string
        Name of columns in ``data`` containing the two dependent variables.
    subject : string
        Name of column in ``data`` containing the subject indicator.
    tail : string
        Specify whether to return 'one-sided' or 'two-sided' p-value.

    Returns
    -------
    stats : pandas DataFrame
        Test summary ::

        'r' : Repeated measures correlation coefficient
        'dof' : Degrees of freedom
        'pval' : one or two tailed p-value
        'CI95' : 95% parametric confidence intervals
        'power' : achieved power of the test (= 1 - type II error).

    Notes
    -----
    Repeated measures correlation (rmcorr) is a statistical technique
    for determining the common within-individual association for paired
    measures assessed on two or more occasions for multiple individuals.

    From Bakdash and Marusich (2017):

        "Rmcorr accounts for non-independence among observations using analysis
        of covariance (ANCOVA) to statistically adjust for inter-individual
        variability. By removing measured variance between-participants,
        rmcorr provides the best linear fit for each participant using parallel
        regression lines (the same slope) with varying intercepts.
        Like a Pearson correlation coefficient, the rmcorr coefficient
        is bounded by − 1 to 1 and represents the strength of the linear
        association between two variables."

    Results have been tested against the `rmcorr` R package.

    Please note that NaN are automatically removed from the dataframe
    (listwise deletion).

    References
    ----------
    .. [1] Bakdash, J.Z., Marusich, L.R., 2017. Repeated Measures Correlation.
           Front. Psychol. 8, 456. https://doi.org/10.3389/fpsyg.2017.00456

    .. [2] Bland, J. M., & Altman, D. G. (1995). Statistics notes: Calculating
           correlation coefficients with repeated observations:
           Part 1—correlation within subjects. Bmj, 310(6977), 446.

    .. [3] https://github.com/cran/rmcorr

    Examples
    --------
    >>> import pingouin as pg
    >>> df = pg.read_dataset('rm_corr')
    >>> pg.rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
                 r  dof      pval           CI95%  power
    rm_corr -0.507   38  0.000847  [-0.71, -0.23]   0.93
    """
    from pingouin import ancova, power_corr
    # Safety checks
    assert isinstance(data, pd.DataFrame), 'Data must be a DataFrame'
    assert x in data.columns, 'The %s column is not in data.' % x
    assert y in data.columns, 'The %s column is not in data.' % y
    assert data[x].dtype.kind in 'bfi', '%s must be numeric.' % x
    assert data[y].dtype.kind in 'bfi', '%s must be numeric.' % y
    assert subject in data.columns, 'The %s column is not in data.' % subject
    if data[subject].nunique() < 3:
        raise ValueError('rm_corr requires at least 3 unique subjects.')

    # Remove missing values
    data = data[[x, y, subject]].dropna(axis=0)

    # Using PINGOUIN
    aov = ancova(dv=y, covar=x, between=subject, data=data)
    bw = aov.bw_  # Beta within parameter
    sign = np.sign(bw)
    dof = int(aov.at[2, 'DF'])
    n = dof + 2
    ssfactor = aov.at[1, 'SS']
    sserror = aov.at[2, 'SS']
    rm = sign * np.sqrt(ssfactor / (ssfactor + sserror))
    pval = aov.at[1, 'p-unc']
    pval = pval * 0.5 if tail == 'one-sided' else pval
    ci = compute_esci(stat=rm, nx=n, eftype='pearson').tolist()
    pwr = power_corr(r=rm, n=n, tail=tail)
    # Convert to Dataframe
    stats = pd.DataFrame(
        {
            "r": round(rm, 3),
            "dof": int(dof),
            "pval": pval,
            "CI95%": str(ci),
            "power": round(pwr, 3)
        },
        index=["rm_corr"])
    return stats
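
The rmcorr coefficient is recovered from the ANCOVA table exactly as the function body does: the sign of the within-subject slope times sqrt(SS_covar / (SS_covar + SS_error)). A minimal standalone sketch of that relation, assuming pingouin and its bundled 'rm_corr' dataset are available:

# Sketch: rebuild the rmcorr coefficient from the ANCOVA sums of squares,
# mirroring the body of rm_corr() above.
import numpy as np
import pingouin as pg

df = pg.read_dataset('rm_corr')
aov = pg.ancova(dv='PacO2', covar='pH', between='Subject', data=df)
ss_covar = aov.at[1, 'SS']   # covariate (within-subject) sum of squares
ss_error = aov.at[2, 'SS']   # residual sum of squares
rm = np.sign(aov.bw_) * np.sqrt(ss_covar / (ss_covar + ss_error))
print(round(rm, 3))          # should be close to the -0.507 reported above
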
Example #7
def corr(x, y, tail='two-sided', method='pearson'):
    """(Robust) correlation between two variables.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. ``x`` and ``y`` must be
        independent.
    tail : string
        Specify whether to return ``'one-sided'`` or ``'two-sided'`` p-value.
        Note that the former are simply half the latter.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\\rho` rank-order correlation
        * ``'kendall'``: Kendall's :math:`\\tau` correlation
          (for ordinal data)
        * ``'bicor'``: Biweight midcorrelation (robust)
        * ``'percbend'``: Percentage bend correlation (robust)
        * ``'shepherd'``: Shepherd's pi correlation (robust)
        * ``'skipped'``: Skipped correlation (robust)

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'n'``: Sample size (after removal of missing values)
        * ``'outliers'``: number of outliers, only if a robust method was used
        * ``'r'``: Correlation coefficient
        * ``'CI95'``: 95% parametric confidence intervals around :math:`r`
        * ``'r2'``: R-squared (:math:`= r^2`)
        * ``'adj_r2'``: Adjusted R-squared
        * ``'p-val'``: tail of the test
        * ``'BF10'``: Bayes Factor of the alternative hypothesis
          (only for Pearson correlation)
        * ``'power'``: achieved power of the test (= 1 - type II error).

    See also
    --------
    pairwise_corr : Pairwise correlation between columns of a pandas DataFrame
    partial_corr : Partial correlation
    rm_corr : Repeated measures correlation

    Notes
    -----
    The `Pearson correlation coefficient
    <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
    measures the linear relationship between two datasets. Strictly speaking,
    Pearson's correlation requires that each dataset be normally distributed.
    Correlations of -1 or +1 imply a perfect negative and positive linear
    relationship, respectively, with 0 indicating the absence of association.

    .. math::
        r_{xy} = \\frac{\\sum_i(x_i - \\bar{x})(y_i - \\bar{y})}
        {\\sqrt{\\sum_i(x_i - \\bar{x})^2} \\sqrt{\\sum_i(y_i - \\bar{y})^2}}
        = \\frac{\\text{cov}(x, y)}{\\sigma_x \\sigma_y}

    where :math:`\\text{cov}` is the sample covariance and :math:`\\sigma`
    is the sample standard deviation.

    If ``method='pearson'``, the Bayes Factor is calculated using the
    :py:func:`pingouin.bayesfactor_pearson` function.

    The `Spearman correlation coefficient
    <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
    is a non-parametric measure of the monotonicity of the relationship between
    two datasets. Unlike the Pearson correlation, the Spearman correlation does
    not assume that both datasets are normally distributed. Correlations of -1
    or +1 imply an exact negative and positive monotonic relationship,
    respectively. Mathematically, the Spearman correlation coefficient is
    defined as the Pearson correlation coefficient between the
    `rank variables <https://en.wikipedia.org/wiki/Ranking>`_.

    The `Kendall correlation coefficient
    <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
    is a measure of the correspondence between two rankings. Values also range
    from -1 (perfect disagreement) to 1 (perfect agreement), with 0 indicating
    the absence of association. Consistent with
    :py:func:`scipy.stats.kendalltau`, Pingouin returns the Tau-b coefficient,
    which adjusts for ties:

    .. math:: \\tau_B = \\frac{(P - Q)}{\\sqrt{(P + Q + T) (P + Q + U)}}

    where :math:`P` is the number of concordant pairs, :math:`Q` the number of
    discordant pairs, :math:`T` the number of ties in x, and :math:`U`
    the number of ties in y.

    The `biweight midcorrelation
    <https://en.wikipedia.org/wiki/Biweight_midcorrelation>`_ and
    percentage bend correlation [1]_ are both robust methods that
    protect against *univariate* outliers by down-weighting observations that
    deviate too much from the median.

    The Shepherd pi [2]_ correlation and skipped [3]_, [4]_ correlation are
    both robust methods that return the Spearman correlation coefficient after
    removing *bivariate* outliers. Briefly, the Shepherd pi uses a
    bootstrapping of the Mahalanobis distance to identify outliers, while the
    skipped correlation is based on the minimum covariance determinant
    (which requires scikit-learn). Note that these two methods are
    significantly slower than the previous ones.

    .. important:: Please note that rows with missing values (NaN) are
        automatically removed.

    References
    ----------
    .. [1] Wilcox, R.R., 1994. The percentage bend correlation coefficient.
       Psychometrika 59, 601–616. https://doi.org/10.1007/BF02294395

    .. [2] Schwarzkopf, D.S., De Haas, B., Rees, G., 2012. Better ways to
       improve standards in brain-behavior correlation analysis. Front.
       Hum. Neurosci. 6, 200. https://doi.org/10.3389/fnhum.2012.00200

    .. [3] Rousselet, G.A., Pernet, C.R., 2012. Improving standards in
       brain-behavior correlation analyses. Front. Hum. Neurosci. 6, 119.
       https://doi.org/10.3389/fnhum.2012.00119

    .. [4] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust correlation
       analyses: false positive and power validation using a new open
       source matlab toolbox. Front. Psychol. 3, 606.
       https://doi.org/10.3389/fpsyg.2012.00606

    Examples
    --------
    1. Pearson correlation

    >>> import numpy as np
    >>> import pingouin as pg
    >>> # Generate random correlated samples
    >>> np.random.seed(123)
    >>> mean, cov = [4, 6], [(1, .5), (.5, 1)]
    >>> x, y = np.random.multivariate_normal(mean, cov, 30).T
    >>> # Compute Pearson correlation
    >>> pg.corr(x, y).round(3)
              n      r         CI95%     r2  adj_r2  p-val  BF10  power
    pearson  30  0.491  [0.16, 0.72]  0.242   0.185  0.006  8.55  0.809

    2. Pearson correlation with two outliers

    >>> x[3], y[5] = 12, -8
    >>> pg.corr(x, y).round(3)
              n      r          CI95%     r2  adj_r2  p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439  0.302  0.121

    3. Spearman correlation (robust to outliers)

    >>> pg.corr(x, y, method="spearman").round(3)
               n      r         CI95%     r2  adj_r2  p-val  power
    spearman  30  0.401  [0.05, 0.67]  0.161   0.099  0.028   0.61

    4. Biweight midcorrelation (robust)

    >>> pg.corr(x, y, method="bicor").round(3)
            n      r         CI95%     r2  adj_r2  p-val  power
    bicor  30  0.393  [0.04, 0.66]  0.155   0.092  0.031  0.592

    5. Percentage bend correlation (robust)

    >>> pg.corr(x, y, method='percbend').round(3)
               n      r         CI95%     r2  adj_r2  p-val  power
    percbend  30  0.389  [0.03, 0.66]  0.151   0.089  0.034  0.581

    6. Shepherd's pi correlation (robust)

    >>> pg.corr(x, y, method='shepherd').round(3)
               n  outliers      r         CI95%     r2  adj_r2  p-val  power
    shepherd  30         2  0.437  [0.09, 0.69]  0.191   0.131   0.02  0.694

    7. Skipped spearman correlation (robust)

    >>> pg.corr(x, y, method='skipped').round(3)
              n  outliers      r         CI95%     r2  adj_r2  p-val  power
    skipped  30         2  0.437  [0.09, 0.69]  0.191   0.131   0.02  0.694

    8. One-tailed Pearson correlation

    >>> pg.corr(x, y, tail="one-sided", method='pearson').round(3)
              n      r          CI95%     r2  adj_r2  p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051   0.22  0.467  0.194

    9. Using columns of a pandas dataframe

    >>> import pandas as pd
    >>> data = pd.DataFrame({'x': x, 'y': y})
    >>> pg.corr(data['x'], data['y']).round(3)
              n      r          CI95%     r2  adj_r2  p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439  0.302  0.121
    """
    # Safety check
    x = np.asarray(x)
    y = np.asarray(y)
    assert x.ndim == y.ndim == 1, 'x and y must be 1D array.'
    assert x.size == y.size, 'x and y must have the same length.'

    # Remove rows with missing values
    x, y = remove_na(x, y, paired=True)
    nx = x.size

    # Compute correlation coefficient
    if method == 'pearson':
        r, pval = pearsonr(x, y)
    elif method == 'spearman':
        r, pval = spearmanr(x, y)
    elif method == 'kendall':
        r, pval = kendalltau(x, y)
    elif method == 'bicor':
        r, pval = bicor(x, y)
    elif method == 'percbend':
        r, pval = percbend(x, y)
    elif method == 'shepherd':
        r, pval, outliers = shepherd(x, y)
    elif method == 'skipped':
        r, pval, outliers = skipped(x, y)
    else:
        raise ValueError('Method not recognized.')

    if np.isnan(r):
        # Correlation failed -- new in version v0.3.4, instead of raising an
        # error we just return a dataframe full of NaN (except sample size).
        # This avoids a sudden stop in pingouin.pairwise_corr.
        return pd.DataFrame(
            {
                'n': nx,
                'r': np.nan,
                'CI95%': np.nan,
                'r2': np.nan,
                'adj_r2': np.nan,
                'p-val': np.nan,
                'BF10': np.nan,
                'power': np.nan
            },
            index=[method])

    # Compute r2 and adj_r2
    r2 = r**2
    adj_r2 = 1 - (((1 - r2) * (nx - 1)) / (nx - 3))

    # Compute the parametric 95% confidence interval and power
    ci = compute_esci(stat=r, nx=nx, ny=nx, eftype='r')
    pr = power_corr(r=r, n=nx, power=None, alpha=0.05, tail=tail)

    # Create dictionary
    stats = {
        'n': nx,
        'r': r,
        'r2': r2,
        'adj_r2': adj_r2,
        'CI95%': [ci],
        'p-val': pval if tail == 'two-sided' else .5 * pval,
        'power': pr
    }

    if method in ['shepherd', 'skipped']:
        stats['outliers'] = sum(outliers)

    # Compute the BF10 for Pearson correlation only
    if method == 'pearson':
        stats['BF10'] = bayesfactor_pearson(r, nx, tail=tail)

    # Convert to DataFrame
    stats = pd.DataFrame.from_records(stats, index=[method])

    # Define order
    col_keep = [
        'n', 'outliers', 'r', 'CI95%', 'r2', 'adj_r2', 'p-val', 'BF10', 'power'
    ]
    col_order = [k for k in col_keep if k in stats.keys().tolist()]
    return stats[col_order]
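
The Pearson r formula given in the Notes (sample covariance divided by the product of sample standard deviations) can be checked directly against scipy. A minimal sketch, assuming only numpy and scipy:

# Sketch: hand-computed Pearson r from the Notes formula vs scipy.stats.pearsonr.
import numpy as np
from scipy.stats import pearsonr

rng = np.random.RandomState(42)
x = rng.normal(size=100)
y = 0.5 * x + rng.normal(size=100)

num = np.sum((x - x.mean()) * (y - y.mean()))
den = np.sqrt(np.sum((x - x.mean())**2)) * np.sqrt(np.sum((y - y.mean())**2))
r_manual = num / den
r_scipy, _ = pearsonr(x, y)
print(np.allclose(r_manual, r_scipy))  # True
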
Example #8
def corr(x, y, tail='two-sided', method='pearson'):
    """(Robust) correlation between two variables.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. x and y must be independent.
    tail : string
        Specify whether to return 'one-sided' or 'two-sided' p-value.
    method : string
        Specify which method to use for the computation of the correlation
        coefficient. Available methods are ::

        'pearson' : Pearson product-moment correlation
        'spearman' : Spearman rank-order correlation
        'kendall' : Kendall’s tau (ordinal data)
        'percbend' : percentage bend correlation (robust)
        'shepherd' : Shepherd's pi correlation (robust Spearman)
        'skipped' : skipped correlation (robust Spearman, requires sklearn)

    Returns
    -------
    stats : pandas DataFrame
        Test summary ::

        'n' : Sample size (after NaN removal)
        'r' : Correlation coefficient
        'CI95' : 95% parametric confidence intervals
        'r2' : R-squared
        'adj_r2' : Adjusted R-squared
        'p-val' : one or two tailed p-value
        'BF10' : Bayes Factor of the alternative hypothesis (Pearson only)

    Notes
    -----
    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed. Correlations of -1 or +1 imply
    an exact linear relationship.

    The Spearman correlation is a nonparametric measure of the monotonicity of
    the relationship between two datasets. Unlike the Pearson correlation,
    the Spearman correlation does not assume that both datasets are normally
    distributed. Correlations of -1 or +1 imply an exact monotonic
    relationship.

    Kendall’s tau is a measure of the correspondence between two rankings.
    Values close to 1 indicate strong agreement, values close to -1 indicate
    strong disagreement.

    The percentage bend correlation [1]_ is a robust method that
    protects against univariate outliers.

    The Shepherd's pi [2]_ and skipped [3]_, [4]_ correlations are both robust
    methods that return Spearman's rho after removal of bivariate outliers.
    Note that the skipped correlation requires that the scikit-learn
    package is installed (for computing the minimum covariance determinant).

    Please note that rows with NaN are automatically removed.

    If method='pearson', the JZS Bayes Factor is approximated using the
    formula described in ref [5]_:

    .. math::

        BF_{10} = \dfrac{\sqrt{n/2}}{\Gamma(1/2)}\int_{0}^{\infty}
        \exp\left[\dfrac{n-2}{2}\log(1+g) - \dfrac{n-1}{2}
        \log\left(1 + (1-r^2)g\right) - \dfrac{3}{2}\log(g)
        - \dfrac{n}{2g}\right] dg

    where **n** is the sample size and **r** is the Pearson correlation
    coefficient.


    References
    ----------
    .. [1] Wilcox, R.R., 1994. The percentage bend correlation coefficient.
       Psychometrika 59, 601–616. https://doi.org/10.1007/BF02294395

    .. [2] Schwarzkopf, D.S., De Haas, B., Rees, G., 2012. Better ways to
       improve standards in brain-behavior correlation analysis. Front.
       Hum. Neurosci. 6, 200. https://doi.org/10.3389/fnhum.2012.00200

    .. [3] Rousselet, G.A., Pernet, C.R., 2012. Improving standards in
       brain-behavior correlation analyses. Front. Hum. Neurosci. 6, 119.
       https://doi.org/10.3389/fnhum.2012.00119

    .. [4] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust correlation
       analyses: false positive and power validation using a new open
       source matlab toolbox. Front. Psychol. 3, 606.
       https://doi.org/10.3389/fpsyg.2012.00606

    .. [5] Wetzels, R., Wagenmakers, E.-J., 2012. A default Bayesian
       hypothesis test for correlations and partial correlations.
       Psychon. Bull. Rev. 19, 1057–1064.
       https://doi.org/10.3758/s13423-012-0295-x


    Examples
    --------
    1. Pearson correlation

        >>> # Generate random correlated samples
        >>> np.random.seed(123)
        >>> mean, cov = [4, 6], [(1, .5), (.5, 1)]
        >>> x, y = np.random.multivariate_normal(mean, cov, 30).T
        >>> # Compute Pearson correlation
        >>> from pingouin import corr
        >>> corr(x, y)
            method   n   r      CI95%         r2     adj_r2  p-val   BF10
            pearson  30  0.491  [0.16, 0.72]  0.242  0.185   0.0058  6.135

    2. Pearson correlation with two outliers

        >>> x[3], y[5] = 12, -8
        >>> corr(x, y)
            method   n    r      CI95%          r2     adj_r2  p-val  BF10
            pearson  30   0.147  [-0.23, 0.48]  0.022  -0.051  0.439  0.19

    3. Spearman correlation

        >>> corr(x, y, method="spearman")
            method    n   r      CI95%         r2     adj_r2  p-val
            spearman  30  0.401  [0.05, 0.67]  0.161  0.099   0.028

    4. Percentage bend correlation (robust)

        >>> corr(x, y, method='percbend')
            method    n   r      CI95%         r2     adj_r2  p-val
            percbend  30  0.389  [0.03, 0.66]  0.151  0.089   0.034

    5. Shepherd's pi correlation (robust)

        >>> corr(x, y, method='shepherd')
            method    n   r      CI95%         r2     adj_r2  p-val
            shepherd  30  0.437  [0.09, 0.69]  0.191  0.131   0.020

    6. Skipped spearman correlation (robust)

        >>> corr(x, y, method='skipped')
            method    n   r      CI95%         r2     adj_r2  p-val
            skipped   30  0.437  [0.09, 0.69]  0.191  0.131   0.020

    7. One-tailed Spearman correlation

        >>> corr(x, y, tail="one-sided", method='spearman')
            method    n   r      CI95%         r2     adj_r2  p-val
            spearman  30  0.401  [0.05, 0.67]  0.161  0.099   0.014

    8. Using columns of a pandas dataframe

        >>> import pandas as pd
        >>> data = pd.DataFrame({'x': x, 'y': y})
        >>> corr(data['x'], data['y'])
            method   n    r      CI95%          r2     adj_r2  p-val  BF10
            pearson  30   0.147  [-0.23, 0.48]  0.022  -0.051  0.439  0.19
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Check size
    if x.size != y.size:
        raise ValueError('x and y must have the same length.')

    # Remove NA
    x, y = _remove_na(x, y, paired=True)
    nx = x.size

    # Compute correlation coefficient
    if method == 'pearson':
        r, pval = pearsonr(x, y)
    elif method == 'spearman':
        r, pval = spearmanr(x, y)
    elif method == 'kendall':
        r, pval = kendalltau(x, y)
    elif method == 'percbend':
        r, pval = percbend(x, y)
    elif method == 'shepherd':
        r, pval = shepherd(x, y)
    elif method == 'skipped':
        r, pval, _ = skipped(x, y, method='spearman')
    else:
        raise ValueError('Method not recognized.')

    # Compute adj_r2
    adj_r2 = 1 - (((1 - r**2) * (nx - 1)) / (nx - 3))

    # Compute the parametric 95% confidence interval
    ci = compute_esci(stat=r, nx=nx, ny=nx, eftype='r')

    stats = pd.DataFrame({}, index=[method])
    stats['n'] = nx
    stats['r'] = np.round(r, 3)
    stats['CI95%'] = [ci]
    stats['r2'] = np.round(r**2, 3)
    stats['adj_r2'] = np.round(adj_r2, 3)
    stats['p-val'] = pval if tail == 'two-sided' else .5 * pval

    # Compute the BF10 for Pearson correlation only
    if method == 'pearson' and nx < 1000:
        stats['BF10'] = bayesfactor_pearson(r, nx)

    col_order = ['n', 'r', 'CI95%', 'r2', 'adj_r2', 'p-val', 'BF10']
    stats = stats.reindex(columns=col_order)
    stats.dropna(how='all', axis=1, inplace=True)
    return stats
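
The JZS Bayes Factor integral from the Notes can be evaluated numerically. The snippet below is a sketch of that formula only, not pingouin.bayesfactor_pearson; it assumes numpy and scipy are installed.

# Sketch: numerical evaluation of the Wetzels & Wagenmakers (2012) JZS BF10
# shown in the Notes (not Pingouin's bayesfactor_pearson implementation).
import numpy as np
from scipy.integrate import quad
from scipy.special import gamma

def jzs_bf10(r, n):
    def integrand(g):
        return np.exp(((n - 2) / 2) * np.log1p(g)
                      - ((n - 1) / 2) * np.log1p((1 - r**2) * g)
                      - 1.5 * np.log(g) - n / (2 * g))
    integral, _ = quad(integrand, 0, np.inf)
    return np.sqrt(n / 2) / gamma(0.5) * integral

# Compare with the BF10 values shown in the docstring examples above
print(jzs_bf10(r=0.491, n=30))
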
Example #9
def rm_corr(data=None, x=None, y=None, subject=None, tail='two-sided'):
    """Repeated measures correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Dataframe.
    x, y : string
        Name of columns in ``data`` containing the two dependent variables.
    subject : string
        Name of column in ``data`` containing the subject indicator.
    tail : string
        Specify whether to return 'one-sided' or 'two-sided' p-value.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'r'``: Repeated measures correlation coefficient
        * ``'dof'``: Degrees of freedom
        * ``'pval'``: one or two tailed p-value
        * ``'CI95'``: 95% parametric confidence intervals
        * ``'power'``: achieved power of the test (= 1 - type II error).

    See also
    --------
    plot_rm_corr

    Notes
    -----
    Repeated measures correlation (rmcorr) is a statistical technique
    for determining the common within-individual association for paired
    measures assessed on two or more occasions for multiple individuals.

    From `Bakdash and Marusich (2017)
    <https://doi.org/10.3389/fpsyg.2017.00456>`_:

        *Rmcorr accounts for non-independence among observations using analysis
        of covariance (ANCOVA) to statistically adjust for inter-individual
        variability. By removing measured variance between-participants,
        rmcorr provides the best linear fit for each participant using parallel
        regression lines (the same slope) with varying intercepts.
        Like a Pearson correlation coefficient, the rmcorr coefficient
        is bounded by − 1 to 1 and represents the strength of the linear
        association between two variables.*

    Results have been tested against the
    `rmcorr <https://github.com/cran/rmcorr>`_ R package.

    Please note that missing values are automatically removed from the
    dataframe (listwise deletion).

    Examples
    --------
    >>> import pingouin as pg
    >>> df = pg.read_dataset('rm_corr')
    >>> pg.rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
                   r  dof      pval           CI95%     power
    rm_corr -0.50677   38  0.000847  [-0.71, -0.23]  0.929579

    Now plot using the :py:func:`pingouin.plot_rm_corr` function:

    .. plot::

        >>> import pingouin as pg
        >>> df = pg.read_dataset('rm_corr')
        >>> g = pg.plot_rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
    """
    from pingouin import ancova, power_corr
    # Safety checks
    assert isinstance(data, pd.DataFrame), 'Data must be a DataFrame'
    assert x in data.columns, 'The %s column is not in data.' % x
    assert y in data.columns, 'The %s column is not in data.' % y
    assert data[x].dtype.kind in 'bfiu', '%s must be numeric.' % x
    assert data[y].dtype.kind in 'bfiu', '%s must be numeric.' % y
    assert subject in data.columns, 'The %s column is not in data.' % subject
    if data[subject].nunique() < 3:
        raise ValueError('rm_corr requires at least 3 unique subjects.')

    # Remove missing values
    data = data[[x, y, subject]].dropna(axis=0)

    # Using PINGOUIN
    # For max precision, make sure rounding is disabled
    old_options = options.copy()
    options['round'] = None
    aov = ancova(dv=y, covar=x, between=subject, data=data)
    options.update(old_options)  # restore options
    bw = aov.bw_  # Beta within parameter
    sign = np.sign(bw)
    dof = int(aov.at[2, 'DF'])
    n = dof + 2
    ssfactor = aov.at[1, 'SS']
    sserror = aov.at[2, 'SS']
    rm = sign * np.sqrt(ssfactor / (ssfactor + sserror))
    pval = aov.at[1, 'p-unc']
    pval = pval * 0.5 if tail == 'one-sided' else pval
    ci = compute_esci(stat=rm, nx=n, eftype='pearson').tolist()
    pwr = power_corr(r=rm, n=n, tail=tail)
    # Convert to Dataframe
    stats = pd.DataFrame({"r": rm,
                          "dof": int(dof),
                          "pval": pval,
                          "CI95%": [ci],
                          "power": pwr}, index=["rm_corr"])
    return _postprocess_dataframe(stats)
Example #10
def partial_corr(data=None,
                 x=None,
                 y=None,
                 covar=None,
                 x_covar=None,
                 y_covar=None,
                 alternative='two-sided',
                 method='pearson'):
    """Partial and semi-partial correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Pandas Dataframe. Note that this function can also directly be used
        as a :py:class:`pandas.DataFrame` method, in which case this argument
        is no longer needed.
    x, y : string
        x and y. Must be names of columns in ``data``.
    covar : string or list
        Covariate(s). Must be names of columns in ``data``. Use a list if
        there are two or more covariates.
    x_covar : string or list
        Covariate(s) for the ``x`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``x_covar`` is removed
        from ``x`` but not from ``y``). Only one of ``covar``,  ``x_covar`` and
        ``y_covar`` can be specified.
    y_covar : string or list
        Covariate(s) for the ``y`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``y_covar`` is removed
        from ``y`` but not from ``x``). Only one of ``covar``,  ``x_covar`` and
        ``y_covar`` can be specified.
    alternative : string
        Defines the alternative hypothesis, or tail of the partial correlation. Must be one of
        "two-sided" (default), "greater" or "less". Both "greater" and "less" return a one-sided
        p-value. "greater" tests against the alternative hypothesis that the partial correlation is
        positive (greater than zero), "less" tests against the hypothesis that the partial
        correlation is negative.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\\rho` rank-order correlation

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'n'``: Sample size (after removal of missing values)
        * ``'r'``: Partial correlation coefficient
        * ``'CI95'``: 95% parametric confidence intervals around :math:`r`
        * ``'p-val'``: p-value

    See also
    --------
    corr, pcorr, pairwise_corr, rm_corr

    Notes
    -----
    Partial correlation [1]_ measures the degree of association between ``x``
    and ``y``, after removing the effect of one or more controlling variables
    (``covar``, or :math:`Z`). Practically, this is achieved by calculating the
    correlation coefficient between the residuals of two linear regressions:

    .. math:: x \\sim Z, y \\sim Z

    Like the correlation coefficient, the partial correlation
    coefficient takes on a value in the range from –1 to 1, where 1 indicates a
    perfect positive association.

    The semipartial correlation is similar to the partial correlation,
    with the exception that the set of controlling variables is only
    removed for either ``x`` or ``y``, but not both.

    Pingouin uses the method described in [2]_ to calculate the (semi)partial
    correlation coefficients and associated p-values. This method is based on
    the inverse covariance matrix and is significantly faster than the
    traditional regression-based method. Results have been tested against the
    `ppcor <https://cran.r-project.org/web/packages/ppcor/index.html>`_
    R package.

    .. important:: Rows with missing values are automatically removed from
        data.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Partial_correlation

    .. [2] https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4681537/

    Examples
    --------
    1. Partial correlation with one covariate

    >>> import pingouin as pg
    >>> df = pg.read_dataset('partial_corr')
    >>> pg.partial_corr(data=df, x='x', y='y', covar='cv1').round(3)
              n      r         CI95%  p-val
    pearson  30  0.568  [0.25, 0.77]  0.001

    2. Spearman partial correlation with several covariates

    >>> # Partial correlation of x and y controlling for cv1, cv2 and cv3
    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.521  [0.18, 0.75]  0.005

    3. Same but one-sided test

    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 alternative="greater", method='spearman').round(3)
               n      r        CI95%  p-val
    spearman  30  0.521  [0.24, 1.0]  0.003

    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 alternative="less", method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.521  [-1.0, 0.72]  0.997

    4. As a pandas method

    >>> df.partial_corr(x='x', y='y', covar=['cv1'], method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.578  [0.27, 0.78]  0.001

    5. Partial correlation matrix (returns only the correlation coefficients)

    >>> df.pcorr().round(3)
             x      y    cv1    cv2    cv3
    x    1.000  0.493 -0.095  0.130 -0.385
    y    0.493  1.000 -0.007  0.104 -0.002
    cv1 -0.095 -0.007  1.000 -0.241 -0.470
    cv2  0.130  0.104 -0.241  1.000 -0.118
    cv3 -0.385 -0.002 -0.470 -0.118  1.000

    6. Semi-partial correlation on x

    >>> pg.partial_corr(data=df, x='x', y='y', x_covar=['cv1', 'cv2', 'cv3']).round(3)
              n      r        CI95%  p-val
    pearson  30  0.463  [0.1, 0.72]  0.015
    """
    from pingouin.utils import _flatten_list
    # Safety check
    assert alternative in [
        'two-sided', 'greater', 'less'
    ], ("Alternative must be one of 'two-sided' (default), 'greater' or 'less'."
        )
    assert method in [
        'pearson', 'spearman'
    ], ('only "pearson" and "spearman" are supported for partial correlation.')
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert data.shape[0] > 2, 'Data must have at least 3 samples.'
    if covar is not None and (x_covar is not None or y_covar is not None):
        raise ValueError('Cannot specify both covar and {x,y}_covar.')
    if x_covar is not None and y_covar is not None:
        raise ValueError('Cannot specify both x_covar and y_covar.')
    assert x != covar, 'x and covar must be independent'
    assert y != covar, 'y and covar must be independent'
    assert x != y, 'x and y must be independent'
    if isinstance(covar, list):
        assert x not in covar, 'x and covar must be independent'
        assert y not in covar, 'y and covar must be independent'
    # Check that columns exist
    col = _flatten_list([x, y, covar, x_covar, y_covar])
    assert all([c in data for c in col]), 'columns are not in dataframe.'
    # Check that columns are numeric
    assert all([data[c].dtype.kind in 'bfiu' for c in col])

    # Drop rows with NaN
    data = data[col].dropna()
    n = data.shape[0]  # Number of samples
    k = data.shape[1] - 2  # Number of covariates
    assert n > 2, 'Data must have at least 3 non-NAN samples.'

    # Calculate the partial correlation matrix - similar to pingouin.pcorr()
    if method == "spearman":
        # Convert the data to rank, similar to R cov()
        V = data.rank(na_option='keep').cov()
    else:
        V = data.cov()
    Vi = np.linalg.pinv(V, hermitian=True)  # Inverse covariance matrix
    Vi_diag = Vi.diagonal()
    D = np.diag(np.sqrt(1 / Vi_diag))
    pcor = -1 * (D @ Vi @ D)  # Partial correlation matrix

    if covar is not None:
        r = pcor[0, 1]
    else:
        # Semi-partial correlation matrix
        with np.errstate(divide='ignore'):
            spcor = pcor / \
                np.sqrt(np.diag(V))[..., None] / \
                np.sqrt(np.abs(Vi_diag - Vi ** 2 / Vi_diag[..., None])).T
        if y_covar is not None:
            r = spcor[0, 1]  # y_covar is removed from y
        else:
            r = spcor[1, 0]  # x_covar is removed from x

    if np.isnan(r):
        # Correlation failed. Return NaN. When would this happen?
        return pd.DataFrame(
            {
                'n': n,
                'r': np.nan,
                'CI95%': np.nan,
                'p-val': np.nan
            },
            index=[method])

    # Compute the two-sided p-value and confidence intervals
    # https://online.stat.psu.edu/stat505/lesson/6/6.3
    pval = _correl_pvalue(r, n, k, alternative)
    ci = compute_esci(stat=r,
                      nx=(n - k),
                      ny=(n - k),
                      eftype='r',
                      decimals=6,
                      alternative=alternative)

    # Create dictionary
    stats = {
        'n': n,
        'r': r,
        'CI95%': [ci],
        'p-val': pval,
    }

    # Convert to DataFrame
    stats = pd.DataFrame(stats, index=[method])

    # Define order
    col_keep = ['n', 'r', 'CI95%', 'p-val']
    col_order = [k for k in col_keep if k in stats.keys().tolist()]
    return _postprocess_dataframe(stats)[col_order]
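
The inverse-covariance construction above (pcor = -D @ Vi @ D) is equivalent to the textbook residual definition of partial correlation (correlate the residuals of x ~ Z and y ~ Z). A minimal sketch of that equivalence on synthetic data, assuming only numpy:

# Check that the precision-matrix partial correlation used above agrees with
# the residual-based definition. Assumes only numpy.
import numpy as np

rng = np.random.RandomState(0)
z = rng.normal(size=200)
x = z + rng.normal(size=200)
y = 0.5 * z + rng.normal(size=200)

# 1) Precision-matrix route (as in partial_corr above)
V = np.cov(np.c_[x, y, z], rowvar=False)
Vi = np.linalg.pinv(V)
r_precision = -Vi[0, 1] / np.sqrt(Vi[0, 0] * Vi[1, 1])

# 2) Residual route: regress x and y on z (with intercept), correlate residuals
Z = np.c_[np.ones_like(z), z]
res_x = x - Z @ np.linalg.lstsq(Z, x, rcond=None)[0]
res_y = y - Z @ np.linalg.lstsq(Z, y, rcond=None)[0]
r_residual = np.corrcoef(res_x, res_y)[0, 1]

print(np.allclose(r_precision, r_residual))  # True
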
Example #11
def partial_corr(data=None, x=None, y=None, covar=None, x_covar=None,
                 y_covar=None, tail='two-sided', method='pearson', **kwargs):
    """Partial and semi-partial correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Pandas Dataframe. Note that this function can also directly be used
        as a :py:class:`pandas.DataFrame` method, in which case this argument
        is no longer needed.
    x, y : string
        x and y. Must be names of columns in ``data``.
    covar : string or list
        Covariate(s). Must be names of columns in ``data``. Use a list if
        there are two or more covariates.
    x_covar : string or list
        Covariate(s) for the ``x`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``x_covar`` is removed
        from ``x`` but not from ``y``). Only one of ``covar``,  ``x_covar`` and
        ``y_covar`` can be specified.
    y_covar : string or list
        Covariate(s) for the ``y`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``y_covar`` is removed
        from ``y`` but not from ``x``). Only one of ``covar``,  ``x_covar`` and
        ``y_covar`` can be specified.
    tail : string
        Specify whether to return `'one-sided'` or `'two-sided'` p-value.
        The former are simply half the latter.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\\rho` rank-order correlation
        * ``'kendall'``: Kendall's :math:`\\tau_B` correlation
          (for ordinal data)
        * ``'bicor'``: Biweight midcorrelation (robust)
        * ``'percbend'``: Percentage bend correlation (robust)
        * ``'shepherd'``: Shepherd's pi correlation (robust)
        * ``'skipped'``: Skipped correlation (robust)
    **kwargs : optional
        Optional argument(s) passed to the lower-level correlation functions.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'n'``: Sample size (after removal of missing values)
        * ``'outliers'``: number of outliers, only if a robust method was used
        * ``'r'``: Correlation coefficient
        * ``'CI95'``: 95% parametric confidence intervals around :math:`r`
        * ``'p-val'``: tail of the test

    See also
    --------
    corr, pairwise_corr, rm_corr

    Notes
    -----
    From [1]_:

        *With partial correlation, we find the correlation between x
        and y holding C constant for both x and
        y. Sometimes, however, we want to hold C constant for
        just x or just y. In that case, we compute a
        semi-partial correlation. A partial correlation is computed between
        two residuals. A semi-partial correlation is computed between one
        residual and another raw (or unresidualized) variable.*

    Note that if you are not interested in calculating the p-values [2]_
    but only the partial correlation matrix, a faster
    alternative is to use :py:func:`pingouin.pcorr` (see example 4).

    Rows with missing values are automatically removed from data. Results have
    been tested against the
    `ppcor <https://cran.r-project.org/web/packages/ppcor/index.html>`_
    R package.

    References
    ----------
    .. [1] http://faculty.cas.usf.edu/mbrannick/regression/Partial.html

    .. [2] https://online.stat.psu.edu/stat505/lesson/6/6.3

    Examples
    --------
    1. Partial correlation with one covariate

    >>> import pingouin as pg
    >>> df = pg.read_dataset('partial_corr')
    >>> pg.partial_corr(data=df, x='x', y='y', covar='cv1').round(3)
              n      r         CI95%  p-val
    pearson  30  0.568  [0.25, 0.77]  0.001

    2. Spearman partial correlation with several covariates

    >>> # Partial correlation of x and y controlling for cv1, cv2 and cv3
    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.491  [0.14, 0.73]  0.009

    3. As a pandas method

    >>> df.partial_corr(x='x', y='y', covar=['cv1'],
    ...                 method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.568  [0.26, 0.77]  0.001

    4. Partial correlation matrix (returns only the correlation coefficients)

    >>> df.pcorr().round(3)
             x      y    cv1    cv2    cv3
    x    1.000  0.493 -0.095  0.130 -0.385
    y    0.493  1.000 -0.007  0.104 -0.002
    cv1 -0.095 -0.007  1.000 -0.241 -0.470
    cv2  0.130  0.104 -0.241  1.000 -0.118
    cv3 -0.385 -0.002 -0.470 -0.118  1.000

    5. Semi-partial correlation on x

    >>> pg.partial_corr(data=df, x='x', y='y',
    ...                 x_covar=['cv1', 'cv2', 'cv3']).round(3)
              n      r        CI95%  p-val
    pearson  30  0.463  [0.1, 0.72]  0.015
    """
    from pingouin.utils import _flatten_list
    # Safety check
    assert tail in ['two-sided', 'one-sided'], (
        'tail must be "two-sided" or "one-sided".')
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert data.shape[0] > 2, 'Data must have at least 3 samples.'
    assert isinstance(x, (str, tuple)), 'x must be a string.'
    assert isinstance(y, (str, tuple)), 'y must be a string.'
    assert isinstance(covar, (str, list, type(None)))
    assert isinstance(x_covar, (str, list, type(None)))
    assert isinstance(y_covar, (str, list, type(None)))
    if covar is not None and (x_covar is not None or y_covar is not None):
        raise ValueError('Cannot specify both covar and {x,y}_covar.')
    if x_covar is not None and y_covar is not None:
        raise ValueError('Cannot specify both x_covar and y_covar.')
    assert x != covar, 'x and covar must be independent'
    assert y != covar, 'y and covar must be independent'
    assert x != y, 'x and y must be independent'
    if isinstance(covar, list):
        assert x not in covar, 'x and covar must be independent'
        assert y not in covar, 'y and covar must be independent'
    # Check that columns exist
    col = _flatten_list([x, y, covar, x_covar, y_covar])
    if isinstance(covar, str):
        covar = [covar]
    if isinstance(x_covar, str):
        x_covar = [x_covar]
    if isinstance(y_covar, str):
        y_covar = [y_covar]

    assert all([c in data for c in col]), 'columns are not in dataframe.'
    # Check that columns are numeric
    assert all([data[c].dtype.kind in 'bfiu' for c in col])

    # Drop rows with NaN
    data = data[col].dropna()
    n = data.shape[0]  # Number of samples
    k = data.shape[1] - 2  # Number of covariates
    # dof = n - k - 2
    assert n > 2, 'Data must have at least 3 non-NAN samples.'

    # Standardize (= no need for an intercept in least-square regression)
    C = (data[col] - data[col].mean(axis=0)) / data[col].std(axis=0)

    if covar is not None:
        # PARTIAL CORRELATION
        cvar = np.atleast_2d(C[covar].to_numpy())
        beta_x = np.linalg.lstsq(cvar, C[x].to_numpy(), rcond=None)[0]
        beta_y = np.linalg.lstsq(cvar, C[y].to_numpy(), rcond=None)[0]
        res_x = C[x].to_numpy() - cvar @ beta_x
        res_y = C[y].to_numpy() - cvar @ beta_y
    else:
        # SEMI-PARTIAL CORRELATION
        # Initialize "fake" residuals
        res_x, res_y = data[x].to_numpy(), data[y].to_numpy()
        if x_covar is not None:
            cvar = np.atleast_2d(C[x_covar].to_numpy())
            beta_x = np.linalg.lstsq(cvar, C[x].to_numpy(), rcond=None)[0]
            res_x = C[x].to_numpy() - cvar @ beta_x
        if y_covar is not None:
            cvar = np.atleast_2d(C[y_covar].to_numpy())
            beta_y = np.linalg.lstsq(cvar, C[y].to_numpy(), rcond=None)[0]
            res_y = C[y].to_numpy() - cvar @ beta_y

    # Compute partial correlation coefficient
    # We do not extract the p-values at this stage because they do not account
    # for the number of covariates in the degrees of freedom
    if method == 'pearson':
        r, _ = pearsonr(res_x, res_y)
    elif method == 'spearman':
        r, _ = spearmanr(res_x, res_y, **kwargs)
    elif method == 'kendall':
        r, _ = kendalltau(res_x, res_y, **kwargs)
    elif method == 'bicor':
        r, _ = bicor(res_x, res_y, **kwargs)
    elif method == 'percbend':
        r, _ = percbend(res_x, res_y, **kwargs)
    elif method == 'shepherd':
        r, _, outliers = shepherd(res_x, res_y, **kwargs)
    elif method == 'skipped':
        r, _, outliers = skipped(res_x, res_y, **kwargs)
    else:
        raise ValueError(f'Method "{method}" not recognized.')

    if np.isnan(r):
        # Correlation failed -- new in version v0.3.4, instead of raising an
        # error we just return a dataframe full of NaN (except sample size).
        # This avoids a sudden stop in pingouin.pairwise_corr.
        return pd.DataFrame({'n': n, 'r': np.nan, 'CI95%': np.nan,
                             'p-val': np.nan}, index=[method])

    # Sample size after outlier removal
    n_outliers = sum(outliers) if "outliers" in locals() else 0
    n_clean = n - n_outliers

    # Compute the two-sided p-value and confidence intervals
    # https://online.stat.psu.edu/stat505/lesson/6/6.3
    pval = _correl_pvalue(r, n_clean, k)
    ci = compute_esci(
        stat=r, nx=(n_clean - k), ny=(n_clean - k), eftype='r', decimals=6)

    # Create dictionary
    stats = {
        'n': n,
        'r': r,
        'CI95%': [ci],
        'p-val': pval if tail == 'two-sided' else .5 * pval,
    }

    if method in ['shepherd', 'skipped']:
        stats['outliers'] = n_outliers

    # Convert to DataFrame
    stats = pd.DataFrame.from_records(stats, index=[method])

    # Define order
    col_keep = ['n', 'outliers', 'r', 'CI95%', 'p-val']
    col_order = [k for k in col_keep if k in stats.keys().tolist()]
    return _postprocess_dataframe(stats)[col_order]
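
For the semi-partial case quoted in the Notes (the covariate is residualized out of only one variable), the residual trick used in the body can be written compactly. A minimal sketch on synthetic data, assuming only numpy and scipy:

# Semi-partial correlation sketch: remove the covariate from x only, then
# correlate the x-residuals with the raw y values.
import numpy as np
from scipy.stats import pearsonr

rng = np.random.RandomState(1)
cv = rng.normal(size=150)                   # covariate
x = cv + rng.normal(size=150)
y = 0.3 * x + rng.normal(size=150)

C = np.c_[np.ones_like(cv), cv]             # design matrix with intercept
beta_x = np.linalg.lstsq(C, x, rcond=None)[0]
res_x = x - C @ beta_x                      # covariate removed from x only
r_semipartial, _ = pearsonr(res_x, y)       # y is left unresidualized
print(round(r_semipartial, 3))
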