Example #1
import pingouin as pg


def build_partial_corr(corr_df, target, covar, method='pearson', padjust='fdr_bh', pval=0.05, covar_name=None):
    """
    Builds partial correlation DataFrame from corr_df of the target survey, controlling for covar.
    
    corr_df (pd.DataFrame): correlation frame, assuming each row is an observation
    target (str): targe column, can be a string prefix or suffix
    covar (list): a list of covariates to control for
    covar_name (str): optional name for covariates in the display
    """
    
    partial_corr = pg.pairwise_corr(data=corr_df, covar=covar, method=method)
    _, p_adj = pg.multicomp(partial_corr['p-unc'].values, alpha=pval, method=padjust)
    partial_corr['p-corr'] = p_adj
    
    partial_corr = partial_corr.loc[(partial_corr['p-corr'] < pval) & (~partial_corr['X'].str.contains(target)) & (partial_corr['Y'].str.contains(target))]
    partial_corr['r_ctl'] = partial_corr['r']
    partial_corr['p_ctl'] = partial_corr['p-corr']
    # Label the covariates; default to the joined covariate names so the
    # 'covar' column always exists for the selection below.
    partial_corr['covar'] = covar_name if covar_name is not None else ', '.join(covar)

    partial_corr = partial_corr[['X', 'Y', 'covar', 'r_ctl', 'p_ctl']]
    
    # drop the controlling covars for the raw pairwise correlation
    pairwise_corr = pg.pairwise_corr(data=corr_df.drop(covar, axis='columns'), method=method, padjust=padjust)
    pairwise_corr['r_unctl'] = pairwise_corr['r']
    pairwise_corr['p_unctl'] = pairwise_corr['p-corr']

    partial_corr = partial_corr.merge(pairwise_corr[['X', 'Y', 'r_unctl', 'p_unctl', 'n']], on=['X', 'Y'], how='left').sort_values('p_ctl')
    return partial_corr.style.set_caption(method)
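
# A minimal usage sketch (not from the original source): toy data with
# hypothetical column names, where 'phq' is the target prefix and age/gender
# are the controlled covariates. With pure-noise data the table may be empty.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
toy = pd.DataFrame(rng.normal(size=(100, 6)),
                   columns=['sleep', 'steps', 'phq_1', 'phq_2', 'age', 'gender'])
build_partial_corr(toy, target='phq', covar=['age', 'gender'],
                   covar_name='demographics')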
Example #2
import pingouin as pg
from IPython.display import display


def display_all_corr(corr_df, cols, target, alpha=0.1, method="pearson"):
    """Displays all correlations for every feature in cols against target. 
    
    Sorts by feature group.
    Also stars the significant p-values.
    
    Args:
        corr_df (pd.df): pandas correlation frame
        cols (list): list of feature columns
        target (str): the target survey column name
        alpha (float): the specified significance level
        method (str): whether "pearson" or "spearman" correlations
    
    Returns:
        pd.df: styled correlation matrix
    
    """
    p_val_format = "{:0.4f}"
    style_p_val = lambda x: p_val_format.format(x) + ("*" if x < alpha else "")
    pair_corr_df = pg.pairwise_corr(corr_df, columns=[cols, [target]], method=method, padjust="fdr_bh")
    pair_corr_df['p-corr'] = pair_corr_df['p-corr'].apply(style_p_val)
    pair_corr_df['p-unc'] = pair_corr_df['p-unc'].apply(style_p_val)

    title = "{}, * p < {}".format(method, alpha)
    display(pair_corr_df[['X', 'Y', 'n', 'r', 'p-unc', 'p-corr', 'p-adjust']].style.set_caption(title))
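
# A minimal usage sketch (hypothetical data and column names): stars the
# features whose FDR-corrected p-value beats alpha against 'phq_total'.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
toy = pd.DataFrame(rng.normal(size=(80, 4)),
                   columns=['sleep', 'steps', 'screen_time', 'phq_total'])
display_all_corr(toy, cols=['sleep', 'steps', 'screen_time'],
                 target='phq_total', alpha=0.05, method='spearman')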
Example #3
import pandas as pd
import pingouin as pg


def build_corr_table(df, x, cols, target, title, method='pearson', groups=['all', 'no_symp', 'soc_anx', 'gen_anx', 'dep_anx']):
    """Builds a concise correlation table between feature x and target across all subgroups."""
    tables = []

    for group in groups:
        if group == 'all':
            sel_df = df
        else:
            sel_df = df[df['baseline_cluster'] == group]
        corr_df = pg.pairwise_corr(sel_df, columns=[cols, [target]], nan_policy='pairwise', padjust="fdr_bh", method=method)
        corr_df['group'] = group
        #print(corr_df.head())
        corr_df = corr_df[corr_df['X'] == x]
        tables.append(corr_df[['group', 'n', 'r', 'p-corr', 'p-adjust']])

    # DataFrame.append was removed in pandas 2.0; concatenate per-group results
    table = pd.concat(tables, ignore_index=True)
        
    return table.style.set_caption(title)
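
# A minimal usage sketch (hypothetical data): the frame needs a
# 'baseline_cluster' column holding the subgroup labels used above.
import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
toy = pd.DataFrame(rng.normal(size=(120, 3)),
                   columns=['hr_mean', 'sleep', 'phq_total'])
toy['baseline_cluster'] = rng.choice(
    ['no_symp', 'soc_anx', 'gen_anx', 'dep_anx'], size=120)
build_corr_table(toy, x='hr_mean', cols=['hr_mean', 'sleep'],
                 target='phq_total', title='hr_mean vs phq_total')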
Example #4
import pandas as pd
import pingouin as pg


def run_pheno_partialcorrs(df_phenos, df_z, method='pearson'):
    """For each phenotype, correlate it with every column of df_z while
    controlling for all the remaining phenotypes."""
    df_input = pd.concat((df_phenos, df_z), axis=1)
    # output schema; Pearson additionally reports the Bayes factor (BF10)
    if method == 'pearson':
        out_cols = ['pheno', 'variable', 'coef', 'p', 'BF10']
    else:
        out_cols = ['pheno', 'variable', 'coef', 'p']
    out_frames = []
    phenos = list(df_phenos.columns)

    for pheno in phenos:
        print(pheno)
        if method == 'pearson':
            df_tmp = pd.DataFrame(index=df_z.columns,
                                  columns=['coef', 'p', 'BF10'])
        else:
            df_tmp = pd.DataFrame(index=df_z.columns, columns=['coef', 'p'])

        phenos_cov = phenos.copy()
        phenos_cov.remove(pheno)
        results = pg.pairwise_corr(data=df_input,
                                   columns=[[pheno],
                                            list(df_z.columns)],
                                   covar=phenos_cov,
                                   method=method)
        results.set_index('Y', inplace=True)
        df_tmp.loc[:, 'coef'] = results['r']
        df_tmp.loc[:, 'p'] = results['p-unc']
        if method == 'pearson':
            df_tmp.loc[:, 'BF10'] = results['BF10'].astype(float)

        # collect per-phenotype results
        df_tmp.reset_index(inplace=True)
        df_tmp.rename(index=str, columns={'index': 'variable'}, inplace=True)
        df_tmp['pheno'] = pheno
        out_frames.append(df_tmp)

    # DataFrame.append was removed in pandas 2.0; concatenate once at the end
    if out_frames:
        df_out = pd.concat(out_frames, sort=False)
    else:
        df_out = pd.DataFrame(columns=out_cols)
    df_out.set_index(['pheno', 'variable'], inplace=True)

    return df_out
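
# A minimal usage sketch (hypothetical data): each phenotype is correlated
# with every z column while the remaining phenotypes act as covariates.
import numpy as np

rng = np.random.default_rng(3)
df_phenos = pd.DataFrame(rng.normal(size=(60, 2)), columns=['age', 'iq'])
df_z = pd.DataFrame(rng.normal(size=(60, 3)), columns=['z1', 'z2', 'z3'])
run_pheno_partialcorrs(df_phenos, df_z, method='spearman')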
Example #5
    def test_pandas(self):
        """Test pandas method.
        """
        # Test the ANOVA (Pandas)
        aov = df.anova(dv='Scores', between='Group', detailed=True)
        assert aov.equals(
            pg.anova(dv='Scores', between='Group', detailed=True, data=df))
        aov3_ss1 = df_aov3.anova(dv='Cholesterol',
                                 between=['Sex', 'Drug'],
                                 ss_type=1)
        aov3_ss2 = df_aov3.anova(dv='Cholesterol',
                                 between=['Sex', 'Drug'],
                                 ss_type=2)
        aov3_ss2_pg = pg.anova(dv='Cholesterol',
                               between=['Sex', 'Drug'],
                               data=df_aov3,
                               ss_type=2)
        assert not aov3_ss1.equals(aov3_ss2)
        assert aov3_ss2.round(3).equals(aov3_ss2_pg.round(3))

        # Test the Welch ANOVA (Pandas)
        aov = df.welch_anova(dv='Scores', between='Group')
        assert aov.equals(pg.welch_anova(dv='Scores', between='Group',
                                         data=df))

        # Test the ANCOVA
        aov = df_anc.ancova(dv='Scores', covar='Income',
                            between='Method').round(3)
        assert (aov.equals(
            pg.ancova(data=df_anc,
                      dv='Scores',
                      covar='Income',
                      between='Method').round(3)))

        # Test the repeated measures ANOVA (Pandas)
        aov = df.rm_anova(dv='Scores',
                          within='Time',
                          subject='Subject',
                          detailed=True)
        assert (aov.equals(
            pg.rm_anova(dv='Scores',
                        within='Time',
                        subject='Subject',
                        detailed=True,
                        data=df)))

        # FDR-corrected post hocs with Hedges' g effect size
        ttests = df.pairwise_tests(dv='Scores',
                                   within='Time',
                                   subject='Subject',
                                   padjust='fdr_bh',
                                   effsize='hedges')
        assert (ttests.equals(
            pg.pairwise_tests(dv='Scores',
                              within='Time',
                              subject='Subject',
                              padjust='fdr_bh',
                              effsize='hedges',
                              data=df)))

        # Pairwise Tukey
        tukey = df.pairwise_tukey(dv='Scores', between='Group')
        assert tukey.equals(
            pg.pairwise_tukey(data=df, dv='Scores', between='Group'))

        # Test two-way mixed ANOVA
        aov = df.mixed_anova(dv='Scores',
                             between='Group',
                             within='Time',
                             subject='Subject',
                             correction=False)
        assert (aov.equals(
            pg.mixed_anova(dv='Scores',
                           between='Group',
                           within='Time',
                           subject='Subject',
                           correction=False,
                           data=df)))

        # Test pairwise correlations
        corrs = data.pairwise_corr(columns=['X', 'M', 'Y'], method='spearman')
        corrs2 = pg.pairwise_corr(data=data,
                                  columns=['X', 'M', 'Y'],
                                  method='spearman')
        assert corrs['r'].equals(corrs2['r'])

        # Test partial correlation
        corrs = data.partial_corr(x='X', y='Y', covar='M', method='spearman')
        corrs2 = pg.partial_corr(x='X',
                                 y='Y',
                                 covar='M',
                                 method='spearman',
                                 data=data)
        assert corrs['r'].equals(corrs2['r'])

        # Test partial correlation matrix (compare with the ppcor package)
        corrs = data.iloc[:, :5].pcorr().round(3)
        np.testing.assert_array_equal(corrs.iloc[0, :].to_numpy(),
                                      [1, 0.392, 0.06, -0.014, -0.149])
        # Now compare against Pingouin's own partial_corr function
        corrs = data[['X', 'Y', 'M']].pcorr()
        corrs2 = data.partial_corr(x='X', y='Y', covar='M')
        assert np.isclose(corrs.at['X', 'Y'], corrs2.at['pearson', 'r'])

        # Test rcorr (correlation matrix with p-values)
        # We compare against Pingouin pairwise_corr function
        corrs = df_corr.rcorr(padjust='holm', decimals=4)
        corrs2 = df_corr.pairwise_corr(padjust='holm').round(4)
        assert corrs.at['Neuroticism', 'Agreeableness'] == '*'
        assert (corrs.at['Agreeableness',
                         'Neuroticism'] == str(corrs2.at[2, 'r']))
        corrs = df_corr.rcorr(padjust='holm', stars=False, decimals=4)
        assert (corrs.at['Neuroticism',
                         'Agreeableness'] == str(corrs2.at[2,
                                                           'p-corr'].round(4)))
        corrs = df_corr.rcorr(upper='n', decimals=5)
        corrs2 = df_corr.pairwise_corr().round(5)
        assert corrs.at['Extraversion', 'Openness'] == corrs2.at[4, 'n']
        assert corrs.at['Openness', 'Extraversion'] == str(corrs2.at[4, 'r'])
        # Method = spearman does not work with Python 3.5 on Travis?
        # Instead it seems to return the Pearson correlation!
        df_corr.rcorr(method='spearman')
        df_corr.rcorr()

        # Test mediation analysis
        med = data.mediation_analysis(x='X', m='M', y='Y', seed=42, n_boot=500)
        np.testing.assert_array_equal(med.loc[:, 'coef'].round(4).to_numpy(),
                                      [0.5610, 0.6542, 0.3961, 0.0396, 0.3565])
Example #6
else:
  print('There is not enough evidence to reject the null hypothesis.\nFailed to reject the null hypothesis.')

"""$ p < 0.005 $

Since our p value was far lesser than the statistical significance value, we can reliably reject the null hypothesis, in favour of the alternative.

Additionally, we obtained a negative t-test statistic score, which implies that the mean of international students in UK universities is greater than in US universities.

We reject $H_o : \mu_x = \mu_y$  , where<br><br>$x :$Set of UK international_students<br>$y :$ Set of US international_students

### Correlation
"""

# Table of pairwise correlations between all numerical features
corr_data = pg.pairwise_corr(data, columns=numerical_vars)
corr_data

"""**Analyze the correlation between `teaching` and `total_score`**"""

# Plot a scatter plot between teaching and total_score
top2011 = data.loc[data['year'] == 2011].head(100)

x = top2011['total_score']
y = top2011['teaching']

plt.figure(figsize=(5, 5))
ax = sns.scatterplot(x=x, y=y)
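
# A quick numeric check of the visual trend (a hedged addition, not from the
# original notebook): Pearson correlation between teaching and total_score.
print(pg.corr(x=x, y=y)[['n', 'r', 'p-val']])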

"""Hence, we can conclude that universities with greater overall score have better teaching scores. The relation follows a roughly linear path with positive slope.
Example #7
                           usecols="B:Q")
io_2050rcp8p5 = pd.read_excel('genie_outpout.xlsx',
                              sheet_name='io_STATS_foram_only',
                              usecols="T:AI")
io_2100rcp8p5 = pd.read_excel('genie_outpout.xlsx',
                              sheet_name='io_STATS_foram_only',
                              usecols="AL:BA")
io_2100rcp6 = pd.read_excel('genie_outpout.xlsx',
                            sheet_name='io_STATS_foram_only',
                            usecols="BD:BS")

###########################################################################################################################
## spearman pairwise_corr, foram vs all
#subpolar
corr_subpolar_present = pd.DataFrame(
    pg.pairwise_corr(subpolar_present, columns=['foram'], method='spearman'))
corr_subpolar_2050rcp8p5 = pd.DataFrame(
    pg.pairwise_corr(subpolar_2050rcp8p5,
                     columns=['foram.1'],
                     method='spearman'))
corr_subpolar_2100rcp8p5 = pd.DataFrame(
    pg.pairwise_corr(subpolar_2100rcp8p5,
                     columns=['foram.2'],
                     method='spearman'))
#corr_subpolar_2100rcp6    = pd.DataFrame(pg.pairwise_corr(subpolar_2100rcp6, columns=['foram.3'], method='spearman'))
#temperate
corr_temp_present = pd.DataFrame(
    pg.pairwise_corr(temp_present, columns=['foram'], method='spearman'))
corr_temp_2050rcp8p5 = pd.DataFrame(
    pg.pairwise_corr(temp_2050rcp8p5, columns=['foram.1'], method='spearman'))
corr_temp_2100rcp8p5 = pd.DataFrame(
    pg.pairwise_corr(temp_2100rcp8p5, columns=['foram.2'], method='spearman'))
Example #8
    x_pos = expand.slider("X position for the label", 0.0, max_x, start_x,
                          (max_x / 100 + 0.1))
    y_pos = expand.slider("Y position for the label", 0.0, max_y, start_y,
                          (max_y / 100 + 0.1))

    sns.set(style='white', font_scale=font_scale)

    st.success("Correlation results")

    corr_result = pg.corr(x=df[x_var1], y=df[x_var2], method=method_selected)
    st.write(corr_result)

    st.success("Correlation matrices")

    st.write(
        pg.pairwise_corr(df, padjust='bonf',
                         method=method_selected).sort_values(by=['p-unc']))

    st.write(df.rcorr(padjust='bonf'))

    st.success("Correlation plot with distributions is being generated")
    fig = plt.figure(figsize=(12, 6))
    g = sns.JointGrid(data=df, x=x_var1, y=x_var2, height=6)
    g = g.plot_joint(sns.regplot, color="xkcd:muted blue")
    g = g.plot_marginals(sns.histplot,  # histplot supersedes the removed distplot
                         kde=False,
                         bins=12,
                         color="xkcd:bluey grey")
    if plot_r:
        g.ax_joint.text(
            x_pos,
            y_pos,
Example #9
import pandas as pd
import numpy as np
import pingouin as pg

io = r'Lake_v2.xls'

data = pd.read_excel(io, sheet_name=2, usecols=[2, 3, 4])
print(data.head())
print(len(data))
for i in range(len(data)):
    print(data.loc[i])

corr = pg.pairwise_corr(data, method='pearson')
spearman_corr = pg.pairwise_corr(data, method='spearman')
kendall_corr = pg.pairwise_corr(data, method='kendall')
bicor_corr = pg.pairwise_corr(data, method='bicor')
skipped_corr = pg.pairwise_corr(data, method='skipped')
# DataFrame.append was removed in pandas 2.0; concatenate the methods instead
corr = pd.concat([corr, spearman_corr, kendall_corr, bicor_corr, skipped_corr],
                 ignore_index=True)

corr.to_excel('correlation.xlsx')  # pandas no longer writes legacy .xls files
Example #10
    def test_pandas(self):
        """Test pandas method.
        """
        # Test the ANOVA (Pandas)
        aov = df.anova(dv='Scores', between='Group', detailed=True)
        assert aov.equals(
            pg.anova(dv='Scores', between='Group', detailed=True, data=df))

        # Test the Welch ANOVA (Pandas)
        aov = df.welch_anova(dv='Scores', between='Group')
        assert aov.equals(pg.welch_anova(dv='Scores', between='Group',
                                         data=df))

        # Test the repeated measures ANOVA (Pandas)
        aov = df.rm_anova(dv='Scores',
                          within='Time',
                          subject='Subject',
                          detailed=True)
        assert aov.equals(
            pg.rm_anova(dv='Scores',
                        within='Time',
                        subject='Subject',
                        detailed=True,
                        data=df))

        # FDR-corrected post hocs with Hedges' g effect size
        ttests = df.pairwise_ttests(dv='Scores',
                                    within='Time',
                                    subject='Subject',
                                    padjust='fdr_bh',
                                    effsize='hedges')
        assert ttests.equals(
            pg.pairwise_ttests(dv='Scores',
                               within='Time',
                               subject='Subject',
                               padjust='fdr_bh',
                               effsize='hedges',
                               data=df))

        # Test two-way mixed ANOVA
        aov = df.mixed_anova(dv='Scores',
                             between='Group',
                             within='Time',
                             subject='Subject',
                             correction=False)
        assert aov.equals(
            pg.mixed_anova(dv='Scores',
                           between='Group',
                           within='Time',
                           subject='Subject',
                           correction=False,
                           data=df))

        # Test pairwise correlations
        corrs = data.pairwise_corr(columns=['X', 'M', 'Y'], method='spearman')
        corrs2 = pg.pairwise_corr(data=data,
                                  columns=['X', 'M', 'Y'],
                                  method='spearman')
        assert corrs['r'].equals(corrs2['r'])

        # Test partial correlation
        corrs = data.partial_corr(x='X', y='Y', covar='M', method='spearman')
        corrs2 = pg.partial_corr(x='X',
                                 y='Y',
                                 covar='M',
                                 method='spearman',
                                 data=data)
        assert corrs['r'].equals(corrs2['r'])

        # Test partial correlation matrix (compare with the ppcor package)
        corrs = data.pcorr().round(3)
        np.testing.assert_array_equal(corrs.iloc[0, :].values,
                                      [1, 0.392, 0.06, -0.014, -0.149])
        # Now compare against Pingouin's own partial_corr function
        corrs = data[['X', 'Y', 'M']].pcorr()
        corrs2 = data.partial_corr(x='X', y='Y', covar='M')
        assert round(corrs.loc['X', 'Y'], 3) == corrs2.loc['pearson', 'r']

        # Test mediation analysis
        med = data.mediation_analysis(x='X', m='M', y='Y', seed=42, n_boot=500)
        np.testing.assert_array_equal(med.loc[:, 'coef'].round(4).values,
                                      [0.5610, 0.6542, 0.3961, 0.0396, 0.3565])
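Example #11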
columns = list(df)
features = []
for column in df:
    # epiclist holds the values of one feature across all iterations
    epiclist = df[column]
    features.append(epiclist)

# generate a heatmap that we can use to find strongly correlated values
plt.figure(figsize=(12, 10))
cor = df.corr(method='pearson')
sns.heatmap(cor, annot=True)  # assumes seaborn is imported as sns above

print("checkpoint")
print(
    pg.pairwise_corr(df).sort_values(by=['p-unc'])[[
        'X', 'Y', 'n', 'r', 'p-unc'
    ]])

print("checkpoint 2")
print(cor)

pingouin_raw_corr = pg.pairwise_corr(df).sort_values(by=['p-unc'])[[
    'X', 'Y', 'n', 'r', 'p-unc'
]]

pingouin_corr = pd.DataFrame()

for x in df.columns:
    for y in df.columns:
        df[x] = np.nan_to_num(df[x])
        df[y] = np.nan_to_num(df[y])
Example #12
import pandas as pd
import pingouin as pg
import statsmodels.stats as sm
import statsmodels.stats.multitest  # makes sm.multitest available


def correlations(df, var_list, tail, correction, method, cov_list, *args):
    """
    Compute correlations por each group of variables dropping variables of the same group (i.e. if
    we have Depression_mother, Depression_father, Depression_daughter, correlations between these
    variables that are part of the same set of variables are excluded, only if we specified a common
    name of these variables ('Depression') in *args. Otherwise they are included.
    Apply a correction for multiple comparisons for all uncorrected p-values of each group of variables,
    not to all the correlations in the DataFrame. Joins everything in an unique DataFrame.

    Input:
        df: DataFame with data.
        var_list: list of lists with variables for correlations.
        tail (str): 'one-sided' or 'two-sided'.
        correction (str): check statsmodels.multitest.multipletest for correction options.
        method (str): 'pearson', 'spearman', etc. (check methods for pg.pairwise_corr).
        cov_list: list of covariates.
        *args (str): strings to exclude intra group correlations from DataFrame and apply
                     correction without these correlations. The string should be a name that
                     is common to all the names of the variables that are part of the group.

                     Specify one string for the X column of the DataFrame:
                     - 'Depression' if is the common string - Depression_mother, Depression_father, etc.
                        and we just have that group for the X column
                     - 'Depression|Anger' if we have two groups - Depression_mother, Depression_father, etc. +
                        Anger_mother, Anger_father, etc.

                     Specify another string with the same logic for the Y column of the DataFrame.
    Output:
        df: DataFrame with correlation values dropping correlations between variables of the same group.
    """

    list_of_dfs = []

    # list_to_exclude_both_rows = [excludeintra_X, excludeintra_Y, excludeintra_Z]
    df_corr = pd.DataFrame(
        pg.pairwise_corr(df,
                         columns=var_list,
                         covar=cov_list,
                         tail=tail,
                         method=method,
                         nan_policy='pairwise')).round(5)

    # drop intra task correlations. Reset index because of dropped rows
    # to ease concat
    for row_excl in args:
        # drop rows that contains the same string in both columns X and Y
        df_corr = df_corr[~(
            (df_corr['X'].str.contains(row_excl)) &
            (df_corr['Y'].str.contains(row_excl)))].reset_index(drop=True)

    # apply the correction to the p-values of the remaining (non-intra-group)
    # correlations; multipletests returns rows, so transpose and rename columns
    FDR_corr = list(
        sm.multitest.multipletests(df_corr['p-unc'],
                                   alpha=0.05,
                                   method=correction,
                                   is_sorted=False,
                                   returnsorted=False))
    # store the Sidak- and Bonferroni-corrected alphas in a dict to ease concat
    alpha_SidakBonf = dict(alphacSidak=FDR_corr[2], alphacBonf=FDR_corr[3])
    # select two first elements, transpose and rename columns
    FDR_corr = pd.DataFrame(
        FDR_corr[0:2]).transpose().rename(columns={
            0: 'FDR',
            1: 'pvals_corrected'
        })
    # add columns of p-values with FDR correction to df
    df_corr_FDR = pd.concat([df_corr, FDR_corr],
                            axis=1,
                            join='outer',
                            ignore_index=False)
    # add columns with the corrected alphas
    df_corr_FDR['alphacSidak'], df_corr_FDR['alphacBonf'] = alpha_SidakBonf[
        'alphacSidak'], alpha_SidakBonf['alphacBonf']
    # list of dfs, each df contains correlations for each frequency band
    list_of_dfs.append(df_corr_FDR)

    # concat list of dfs into 1 df
    # reset index. After concat remains the index of each df on its own rows
    df = pd.concat(list_of_dfs).reset_index(drop=True)

    return df
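
# A minimal, hypothetical call of the function above: variable names are
# illustrative, and 'Depression' drops the intra-group Depression pair before
# the multiple-comparison correction. A flat column list is passed so that
# all pairwise combinations, including the intra-group pair, are formed.
import numpy as np

rng = np.random.default_rng(4)
toy = pd.DataFrame(rng.normal(size=(50, 5)),
                   columns=['Depression_mother', 'Depression_father',
                            'Anxiety_child', 'IQ_child', 'age'])
out = correlations(toy,
                   ['Depression_mother', 'Depression_father',
                    'Anxiety_child', 'IQ_child'],
                   'two-sided', 'fdr_bh', 'pearson', ['age'], 'Depression')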
Example #13
#Do people who use Signal use security in choosing
#instant messaging tools?
#Simple row-to-row correlation
pg.corr(x=df['Q3-17'], y=df['Q34-31'])

# In[ ]:

pg.corr(x=df['Q40-0'], y=df['Q3-16'])

# In[ ]:

corr = pg.pairwise_corr(df,
                        columns=[['Q7-7'],
                                 [
                                     'Q3-0', 'Q3-1', 'Q3-2', 'Q3-3', 'Q3-4',
                                     'Q3-5', 'Q3-6', 'Q3-7', 'Q3-8', 'Q3-9',
                                     'Q3-10', 'Q3-11', 'Q3-12', 'Q3-13',
                                     'Q3-14', 'Q3-15', 'Q3-16', 'Q3-17',
                                     'Q3-18'
                                 ]],
                        method='pearson')
corr.sort_values(by=['p-unc'])[['X', 'Y', 'n', 'r', 'p-unc']].head()
# shows that Signal, IMO and Telegram are associated with users who choose based on security

# In[ ]:

# It appears the largest correlations between tool usage and why it was selected are for security, features and work/school
corr = pg.pairwise_corr(df,
                        columns=[[
                            'Q7-0', 'Q7-1', 'Q7-2', 'Q7-3', 'Q7-4', 'Q7-5',
                            'Q7-6', 'Q7-7', 'Q7-8', 'Q7-9', 'Q7-10'
                        ],
Example #14
#Plot histogram of data:
data.hist(bins=50, figsize=(18,14))
#Plot scatter matrix of data:
scatter_matrix(data, figsize=(18,14))
#Find correlation matrix of data:
corr_matrix = data.corr()
#Print out correlations of Output w.r.t. others:
print(corr_matrix[data.columns[0]].sort_values(ascending=False))


# In[20]:


import pingouin as pg
pg.pairwise_corr(data).sort_values(by=['p-unc'])[['X', 'Y', 'n', 'r', 'p-unc']].head()


# In[21]:


# Partial correlation matrix of selected variables
import pingouin as pg
data[['Output (X0)', 'Tourism Score (X2)', 'Economic Score (X1)', 'SARS Score (X3)', 'Human Freedom Score']].pcorr()


# In[22]:


# Partial Correlations Matrix of variables