def build_partial_corr(corr_df, target, covar, method='pearson',
                       padjust='fdr_bh', pval=0.05, covar_name=None):
    """Builds a partial correlation DataFrame from corr_df of the target
    survey, controlling for covar.

    corr_df (pd.DataFrame): correlation frame; each row is an observation
    target (str): target column; can be a string prefix or suffix
    covar (list): a list of covariates to control for
    covar_name (str): optional name for the covariates in the display
    """
    partial_corr = pg.pairwise_corr(data=corr_df, covar=covar, method=method)
    _, p_adj = pg.multicomp(partial_corr['p-unc'].values, alpha=pval,
                            method=padjust)
    partial_corr['p-corr'] = p_adj
    # Keep significant pairs where the target appears only on the Y side.
    partial_corr = partial_corr.loc[(partial_corr['p-corr'] < pval)
                                    & (~partial_corr['X'].str.contains(target))
                                    & (partial_corr['Y'].str.contains(target))]
    partial_corr['r_ctl'] = partial_corr['r']
    partial_corr['p_ctl'] = partial_corr['p-corr']
    # Only select the 'covar' column when it was actually created, otherwise
    # the unconditional selection raises a KeyError.
    if covar_name is not None:
        partial_corr['covar'] = covar_name
        partial_corr = partial_corr[['X', 'Y', 'covar', 'r_ctl', 'p_ctl']]
    else:
        partial_corr = partial_corr[['X', 'Y', 'r_ctl', 'p_ctl']]
    # Drop the controlling covars for the raw (uncontrolled) pairwise
    # correlation.
    pairwise_corr = pg.pairwise_corr(data=corr_df.drop(covar, axis='columns'),
                                     method=method, padjust=padjust)
    pairwise_corr['r_unctl'] = pairwise_corr['r']
    pairwise_corr['p_unctl'] = pairwise_corr['p-corr']
    partial_corr = partial_corr.merge(
        pairwise_corr[['X', 'Y', 'r_unctl', 'p_unctl', 'n']],
        on=['X', 'Y'], how='left').sort_values('p_ctl')
    return partial_corr.style.set_caption(method)
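# --- Usage sketch (not from the original source) -----------------------------
# A hypothetical call to build_partial_corr; the toy frame, the 'phq' target
# prefix, and the 'age' covariate are illustrative assumptions only.
import numpy as np
import pandas as pd
import pingouin as pg

rng = np.random.default_rng(0)
n = 200
age = rng.normal(size=n)
sleep = rng.normal(size=n)
toy = pd.DataFrame({
    'sleep': sleep,
    'steps': rng.normal(size=n),
    'age': age,
    # Tie the target to sleep and age so at least one pair passes the filter.
    'phq_total': 0.6 * sleep + 0.3 * age + rng.normal(scale=0.5, size=n),
})
styled = build_partial_corr(toy, target='phq', covar=['age'], covar_name='age')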
def display_all_corr(corr_df, cols, target, alpha=0.1, method="pearson"):
    """Displays all correlations for every feature in cols against target.
    Sorts by feature group and stars the significant p-values.

    Args:
        corr_df (pd.DataFrame): pandas correlation frame
        cols (list): list of feature columns
        target (str): the target survey column name
        alpha (float): the specified significance level
        method (str): either "pearson" or "spearman" correlations

    Returns:
        pd.DataFrame: styled correlation matrix
    """
    p_val_format = "{:0.4f}"
    style_p_val = lambda x: (p_val_format.format(x) + "*" if x < alpha
                             else p_val_format.format(x))
    pair_corr_df = pg.pairwise_corr(corr_df, columns=[cols, [target]],
                                    method=method, padjust="fdr_bh")
    pair_corr_df['p-corr'] = pair_corr_df['p-corr'].apply(style_p_val)
    pair_corr_df['p-unc'] = pair_corr_df['p-unc'].apply(style_p_val)
    title = method + ", alpha < {}".format(alpha)
    display(pair_corr_df[['X', 'Y', 'n', 'r', 'p-unc', 'p-corr',
                          'p-adjust']].style.set_caption(title))
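# --- Usage sketch (hypothetical, not from the original source) ----------------
# display() comes from IPython, so this helper is notebook-oriented. The frame
# and column names below are illustrative assumptions.
# from IPython.display import display
# display_all_corr(corr_df=survey_df, cols=['sleep', 'steps'],
#                  target='phq_total', alpha=0.05, method='spearman')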
def build_corr_table(df, x, cols, target, title, method='pearson',
                     groups=['all', 'no_symp', 'soc_anx', 'gen_anx',
                             'dep_anx']):
    """Builds a concise correlation table for the given feature target
    across all subgroups."""
    tables = []
    for group in groups:
        if group == 'all':
            sel_df = df
        else:
            sel_df = df[df['baseline_cluster'] == group]
        corr_df = pg.pairwise_corr(sel_df, columns=[cols, [target]],
                                   nan_policy='pairwise', padjust="fdr_bh",
                                   method=method)
        corr_df['group'] = group
        corr_df = corr_df[corr_df['X'] == x]
        tables.append(corr_df[['group', 'n', 'r', 'p-corr', 'p-adjust']])
    # DataFrame.append was removed in pandas 2.0; concatenate the per-group
    # frames instead.
    table = pd.concat(tables).reset_index(drop=True)
    return table.style.set_caption(title)
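# --- Usage sketch (hypothetical, not from the original source) ----------------
# build_corr_table expects a 'baseline_cluster' column holding each row's
# subgroup label; the frame and column names below are illustrative.
# styled = build_corr_table(clinical_df, x='sleep', cols=['sleep', 'steps'],
#                           target='phq_total',
#                           title='sleep vs. phq_total by subgroup')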
def run_pheno_partialcorrs(df_phenos, df_z, method='pearson'):
    df_input = pd.concat((df_phenos, df_z), axis=1)
    if method == 'pearson':
        df_out = pd.DataFrame(columns=['pheno', 'variable', 'coef', 'p',
                                       'BF10'])
    else:
        df_out = pd.DataFrame(columns=['pheno', 'variable', 'coef', 'p'])
    phenos = list(df_phenos.columns)
    for pheno in phenos:
        print(pheno)
        if method == 'pearson':
            df_tmp = pd.DataFrame(index=df_z.columns,
                                  columns=['coef', 'p', 'BF10'])
        else:
            df_tmp = pd.DataFrame(index=df_z.columns, columns=['coef', 'p'])
        # Control for every other phenotype when correlating this one.
        phenos_cov = phenos.copy()
        phenos_cov.remove(pheno)
        results = pg.pairwise_corr(data=df_input,
                                   columns=[[pheno], list(df_z.columns)],
                                   covar=phenos_cov, method=method)
        results.set_index('Y', inplace=True)
        df_tmp.loc[:, 'coef'] = results['r']
        df_tmp.loc[:, 'p'] = results['p-unc']
        if method == 'pearson':
            df_tmp.loc[:, 'BF10'] = results['BF10'].astype(float)
        # Append this phenotype's results to the output frame.
        df_tmp.reset_index(inplace=True)
        df_tmp.rename(index=str, columns={'index': 'variable'}, inplace=True)
        df_tmp['pheno'] = pheno
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
        df_out = pd.concat([df_out, df_tmp], sort=False)
    df_out.set_index(['pheno', 'variable'], inplace=True)
    return df_out
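# --- Usage sketch (hypothetical, not from the original source) ----------------
# df_phenos holds one column per phenotype and df_z one column per z-scored
# feature, sharing a row index; each phenotype is correlated with every
# feature while controlling for the remaining phenotypes. Both frame names
# are assumptions.
# df_partial = run_pheno_partialcorrs(df_phenos, df_z, method='pearson')
# df_partial.loc['my_pheno'].sort_values('p').head()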
def test_pandas(self):
    """Test pandas method."""
    # Test the ANOVA (Pandas)
    aov = df.anova(dv='Scores', between='Group', detailed=True)
    assert aov.equals(
        pg.anova(dv='Scores', between='Group', detailed=True, data=df))
    aov3_ss1 = df_aov3.anova(dv='Cholesterol', between=['Sex', 'Drug'],
                             ss_type=1)
    aov3_ss2 = df_aov3.anova(dv='Cholesterol', between=['Sex', 'Drug'],
                             ss_type=2)
    aov3_ss2_pg = pg.anova(dv='Cholesterol', between=['Sex', 'Drug'],
                           data=df_aov3, ss_type=2)
    assert not aov3_ss1.equals(aov3_ss2)
    assert aov3_ss2.round(3).equals(aov3_ss2_pg.round(3))

    # Test the Welch ANOVA (Pandas)
    aov = df.welch_anova(dv='Scores', between='Group')
    assert aov.equals(pg.welch_anova(dv='Scores', between='Group', data=df))

    # Test the ANCOVA
    aov = df_anc.ancova(dv='Scores', covar='Income',
                        between='Method').round(3)
    assert aov.equals(
        pg.ancova(data=df_anc, dv='Scores', covar='Income',
                  between='Method').round(3))

    # Test the repeated measures ANOVA (Pandas)
    aov = df.rm_anova(dv='Scores', within='Time', subject='Subject',
                      detailed=True)
    assert aov.equals(
        pg.rm_anova(dv='Scores', within='Time', subject='Subject',
                    detailed=True, data=df))

    # FDR-corrected post hocs with Hedges' g effect size
    ttests = df.pairwise_tests(dv='Scores', within='Time', subject='Subject',
                               padjust='fdr_bh', effsize='hedges')
    assert ttests.equals(
        pg.pairwise_tests(dv='Scores', within='Time', subject='Subject',
                          padjust='fdr_bh', effsize='hedges', data=df))

    # Pairwise Tukey
    tukey = df.pairwise_tukey(dv='Scores', between='Group')
    assert tukey.equals(
        pg.pairwise_tukey(data=df, dv='Scores', between='Group'))

    # Test two-way mixed ANOVA
    aov = df.mixed_anova(dv='Scores', between='Group', within='Time',
                         subject='Subject', correction=False)
    assert aov.equals(
        pg.mixed_anova(dv='Scores', between='Group', within='Time',
                       subject='Subject', correction=False, data=df))

    # Test pairwise correlations
    corrs = data.pairwise_corr(columns=['X', 'M', 'Y'], method='spearman')
    corrs2 = pg.pairwise_corr(data=data, columns=['X', 'M', 'Y'],
                              method='spearman')
    assert corrs['r'].equals(corrs2['r'])

    # Test partial correlation
    corrs = data.partial_corr(x='X', y='Y', covar='M', method='spearman')
    corrs2 = pg.partial_corr(x='X', y='Y', covar='M', method='spearman',
                             data=data)
    assert corrs['r'].equals(corrs2['r'])

    # Test partial correlation matrix (compare with the ppcor package)
    corrs = data.iloc[:, :5].pcorr().round(3)
    np.testing.assert_array_equal(corrs.iloc[0, :].to_numpy(),
                                  [1, 0.392, 0.06, -0.014, -0.149])

    # Now compare against Pingouin's own partial_corr function
    corrs = data[['X', 'Y', 'M']].pcorr()
    corrs2 = data.partial_corr(x='X', y='Y', covar='M')
    assert np.isclose(corrs.at['X', 'Y'], corrs2.at['pearson', 'r'])

    # Test rcorr (correlation matrix with p-values)
    # We compare against Pingouin's pairwise_corr function
    corrs = df_corr.rcorr(padjust='holm', decimals=4)
    corrs2 = df_corr.pairwise_corr(padjust='holm').round(4)
    assert corrs.at['Neuroticism', 'Agreeableness'] == '*'
    assert corrs.at['Agreeableness', 'Neuroticism'] == str(corrs2.at[2, 'r'])
    corrs = df_corr.rcorr(padjust='holm', stars=False, decimals=4)
    assert (corrs.at['Neuroticism', 'Agreeableness']
            == str(corrs2.at[2, 'p-corr'].round(4)))
    corrs = df_corr.rcorr(upper='n', decimals=5)
    corrs2 = df_corr.pairwise_corr().round(5)
    assert corrs.at['Extraversion', 'Openness'] == corrs2.at[4, 'n']
    assert corrs.at['Openness', 'Extraversion'] == str(corrs2.at[4, 'r'])
    # Method = spearman does not work with Python 3.5 on Travis?
    # Instead it seems to return the Pearson correlation!
    df_corr.rcorr(method='spearman')
    df_corr.rcorr()

    # Test mediation analysis
    med = data.mediation_analysis(x='X', m='M', y='Y', seed=42, n_boot=500)
    np.testing.assert_array_equal(med.loc[:, 'coef'].round(4).to_numpy(),
                                  [0.5610, 0.6542, 0.3961, 0.0396, 0.3565])
else:
    print('Not enough evidence to reject the null hypothesis.\n'
          'Failed to reject the null hypothesis.')

"""$p < 0.005$

Since our p-value was far less than the significance level, we can reliably
reject the null hypothesis in favour of the alternative. Additionally, the
t-statistic is negative, which implies that the mean number of international
students in UK universities is greater than in US universities.

We reject $H_0 : \mu_x = \mu_y$, where

$x$: set of UK international_students

$y$: set of US international_students

### Correlation
"""

# Table of pairwise correlations between the numerical features
corr_data = pg.pairwise_corr(data, columns=numerical_vars)
corr_data

"""**Analyze the correlation between `teaching` and `total_score`**"""

# Scatter plot of teaching against total_score for the 2011 top 100
top2011 = data.loc[data['year'] == 2011].head(100)
x = top2011['total_score']
y = top2011['teaching']
plt.figure(figsize=(5, 5))
ax = sns.scatterplot(x=x, y=y)

"""Hence, we can conclude that universities with a greater overall score have
better teaching scores. The relationship is roughly linear with a positive
slope.
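# --- Sketch (assumption, not from the original notebook) ----------------------
# The hypothesis test described above can be reproduced with pg.ttest, which
# applies Welch's correction when correction=True. The toy samples below are
# illustrative stand-ins for the US/UK international_students groups.
import numpy as np
import pingouin as pg

rng = np.random.default_rng(1)
uk_students = rng.normal(loc=4000, scale=800, size=60)   # toy data
us_students = rng.normal(loc=3500, scale=900, size=80)   # toy data
res = pg.ttest(us_students, uk_students, correction=True)
print(res[['T', 'dof', 'p-val', 'cohen-d']])  # negative T: US mean < UK mean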
usecols="B:Q") io_2050rcp8p5 = pd.read_excel('genie_outpout.xlsx', sheet_name='io_STATS_foram_only', usecols="T:AI") io_2100rcp8p5 = pd.read_excel('genie_outpout.xlsx', sheet_name='io_STATS_foram_only', usecols="AL:BA") io_2100rcp6 = pd.read_excel('\genie_outpout.xlsx', sheet_name='io_STATS_foram_only', usecols="BD:BS") ########################################################################################################################### ## spearman pairwise_corr, foram vs all #subpolar corr_subpolar_present = pd.DataFrame( pg.pairwise_corr(subpolar_present, columns=['foram'], method='spearman')) corr_subpolar_2050rcp8p5 = pd.DataFrame( pg.pairwise_corr(subpolar_2050rcp8p5, columns=['foram.1'], method='spearman')) corr_subpolar_2100rcp8p5 = pd.DataFrame( pg.pairwise_corr(subpolar_2100rcp8p5, columns=['foram.2'], method='spearman')) #corr_subpolar_2100rcp6 = pd.DataFrame(pg.pairwise_corr(subpolar_2100rcp6, columns=['foram.3'], method='spearman')) #temperate corr_temp_present = pd.DataFrame( pg.pairwise_corr(temp_present, columns=['foram'], method='spearman')) corr_temp_2050rcp8p5 = pd.DataFrame( pg.pairwise_corr(temp_2050rcp8p5, columns=['foram.1'], method='spearman')) corr_temp_2100rcp8p5 = pd.DataFrame(
x_pos = expand.slider("X position for the label", 0.0, max_x, start_x,
                      (max_x / 100 + 0.1))
y_pos = expand.slider("Y position for the label", 0.0, max_y, start_y,
                      (max_y / 100 + 0.1))
sns.set(style='white', font_scale=font_scale)

st.success("Correlation results")
corr_result = pg.corr(x=df[x_var1], y=df[x_var2], method=method_selected)
st.write(corr_result)

st.success("Correlation matrices")
st.write(pg.pairwise_corr(df, padjust='bonf',
                          method=method_selected).sort_values(by=['p-unc']))
st.write(df.rcorr(padjust='bonf'))

st.success("Correlation plot with distributions is being generated")
fig = plt.figure(figsize=(12, 6))
g = sns.JointGrid(data=df, x=x_var1, y=x_var2, height=6)
g = g.plot_joint(sns.regplot, color="xkcd:muted blue")
# sns.distplot was removed in seaborn 0.14; histplot is its replacement.
g = g.plot_marginals(sns.histplot, bins=12, color="xkcd:bluey grey")
if plot_r:
    g.ax_joint.text(
        x_pos, y_pos,
import pandas as pd
import numpy as np
import pingouin as pg

io = r'Lake_v2.xls'
data = pd.read_excel(io, sheet_name=2, usecols=[2, 3, 4])
data.head()
print(len(data))
for i in range(len(data)):
    print(data.loc[i])

# Run every correlation method and stack the results into one frame.
corr = pg.pairwise_corr(data, method='pearson')
spearman_corr = pg.pairwise_corr(data, method='spearman')
kendall_corr = pg.pairwise_corr(data, method='kendall')
bicor_corr = pg.pairwise_corr(data, method='bicor')
skipped_corr = pg.pairwise_corr(data, method='skipped')
# DataFrame.append was removed in pandas 2.0; use pd.concat instead.
corr = pd.concat([corr, spearman_corr, kendall_corr, bicor_corr,
                  skipped_corr])
corr.to_excel('correlation.xls')
def test_pandas(self):
    """Test pandas method."""
    # Test the ANOVA (Pandas)
    aov = df.anova(dv='Scores', between='Group', detailed=True)
    assert aov.equals(
        pg.anova(dv='Scores', between='Group', detailed=True, data=df))

    # Test the Welch ANOVA (Pandas)
    aov = df.welch_anova(dv='Scores', between='Group')
    assert aov.equals(pg.welch_anova(dv='Scores', between='Group', data=df))

    # Test the repeated measures ANOVA (Pandas)
    aov = df.rm_anova(dv='Scores', within='Time', subject='Subject',
                      detailed=True)
    assert aov.equals(
        pg.rm_anova(dv='Scores', within='Time', subject='Subject',
                    detailed=True, data=df))

    # FDR-corrected post hocs with Hedges' g effect size
    ttests = df.pairwise_ttests(dv='Scores', within='Time',
                                subject='Subject', padjust='fdr_bh',
                                effsize='hedges')
    assert ttests.equals(
        pg.pairwise_ttests(dv='Scores', within='Time', subject='Subject',
                           padjust='fdr_bh', effsize='hedges', data=df))

    # Test two-way mixed ANOVA
    aov = df.mixed_anova(dv='Scores', between='Group', within='Time',
                         subject='Subject', correction=False)
    assert aov.equals(
        pg.mixed_anova(dv='Scores', between='Group', within='Time',
                       subject='Subject', correction=False, data=df))

    # Test pairwise correlations
    corrs = data.pairwise_corr(columns=['X', 'M', 'Y'], method='spearman')
    corrs2 = pg.pairwise_corr(data=data, columns=['X', 'M', 'Y'],
                              method='spearman')
    assert corrs['r'].equals(corrs2['r'])

    # Test partial correlation
    corrs = data.partial_corr(x='X', y='Y', covar='M', method='spearman')
    corrs2 = pg.partial_corr(x='X', y='Y', covar='M', method='spearman',
                             data=data)
    assert corrs['r'].equals(corrs2['r'])

    # Test partial correlation matrix (compare with the ppcor package)
    corrs = data.pcorr().round(3)
    np.testing.assert_array_equal(corrs.iloc[0, :].values,
                                  [1, 0.392, 0.06, -0.014, -0.149])

    # Now compare against Pingouin's own partial_corr function
    corrs = data[['X', 'Y', 'M']].pcorr()
    corrs2 = data.partial_corr(x='X', y='Y', covar='M')
    assert round(corrs.loc['X', 'Y'], 3) == corrs2.loc['pearson', 'r']

    # Test mediation analysis
    med = data.mediation_analysis(x='X', m='M', y='Y', seed=42, n_boot=500)
    np.testing.assert_array_equal(med.loc[:, 'coef'].values,
                                  [0.5610, 0.6542, 0.3961, 0.0396, 0.3565])
columns = list(df)
features = []
for column in df:
    # epiclist holds the values of one feature across all iterations.
    epiclist = df[column]
    features.append(epiclist)

# Correlation matrix used to generate a heatmap for finding strongly
# correlated values.
plt.figure(figsize=(12, 10))
cor = df.corr(method='pearson')
print("checkpoint")
print(pg.pairwise_corr(df).sort_values(by=['p-unc'])[['X', 'Y', 'n', 'r',
                                                      'p-unc']])
print("checkpoint 2")
print(cor)

pingouin_raw_corr = pg.pairwise_corr(df).sort_values(by=['p-unc'])[[
    'X', 'Y', 'n', 'r', 'p-unc']]
pingouin_corr = pd.DataFrame()
for x in df.columns:
    for y in df.columns:
        df[x] = np.nan_to_num(df[x])
        df[y] = np.nan_to_num(df[y])
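# --- Sketch (assumption: the heatmap call itself is not shown above) ----------
# A minimal way to render the correlation matrix `cor` computed above.
import seaborn as sns
sns.heatmap(cor, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.show()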
def correlations(df, var_list, tail, correction, method, cov_list, *args):
    """
    Compute correlations for each group of variables, dropping correlations
    between variables of the same group (e.g. if we have Depression_mother,
    Depression_father and Depression_daughter, correlations between these
    variables are excluded, but only if we specified a common name for them
    ('Depression') in *args; otherwise they are included).

    Apply a correction for multiple comparisons to the uncorrected p-values
    of each group of variables, not to all the correlations in the DataFrame.
    Joins everything into a single DataFrame.

    Input:
        df: DataFrame with data.
        var_list: list of lists with variables for correlations.
        tail (str): 'one-sided' or 'two-sided'.
        correction (str): see statsmodels.stats.multitest.multipletests for
            correction options.
        method (str): 'pearson', 'spearman', etc. (see methods for
            pg.pairwise_corr).
        cov_list: list of covariates.
        *args (str): strings used to exclude intra-group correlations from
            the DataFrame and apply the correction without them. Each string
            should be a name that is common to all the variables of a group.
            Specify one string for the X column of the DataFrame:
            - 'Depression' if that is the common string (Depression_mother,
              Depression_father, etc.) and we just have that group for the
              X column
            - 'Depression|Anger' if we have two groups (Depression_mother,
              Depression_father, etc. plus Anger_mother, Anger_father, etc.)
            Specify another string with the same logic for the Y column of
            the DataFrame.

    Output:
        df: DataFrame with correlation values, dropping correlations between
            variables of the same group.
    """
    list_of_dfs = []
    df_corr = pd.DataFrame(
        pg.pairwise_corr(df, columns=var_list, covar=cov_list, tail=tail,
                         method=method, nan_policy='pairwise')).round(5)
    # Drop intra-group correlations. Reset the index because of the dropped
    # rows, to ease the later concat.
    for row_excl in args:
        # Drop rows that contain the same string in both columns X and Y.
        df_corr = df_corr[~((df_corr['X'].str.contains(row_excl))
                            & (df_corr['Y'].str.contains(row_excl)))
                          ].reset_index(drop=True)
    # Apply the correction to the p-values of the remaining (inter-group)
    # correlations.
    FDR_corr = list(
        sm.multitest.multipletests(df_corr['p-unc'], alpha=0.05,
                                   method=correction, is_sorted=False,
                                   returnsorted=False))
    # Keep the Sidak- and Bonferroni-corrected alphas in a dict to ease
    # the concat below.
    alpha_SidakBonf = dict(alphacSidak=FDR_corr[2], alphacBonf=FDR_corr[3])
    # Select the first two elements (reject flags and corrected p-values),
    # transpose and rename the columns.
    FDR_corr = pd.DataFrame(FDR_corr[0:2]).transpose().rename(
        columns={0: 'FDR', 1: 'pvals_corrected'})
    # Add the corrected p-value columns to the correlation DataFrame.
    df_corr_FDR = pd.concat([df_corr, FDR_corr], axis=1, join='outer',
                            ignore_index=False)
    # Add columns with the corrected alphas.
    df_corr_FDR['alphacSidak'] = alpha_SidakBonf['alphacSidak']
    df_corr_FDR['alphacBonf'] = alpha_SidakBonf['alphacBonf']
    # List of dfs, each df containing the correlations for one frequency band.
    list_of_dfs.append(df_corr_FDR)
    # Concat the list of dfs into a single df and reset the index, since
    # after concat each df keeps its own row index.
    df = pd.concat(list_of_dfs).reset_index(drop=True)
    return df
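# --- Usage sketch (hypothetical, not from the original module) ----------------
# Assumes a pingouin version where pairwise_corr still accepts tail=, and an
# `sm` alias exposing statsmodels' multitest module; all names below are
# illustrative stand-ins for the Depression/Anger example in the docstring.
import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
toy = pd.DataFrame(rng.normal(size=(50, 5)),
                   columns=['Depression_mother', 'Depression_father',
                            'Anger_mother', 'Anger_father', 'age'])
# Exclude Depression-Depression and Anger-Anger pairs, Holm-correct the rest.
out = correlations(toy,
                   ['Depression_mother', 'Depression_father',
                    'Anger_mother', 'Anger_father'],
                   'two-sided', 'holm', 'pearson', ['age'],
                   'Depression', 'Anger')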
# Do people who use Signal consider security when choosing
# instant messaging tools?

# Simple row-to-row correlation
pg.corr(x=df['Q3-17'], y=df['Q34-31'])

# In[ ]:

pg.corr(x=df['Q40-0'], y=df['Q3-16'])

# In[ ]:

corr = pg.pairwise_corr(df, columns=[['Q7-7'], [
    'Q3-0', 'Q3-1', 'Q3-2', 'Q3-3', 'Q3-4', 'Q3-5', 'Q3-6', 'Q3-7', 'Q3-8',
    'Q3-9', 'Q3-10', 'Q3-11', 'Q3-12', 'Q3-13', 'Q3-14', 'Q3-15', 'Q3-16',
    'Q3-17', 'Q3-18'
]], method='pearson')
corr.sort_values(by=['p-unc'])[['X', 'Y', 'n', 'r', 'p-unc']].head()
# Shows that Signal, IMO and Telegram are associated with users who choose
# based on security

# In[ ]:

# The largest correlations between tool usage and the reason it was selected
# are for security, features and work/school
corr = pg.pairwise_corr(df, columns=[[
    'Q7-0', 'Q7-1', 'Q7-2', 'Q7-3', 'Q7-4', 'Q7-5', 'Q7-6', 'Q7-7', 'Q7-8',
    'Q7-9', 'Q7-10'
],
# Plot histogram of data:
data.hist(bins=50, figsize=(18, 14))

# Plot scatter matrix of data:
from pandas.plotting import scatter_matrix
scatter_matrix(data, figsize=(18, 14))

# Find correlation matrix of data:
corr_matrix = data.corr()

# Print out correlations of Output w.r.t. the other variables:
print(corr_matrix[data.columns[0]].sort_values(ascending=False))

# In[20]:

import pingouin as pg
pg.pairwise_corr(data).sort_values(by=['p-unc'])[['X', 'Y', 'n', 'r',
                                                  'p-unc']].head()

# In[21]:

# Pairwise correlation
import pingouin as pg
from pingouin import pairwise_corr, read_dataset
data[['Output (X0)', 'Tourism Score (X2)', 'Economic Score (X1)',
      'SARS Score (X3)', 'Human Freedom Score']].pcorr()

# In[22]:

# Partial Correlations Matrix of variables
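# --- Sketch (assumption: the original cell is cut off above) ------------------
# The heading suggests a partial correlation matrix of the same variables;
# pcorr() is pingouin's pandas accessor, as used in the previous cell.
pcorr_matrix = data[['Output (X0)', 'Tourism Score (X2)',
                     'Economic Score (X1)', 'SARS Score (X3)',
                     'Human Freedom Score']].pcorr().round(3)
print(pcorr_matrix)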