Example No. 1
def _varAnalysis(df, labels):
    """
    Per-feature significance tests between the groups defined by ``labels``.

    With two groups a t-test is run; with more than two groups an ANOVA is
    run. Levene's test decides whether the equal-variance (pooled) or the
    unequal-variance (Welch) version of each test is used.
    """
    import numpy as np
    import pandas as pd
    from scipy.stats import levene
    if df.shape[0] != len(labels):
        raise ValueError(
            "The number of input samples is not equal to labels size")

    label_ = np.unique(labels)
    groups = _split(df, labels)  # helper defined elsewhere: one array of rows per label
    if len(label_) == 2:
        print('Performing t-test analysis...')
        from scipy.stats import ttest_ind
        F, P = [], []
        for i in range(df.shape[1]):
            sample = [item[:, i] for item in groups]
            stat, p = levene(*sample)
            if p < 0.05:
                f, p = ttest_ind(*sample, equal_var=False)
            else:
                f, p = ttest_ind(*sample, equal_var=True)
            F.append(f)
            P.append(p)

    elif len(label_) > 2:
        print('Performing anova analysis...')
        F, P = [], []
        for i in range(df.shape[1]):
            sample = [item[:, i] for item in groups]
            stat, p = levene(*sample)
            if p < 0.05:
                from pingouin import welch_anova
                meta = pd.DataFrame(df.iloc[:, i])
                meta.columns = ['feature']
                meta['labels'] = labels
                result = welch_anova(data=meta, dv='feature', between='labels')
                f = result['F'].values[0]
                p = result['p-unc'].values[0]
            else:
                from scipy.stats import f_oneway
                f, p = f_oneway(*sample)
            F.append(f)
            P.append(p)
    else:
        raise ValueError("Groups for comparison are less than 2!")
    F = pd.DataFrame(F)
    P = pd.DataFrame(P)
    F.index = df.columns
    P.index = df.columns
    return F, P
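
# A minimal usage sketch (not part of the original snippet). `_split` is not shown
# above, so the sketch assumes it returns one 2-D array of rows per unique label;
# the demo DataFrame and group labels below are made up for illustration only.
import numpy as np
import pandas as pd

def _split(df, labels):
    # assumed behaviour: group the rows of df by label
    labels = np.asarray(labels)
    return [df.values[labels == lab] for lab in np.unique(labels)]

rng = np.random.default_rng(0)
demo = pd.DataFrame(rng.normal(size=(30, 4)), columns=list('ABCD'))
demo_labels = ['ctrl'] * 10 + ['low'] * 10 + ['high'] * 10
F, P = _varAnalysis(demo, demo_labels)  # three groups -> ANOVA branch
print(P)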
Example No. 2
from collections import defaultdict
from typing import Any

import pandas as pd
import pingouin as pg
from scipy import stats


def corr_to_target(
        df: pd.DataFrame,
        target: Any,
        cat_features: Any,
    ) -> pd.DataFrame:
    """
    Determine the correlation of the target feature with all other features,
    using an effect size that is comparable between categorical and numerical
    features.

    Arguments:
        df:
            Data
        target:
            Target feature key in the DataFrame with respect to which
            correlations are determined
        cat_features:
            Keys of the categorical features in the DataFrame
    Returns:
        DataFrame with correlation measures
    """
    result = defaultdict(dict)
    cat_target = target in cat_features
    num_target = not cat_target
    for f in df.columns.drop(target):
        data = df[[f, target]].dropna()
        cat_feature = f in cat_features
        num_feature = not cat_feature
        result['Categorical'][f] = int(cat_feature)
        if cat_target and cat_feature:
            pass
        elif cat_target and num_feature:
            pass
        elif num_target and cat_feature:
            vc = data[f].value_counts()
            if vc.min() > 1:
                n2, pval = pg.welch_anova(data=data, dv=target, between=f).loc[0,['np2','p-unc']].values
                result['R2'][f] = n2
                result['R2_adj'][f] = r2_adjusted(n2,len(data),len(vc))
                result['pval'][f] = pval
            #mi = mutual_info_classif(data[[target]], data[f])[0]
            #result['MI'][f] = mi
        elif num_target and num_feature:
            r, pval = stats.pearsonr(data[f], data[target])
            result['R2'][f] = r**2
            result['R2_adj'][f] = r2_adjusted(r**2, len(data), 1)
            result['pval'][f] = pval
            #mi = mutual_info_regression(data[[f]], data[target])[0]
            #result['MI'][f] = mi

    return pd.DataFrame(result)
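
# The function above calls r2_adjusted(), which is not shown in this snippet.
# A minimal sketch of such a helper, assuming the standard adjusted-R^2 formula
# with n observations and k predictors:
def r2_adjusted(r2, n, k):
    # adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - k - 1)
    if n - k - 1 <= 0:
        return float('nan')  # not enough observations for the adjustment
    return 1 - (1 - r2) * (n - 1) / (n - k - 1)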
Example No. 3
    def welch_f_test(self, stacked_df, dependentvar, groupname):
        '''
        input:
        stacked_df: a DataFrame produced by table_transform_function
        dependentvar: name of the dependent (value) column to compare
        groupname: name of the grouping column to split on
        return:
        a string stating whether the Welch F test rejects the null
        hypothesis of equal group means
        '''
        # alpha is 0.05
        p_value = pingouin.welch_anova(data=stacked_df, dv=dependentvar,
                                       between=groupname)['p-unc'][0]
        if p_value > 0.05:
            return 'fail to reject Null Hypothesis'
        else:
            return 'reject Null Hypothesis'
Example No. 4
epms1_clean, open_out = detec_outlier(df=epms1_clean,
                                      var_name='Time in Zone (%) - Open Arms',
                                      var_group='Subject Group')
open_out
epm_s1[epm_s1['Entries in Zone - Center'].between(0, 5)]
# Stats Session 1
# Entries is normally distributed; use a standard ANOVA
entries_anova = pg.anova(data=epms1_clean,
                         dv='Entries in Zone - Center',
                         between='Subject Group')
entries_anova.to_csv(
    '/Users/labc02/Documents/PDCB_data/Behavior/EPM/Stats/cross_center_anova.csv'
)
# Total Distance is not normal and fails Levene's test; use a Welch ANOVA
dist_Welch = pg.welch_anova(data=epms1_clean,
                            dv='Total Distance',
                            between='Subject Group')
dist_Welch
tdist_ph = pg.pairwise_gameshowell(data=epms1_clean,
                                   dv='Total Distance',
                                   between='Subject Group')
dist_Welch.to_csv(
    '/Users/labc02/Documents/PDCB_data/Behavior/EPM/Stats/total_dist_Welch.csv'
)
tdist_ph.to_csv(
    '/Users/labc02/Documents/PDCB_data/Behavior/EPM/Stats/total_dist_ph.csv')
pg.homoscedasticity(data=epms1_clean,
                    dv='Time in Zone (%) - Open Arms',
                    group='Subject Group')

# Open Arms ANOVA
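# The listing stops at the comment above. A hedged sketch of how it might continue,
# following the same pattern used for Total Distance: pg.anova() if the
# homoscedasticity check above passes, pg.welch_anova() otherwise (the variable
# name open_aov is made up here):
open_aov = pg.anova(data=epms1_clean,
                    dv='Time in Zone (%) - Open Arms',
                    between='Subject Group')
# open_aov = pg.welch_anova(data=epms1_clean,
#                           dv='Time in Zone (%) - Open Arms',
#                           between='Subject Group')
open_aov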
Example No. 5
    def test_pandas(self):
        """Test pandas method.
        """
        # Test the ANOVA (Pandas)
        aov = df.anova(dv='Scores', between='Group', detailed=True)
        assert aov.equals(
            pg.anova(dv='Scores', between='Group', detailed=True, data=df))
        aov3_ss1 = df_aov3.anova(dv='Cholesterol',
                                 between=['Sex', 'Drug'],
                                 ss_type=1)
        aov3_ss2 = df_aov3.anova(dv='Cholesterol',
                                 between=['Sex', 'Drug'],
                                 ss_type=2)
        aov3_ss2_pg = pg.anova(dv='Cholesterol',
                               between=['Sex', 'Drug'],
                               data=df_aov3,
                               ss_type=2)
        assert not aov3_ss1.equals(aov3_ss2)
        assert aov3_ss2.round(3).equals(aov3_ss2_pg.round(3))

        # Test the Welch ANOVA (Pandas)
        aov = df.welch_anova(dv='Scores', between='Group')
        assert aov.equals(pg.welch_anova(dv='Scores', between='Group',
                                         data=df))

        # Test the ANCOVA
        aov = df_anc.ancova(dv='Scores', covar='Income',
                            between='Method').round(3)
        assert (aov.equals(
            pg.ancova(data=df_anc,
                      dv='Scores',
                      covar='Income',
                      between='Method').round(3)))

        # Test the repeated measures ANOVA (Pandas)
        aov = df.rm_anova(dv='Scores',
                          within='Time',
                          subject='Subject',
                          detailed=True)
        assert (aov.equals(
            pg.rm_anova(dv='Scores',
                        within='Time',
                        subject='Subject',
                        detailed=True,
                        data=df)))

        # FDR-corrected post hocs with Hedges' g effect size
        ttests = df.pairwise_tests(dv='Scores',
                                   within='Time',
                                   subject='Subject',
                                   padjust='fdr_bh',
                                   effsize='hedges')
        assert (ttests.equals(
            pg.pairwise_tests(dv='Scores',
                              within='Time',
                              subject='Subject',
                              padjust='fdr_bh',
                              effsize='hedges',
                              data=df)))

        # Pairwise Tukey
        tukey = df.pairwise_tukey(dv='Scores', between='Group')
        assert tukey.equals(
            pg.pairwise_tukey(data=df, dv='Scores', between='Group'))

        # Test two-way mixed ANOVA
        aov = df.mixed_anova(dv='Scores',
                             between='Group',
                             within='Time',
                             subject='Subject',
                             correction=False)
        assert (aov.equals(
            pg.mixed_anova(dv='Scores',
                           between='Group',
                           within='Time',
                           subject='Subject',
                           correction=False,
                           data=df)))

        # Test pairwise correlations
        corrs = data.pairwise_corr(columns=['X', 'M', 'Y'], method='spearman')
        corrs2 = pg.pairwise_corr(data=data,
                                  columns=['X', 'M', 'Y'],
                                  method='spearman')
        assert corrs['r'].equals(corrs2['r'])

        # Test partial correlation
        corrs = data.partial_corr(x='X', y='Y', covar='M', method='spearman')
        corrs2 = pg.partial_corr(x='X',
                                 y='Y',
                                 covar='M',
                                 method='spearman',
                                 data=data)
        assert corrs['r'].equals(corrs2['r'])

        # Test partial correlation matrix (compare with the ppcor package)
        corrs = data.iloc[:, :5].pcorr().round(3)
        np.testing.assert_array_equal(corrs.iloc[0, :].to_numpy(),
                                      [1, 0.392, 0.06, -0.014, -0.149])
        # Now compare against Pingouin's own partial_corr function
        corrs = data[['X', 'Y', 'M']].pcorr()
        corrs2 = data.partial_corr(x='X', y='Y', covar='M')
        assert np.isclose(corrs.at['X', 'Y'], corrs2.at['pearson', 'r'])

        # Test rcorr (correlation matrix with p-values)
        # We compare against Pingouin pairwise_corr function
        corrs = df_corr.rcorr(padjust='holm', decimals=4)
        corrs2 = df_corr.pairwise_corr(padjust='holm').round(4)
        assert corrs.at['Neuroticism', 'Agreeableness'] == '*'
        assert (corrs.at['Agreeableness',
                         'Neuroticism'] == str(corrs2.at[2, 'r']))
        corrs = df_corr.rcorr(padjust='holm', stars=False, decimals=4)
        assert (corrs.at['Neuroticism',
                         'Agreeableness'] == str(corrs2.at[2,
                                                           'p-corr'].round(4)))
        corrs = df_corr.rcorr(upper='n', decimals=5)
        corrs2 = df_corr.pairwise_corr().round(5)
        assert corrs.at['Extraversion', 'Openness'] == corrs2.at[4, 'n']
        assert corrs.at['Openness', 'Extraversion'] == str(corrs2.at[4, 'r'])
        # Method = spearman does not work with Python 3.5 on Travis?
        # Instead it seems to return the Pearson correlation!
        df_corr.rcorr(method='spearman')
        df_corr.rcorr()

        # Test mediation analysis
        med = data.mediation_analysis(x='X', m='M', y='Y', seed=42, n_boot=500)
        np.testing.assert_array_equal(med.loc[:, 'coef'].round(4).to_numpy(),
                                      [0.5610, 0.6542, 0.3961, 0.0396, 0.3565])
Example No. 6
    def test_pandas(self):
        """Test pandas method.
        """
        # Test the ANOVA (Pandas)
        aov = df.anova(dv='Scores', between='Group', detailed=True)
        assert aov.equals(
            pg.anova(dv='Scores', between='Group', detailed=True, data=df))

        # Test the Welch ANOVA (Pandas)
        aov = df.welch_anova(dv='Scores', between='Group')
        assert aov.equals(pg.welch_anova(dv='Scores', between='Group',
                                         data=df))

        # Test the repeated measures ANOVA (Pandas)
        aov = df.rm_anova(dv='Scores',
                          within='Time',
                          subject='Subject',
                          detailed=True)
        assert aov.equals(
            pg.rm_anova(dv='Scores',
                        within='Time',
                        subject='Subject',
                        detailed=True,
                        data=df))

        # FDR-corrected post hocs with Hedges' g effect size
        ttests = df.pairwise_ttests(dv='Scores',
                                    within='Time',
                                    subject='Subject',
                                    padjust='fdr_bh',
                                    effsize='hedges')
        assert ttests.equals(
            pg.pairwise_ttests(dv='Scores',
                               within='Time',
                               subject='Subject',
                               padjust='fdr_bh',
                               effsize='hedges',
                               data=df))

        # Test two-way mixed ANOVA
        aov = df.mixed_anova(dv='Scores',
                             between='Group',
                             within='Time',
                             subject='Subject',
                             correction=False)
        assert aov.equals(
            pg.mixed_anova(dv='Scores',
                           between='Group',
                           within='Time',
                           subject='Subject',
                           correction=False,
                           data=df))

        # Test pairwise correlations
        corrs = data.pairwise_corr(columns=['X', 'M', 'Y'], method='spearman')
        corrs2 = pg.pairwise_corr(data=data,
                                  columns=['X', 'M', 'Y'],
                                  method='spearman')
        assert corrs['r'].equals(corrs2['r'])

        # Test partial correlation
        corrs = data.partial_corr(x='X', y='Y', covar='M', method='spearman')
        corrs2 = pg.partial_corr(x='X',
                                 y='Y',
                                 covar='M',
                                 method='spearman',
                                 data=data)
        assert corrs['r'].equals(corrs2['r'])

        # Test partial correlation matrix (compare with the ppcor package)
        corrs = data.pcorr().round(3)
        np.testing.assert_array_equal(corrs.iloc[0, :].values,
                                      [1, 0.392, 0.06, -0.014, -0.149])
        # Now compare against Pingouin's own partial_corr function
        corrs = data[['X', 'Y', 'M']].pcorr()
        corrs2 = data.partial_corr(x='X', y='Y', covar='M')
        assert round(corrs.loc['X', 'Y'], 3) == corrs2.loc['pearson', 'r']

        # Test mediation analysis
        med = data.mediation_analysis(x='X', m='M', y='Y', seed=42, n_boot=500)
        np.testing.assert_array_equal(med.loc[:, 'coef'].values,
                                      [0.5610, 0.6542, 0.3961, 0.0396, 0.3565])
Example No. 7
                    x='standarized_prediction',
                    data=summary_frame)
_ = plt.axhline(y=0)

# #### # This graph can be used for testing homogeneity of variance. We encountered this kind of plot previously; essentially, if it has a funnel shape then we’re in trouble. The plot we have shows points that are equally spread for the three groups, which implies that variances are similar across groups (which was also the conclusion reached by Levene’s test).
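
# #### # A sketch of the Levene check that the comment above refers to; an assumption here is that pingouin's homoscedasticity() is applied to the same df / 'libido' / 'dose' columns used in the Welch ANOVA cell below.

pg.homoscedasticity(data=df, dv='libido', group='dose', method='levene')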

# In[14]:

_ = pg.qqplot(summary_frame['standard_resid'], confidence=False)

# #### # The second plot is a Q-Q plot, which tells us something about the normality of residuals in the model. We want our residuals to be normally distributed, which means that the dots on the graph should cling to the diagonal line. Ours look like they have had a bit of an argument with the diagonal line, which suggests that we may not be able to assume normality of errors and should perhaps use a robust version of ANOVA instead.
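
# #### # A formal complement to the Q-Q plot; an assumption here is that pingouin's normality() (a Shapiro-Wilk test) is applied to the same standardized residuals plotted above.

pg.normality(summary_frame['standard_resid'])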

# In[15]:

# Do a Welch ANOVA in case homogeneity of variance is violated (our data here do not need this test)
aov = pg.welch_anova(dv='libido', between='dose', data=df)
aov

# ## Robust ANOVA (for independent samples)

# In[16]:

st.kruskal(df_dose1['libido'], df_dose2['libido'], df_dose3['libido'])

# # Planned Comparison

# #### https://www.statsmodels.org/devel/examples/notebooks/generated/contrasts.html#examples-notebooks-generated-contrasts--page-root

# In[17]:

contrast1 = [-2, 1, 1]
import pandas as pd
import researchpy as rp
import matplotlib.pyplot as plt
from pingouin import welch_anova, read_dataset

df = pd.read_csv('imdb_data_clean.csv', delimiter=';')

### BAR PLOTS OF THREE GROUPS FOR A VARIABLE

# Subset three groups
usa = df[df['productionlocation'] == 'USA']['rating']
coprod = df[df['productionlocation'] == 'Coproduction']['rating']
nonusa = df[df['productionlocation'] == 'Non-USA']['rating']

means = (usa.mean(), coprod.mean(), nonusa.mean())  # Calculating means
std = (usa.std(), coprod.std(), nonusa.std())  # Calculating standard deviations
positions = [0, 1, 2]  # Defining positions in the graph
plt.bar(positions, means, yerr=std)  # Compiling the plot
plt.xticks(positions, ['USA', 'Coproduction', 'Non-USA'],
           rotation="horizontal")  # Adding labels
plt.savefig("barmeanstd3+.pdf")  # Save figure
plt.clf()  # Clear figure

# Get descriptive table by category
print(rp.summary_cont(df['rating'].groupby(df['productionlocation'])))
pd.set_option('display.max_columns', 9999)

# Welch's ANOVA
aov = welch_anova(dv='rating', between='productionlocation', data=df)
print(aov)
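
# If the Welch ANOVA above turns out significant, a Games-Howell post hoc test
# (as used via pg.pairwise_gameshowell in Example No. 4) would show which production
# locations differ. A minimal sketch; the posthoc variable name is made up here:
from pingouin import pairwise_gameshowell

posthoc = pairwise_gameshowell(data=df, dv='rating', between='productionlocation')
print(posthoc)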