예제 #1
0
파일: stats.py 프로젝트: StongeEtienne/dipy
    def run(self, h5_files, no_disks=100, out_dir=''):
        """Workflow of linear Mixed Models.

        Applies linear Mixed Models on bundles of subjects and saves the
        results in a directory specified by ``out_dir``.

        Parameters
        ----------

        h5_files : string
            Path to the input metric files. This path may
            contain wildcards to process multiple inputs at once.

        no_disks : integer, optional
            Number of disks used for dividing bundle into disks. (Default 100)

        out_dir : string, optional
            Output directory (default input file directory)

        """

        io_it = self.get_io_iterator()

        for file_path in io_it:

            logging.info('Applying metric {0}'.format(file_path))
            # The metric name is the file name minus its last three
            # characters (i.e. a ".h5" suffix is assumed).
            file_name = os.path.basename(file_path)[:-3]
            df = pd.read_hdf(file_path)

            # Refuse to fit a model on fewer than 100 rows.
            if len(df) < 100:
                raise ValueError("Dataset for Linear Mixed Model is too small")

            all_bundles = df.bundle.unique()
            for bundle in all_bundles:
                sub_af = df[df['bundle'] == bundle]  # rows of this bundle
                # Disks without any rows keep a p-value of 0 here, which
                # becomes +inf after the -log10 transform below.
                pvalues = np.zeros(no_disks)

                # run mixed linear model for every disk
                for i in range(no_disks):

                    # Disk labels in the data are 1-based.
                    sub = sub_af[sub_af['disk#'] == (i+1)]

                    if len(sub) > 0:
                        criteria = file_name + " ~ group"
                        md = smf.mixedlm(criteria, sub, groups=sub["subject"])

                        mdf = md.fit()

                        # Second entry of the p-value Series, taken by
                        # position: ``.iloc`` is explicit, whereas a bare
                        # integer key on a label-indexed Series is deprecated.
                        pvalues[i] = mdf.pvalues.iloc[1]

                x = list(range(1, len(pvalues)+1))
                y = -1*np.log10(pvalues)

                title = bundle + " on " + file_name + " Values"
                plot_file = os.path.join(out_dir, bundle + "_" +
                                         file_name + ".png")

                simple_plot(plot_file, title, x, y, "disk no",
                            "-log10(pvalues)")
예제 #2
0
    def test_formulas(self):
        """Formula-built MixedLM models must agree with array-built ones.

        Simulates 100 groups of 3 observations, fits the model through the
        array interface, and compares parameter estimates and generated names
        with the same model built from formulas (groups given as values and
        as a column name), then repeats the comparison for the default
        variance structure via ``statsmodels.formula.api.mixedlm``.
        """
        # The order of the random draws below must not change: the
        # comparisons depend on the exact simulated data.
        np.random.seed(2410)
        exog = np.random.normal(size=(300, 4))
        exog_re = np.random.normal(size=300)
        triple = [1, 1, 1]
        groups = np.kron(np.arange(100), triple)
        group_noise = np.kron(np.random.normal(size=100), triple)
        g_errors = exog_re * group_noise
        endog = exog.sum(1) + g_errors + np.random.normal(size=300)

        # Reference fit through the array interface; check generated names.
        array_model = MixedLM(endog, exog, groups, exog_re)
        assert_(array_model.data.xnames == ["x1", "x2", "x3", "x4"])
        assert_(array_model.data.exog_re_names == ["x_re1"])
        assert_(array_model.data.exog_re_names_full == ["x_re1 RE"])
        array_fit = array_model.fit()

        # Same data as a DataFrame for the formula interface.
        fixed_names = ["exog%d" % k for k in range(exog.shape[1])]
        columns = {"endog": endog}
        for k, name in enumerate(fixed_names):
            columns[name] = exog[:, k]
        columns["exog_re"] = exog_re
        df = pd.DataFrame(columns)
        fml = "endog ~ 0 + exog0 + exog1 + exog2 + exog3"
        re_fml = "0 + exog_re"

        # Formula fit, groups passed as the actual values.
        values_model = MixedLM.from_formula(
            fml, df, re_formula=re_fml, groups=groups)
        assert_(values_model.data.xnames == fixed_names)
        assert_(values_model.data.exog_re_names == ["exog_re"])
        assert_(values_model.data.exog_re_names_full == ["exog_re RE"])
        values_fit = values_model.fit()
        assert_almost_equal(array_fit.params, values_fit.params)

        # Formula fit, groups passed as a column name.
        df["groups"] = groups
        named_model = MixedLM.from_formula(
            fml, df, re_formula=re_fml, groups="groups")
        assert_(named_model.data.xnames == fixed_names)
        assert_(named_model.data.exog_re_names == ["exog_re"])
        assert_(named_model.data.exog_re_names_full == ["exog_re RE"])
        named_fit = named_model.fit(start_params=values_fit.params)
        assert_allclose(array_fit.params, named_fit.params, rtol=1e-4)

        # Default variance structure with non-formula model creation; this
        # exog_re produces a zero estimated variance parameter, so warnings
        # raised while fitting are silenced.
        exog_re = np.ones(len(endog), dtype=np.float64)
        ones_model = MixedLM(endog, exog, groups, exog_re)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            ones_fit = ones_model.fit()
        from statsmodels.formula.api import mixedlm
        default_model = mixedlm(fml, df, groups="groups")
        assert_(default_model.data.exog_re_names == ["groups"])
        assert_(default_model.data.exog_re_names_full == ["groups RE"])
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            default_fit = default_model.fit()
        assert_almost_equal(ones_fit.params, default_fit.params)
예제 #3
0
    def test_formulas(self):
        """Compare array-based and formula-based MixedLM parameter estimates.

        Covers an explicit random-effects formula and the default variance
        structure obtained through ``statsmodels.formula.api.mixedlm``.
        """
        # Keep the random draws in this exact order; the comparisons depend
        # on the simulated data.
        np.random.seed(2410)
        exog = np.random.normal(size=(300, 4))
        exog_re = np.random.normal(size=300)
        triple = [1, 1, 1]
        groups = np.kron(np.arange(100), triple)
        g_errors = exog_re * np.kron(np.random.normal(size=100), triple)
        endog = exog.sum(1) + g_errors + np.random.normal(size=300)

        # Reference fit through the array interface.
        array_fit = MixedLM(endog, exog, groups, exog_re).fit()

        # Same model expressed with formulas on a DataFrame.
        columns = {"endog": endog}
        for k in range(exog.shape[1]):
            columns["exog%d" % k] = exog[:, k]
        columns["exog_re"] = exog_re
        df = pd.DataFrame(columns)
        fml = "endog ~ 0 + exog0 + exog1 + exog2 + exog3"
        re_fml = "0 + exog_re"
        formula_fit = MixedLM.from_formula(
            fml, df, re_formula=re_fml, groups=groups).fit()
        assert_almost_equal(array_fit.params, formula_fit.params)

        # Check default variance structure, with formula.api
        exog_re = np.ones(len(endog), dtype=np.float64)
        ones_fit = MixedLM(endog, exog, groups, exog_re).fit()
        from statsmodels.formula.api import mixedlm

        default_fit = mixedlm(fml, df, groups=groups).fit()
        assert_almost_equal(ones_fit.params, default_fit.params)
def analyze_mixed_effects(data, effects, cov_name='group', data_name='data',
                          **kwargs):
    """Estimate sum-coded effect sizes of categorical factors.

    For every factor in ``effects`` a mixed linear model is fitted with that
    factor first in the formula, sum-coded against its first level, and all
    other factors sum-coded with the default reference. The estimate for the
    omitted first level is then recovered from a second fit that uses the
    second level as the reference.

    Parameters
    ----------
    data : object
        Input passed to ``_categorize_data`` together with the column names
        and ``**kwargs``; the result is indexed like a data frame
        (``data[effect].values``, ``data[cov_name]``).
    effects : iterable of str
        Names of the categorical factors. The iterable is copied and never
        modified.
    cov_name : str
        Column used as the grouping variable of the mixed model.
    data_name : str
        Name of the dependent variable in the formula.
    **kwargs
        Forwarded to ``_categorize_data``.

    Returns
    -------
    betas, pvalues, conf_int : dict of dict
        Each maps ``effect -> {level: value}``. ``conf_int`` holds column 1
        of ``model.conf_int()`` minus the estimate, i.e. the distance from
        the estimate to that bound.
    """
    betas = {}
    pvalues = {}
    conf_int = {}
    effects = list(effects)  # private copy: it is re-ordered below
    # Build data
    data = _categorize_data(data, [data_name] + effects, **kwargs)
    # Iterate over a snapshot, because ``effects`` is rotated inside the loop.
    for effect in list(effects):
        # Move the current effect to the first position
        effects.remove(effect)
        effects.insert(0, effect)

        # Take 2 different values as ref.
        targets = np.unique(data[effect].values)

        # Sum-coded terms for the remaining effects. Each term carries its
        # own leading "+", so a single-effect call yields a valid formula
        # instead of one ending in a dangling "+".
        other_terms = "".join("+ C(%s, Sum) " % name for name in effects[1:])

        model = mixedlm(
            "%s ~ C(%s, Sum('%s')) " % (data_name, effect, targets[0]) +
            other_terms, data, groups=data[cov_name]).fit()

        # Retrieve the corresponding estimates
        this_betas = {}
        betas[effect] = this_betas
        this_pvalues = {}
        pvalues[effect] = this_pvalues
        this_conf_int = {}
        conf_int[effect] = this_conf_int
        bound = model.conf_int()[1]  # computed once, not once per key
        # Match the full "C(<effect>, " prefix so that an effect whose name
        # is a prefix of another effect's name does not capture its terms.
        prefix = "C(%s, " % effect
        for k in model.params.keys():
            if k.startswith(prefix):
                # Key looks like "C(...)[S.<level>]": keep "<level>"
                ename = k.split('[')[1][2:-1]
                this_betas[ename] = model.params[k]
                this_pvalues[ename] = model.pvalues[k]
                this_conf_int[ename] = bound[k] - model.params[k]

        # Refit to get last target
        # NOTE(review): this refit uses ``ols`` while the main fit uses
        # ``mixedlm``; confirm that mixing the two estimators is intended.
        model = ols(
            "%s ~ C(%s, Sum('%s')) " % (data_name, effect, targets[1]) +
            other_terms, data).fit()
        key = "C(%s, Sum('%s'))[S.%s]" % (effect, targets[1], targets[0])
        this_betas[targets[0]] = model.params[key]
        this_pvalues[targets[0]] = model.pvalues[key]
        this_conf_int[targets[0]] = (model.conf_int()[1][key] -
                                     model.params[key])

    # A stray bare name ("stop") used to sit here and raised NameError
    # before the results could be returned.
    return betas, pvalues, conf_int
def analyze_effects(data, formula, model='ols', groups=None):
    """Fit a formula-based model to measure the effect of each variable.

    Uses ``ols`` or ``mixedlm`` from statsmodels.formula.api.

    Parameters
    ----------
    data : pandas data frame

    formula : str
        Formula used in the specified model to fit model to the data.
        See documentation of statsmodels.formula.api or related examples.

    model : str, {'ols', 'mixedlm'}
        Imported from statsmodels.formula.api

    groups : str
        Keyword argument passed to the mixedlm model. Required when
        ``model='mixedlm'``; ignored for ``model='ols'``.

    Returns
    -------
    model : instance of stats model whether 'ols' or 'mixedlm'
        fit() of the model on the data.
        print(model.summary()) to look at the summary of the fit() on data.
        params can be fetched as model.params
        pvalues can be fetched as model.pvalues

    Raises
    ------
    ValueError
        If ``data`` is not a pandas DataFrame, if ``model`` is not one of
        the supported names, or if ``model='mixedlm'`` is requested without
        ``groups``.
    """
    if not isinstance(data, pd.DataFrame):
        raise ValueError("Given input 'data' should be like pandas Data frame."
                         " You provided {0}".format(data))

    if model not in ['ols', 'mixedlm']:
        raise ValueError("model={0} you specified is not implemented. "
                         "Choose between 'ols' or 'mixedlm'".format(model))

    if model == 'ols':
        return ols(formula=formula, data=data).fit()

    # model == 'mixedlm': fail early with a clear message rather than
    # handing groups=None to statsmodels.
    if groups is None:
        raise ValueError("'groups' must be provided when model='mixedlm'")
    return mixedlm(formula=formula, data=data, groups=groups).fit()
예제 #6
0
    def test_formulas(self):
        """Formula and array interfaces of MixedLM must give matching fits.

        Checks three formula variants against array-based fits: groups given
        as values, groups given as a column name, and the default variance
        structure built by ``statsmodels.formula.api.mixedlm``.
        """
        # The random draws must stay in this order; the comparisons depend
        # on the exact simulated data.
        np.random.seed(2410)
        exog = np.random.normal(size=(300, 4))
        exog_re = np.random.normal(size=300)
        triple = [1, 1, 1]
        groups = np.kron(np.arange(100), triple)
        group_noise = np.kron(np.random.normal(size=100), triple)
        g_errors = exog_re * group_noise
        endog = exog.sum(1) + g_errors + np.random.normal(size=300)

        # Reference fit through the array interface.
        array_fit = MixedLM(endog, exog, groups, exog_re).fit()

        # Fit with a formula, passing groups as the actual values.
        columns = {"endog": endog}
        for k in range(exog.shape[1]):
            columns["exog%d" % k] = exog[:, k]
        columns["exog_re"] = exog_re
        df = pd.DataFrame(columns)
        fml = "endog ~ 0 + exog0 + exog1 + exog2 + exog3"
        re_fml = "0 + exog_re"
        values_fit = MixedLM.from_formula(
            fml, df, re_formula=re_fml, groups=groups).fit()
        assert_almost_equal(array_fit.params, values_fit.params)

        # Fit with a formula, passing groups as the variable name.
        df["groups"] = groups
        named_model = MixedLM.from_formula(
            fml, df, re_formula=re_fml, groups="groups")
        named_fit = named_model.fit(start_params=values_fit.params)
        assert_allclose(array_fit.params, named_fit.params, rtol=1e-4)

        # Check default variance structure with non-formula model
        # creation.
        exog_re = np.ones(len(endog), dtype=np.float64)
        ones_model = MixedLM(endog, exog, groups, exog_re)
        ones_fit = ones_model.fit(start_params=values_fit.params)
        from statsmodels.formula.api import mixedlm
        default_model = mixedlm(fml, df, groups="groups")
        default_fit = default_model.fit(start_params=values_fit.params)
        assert_almost_equal(ones_fit.params, default_fit.params)
예제 #7
0
        renames={
                'Session':{
                        'ofM':'naïve',
                        'ofMaF':'acute',
                        'ofMcF1':'chronic (2w)',
                        'ofMcF2':'chronic (4w)',
                        'ofMpF':'post',
                        },
                },
        )
plt.savefig('dr_activity.pdf')

import statsmodels.formula.api as smf
import numpy as np

# Mixed model of t on the Session x treatment interaction, grouped by subject.
model = smf.mixedlm(
    "t ~ Session * treatment",
    subjectdf,
    groups=subjectdf["subject"],
)
fit = model.fit()
report = fit.summary()

print(report)
print(fit.params)

# Contrast matrix for the joint F-test: drop the first and last rows of an
# identity matrix, keep the first four remaining rows, and put -1 in columns
# 6..9 of rows 0..3 respectively.
# NOTE(review): the hard-coded row count and column offsets assume a specific
# ordering of fit.params -- confirm against the printed parameters.
omnibus_tests = np.eye(len(fit.params))[1:-1][:4]
for row, col in enumerate(range(6, 10)):
    omnibus_tests[row, col] = -1
print(omnibus_tests)
anova = fit.f_test(omnibus_tests)
print(anova)
예제 #8
0
    11: "Fall",
    12: "Winter"
}

# Attach the season label, drop incomplete rows and non-positive readings,
# then work on the log scale.
AQ['season'] = AQ['month'].map(dic)
AQ = AQ.dropna()
# .copy() makes the filtered frame independent, so the column assignment
# below is not a write into a slice of the previous frame.
AQ = AQ[AQ['pm2.5'] > 0].copy()
AQ['pm25_log'] = np.log(AQ['pm2.5'])

# Trim each wind-direction subset ('cbwd') to its own pm25_log range.
AQ_cv = AQ[AQ['cbwd'] == 'cv']
AQ_cv = AQ_cv[(AQ_cv['pm25_log'] > 2.2) & (AQ_cv['pm25_log'] < 6.8)]

AQ_NE = AQ[AQ['cbwd'] == 'NE']
AQ_NE = AQ_NE[(AQ_NE['pm25_log'] > 0.5)]

AQ_NW = AQ[AQ['cbwd'] == 'NW']
AQ_NW = AQ_NW[(AQ_NW['pm25_log'] > 0.5)]

# A sort_values() call used to sit here with its result discarded; it had
# no effect on AQ_SE and has been removed.
AQ_SE = AQ[AQ['cbwd'] == 'SE']
AQ_SE = AQ_SE[(AQ_SE['pm25_log'] > 0.5) & (AQ_SE['pm25_log'] < 6.291569)]

AQ_new = pd.concat([AQ_cv, AQ_NE, AQ_NW, AQ_SE])

# Mixed model grouped by wind direction, with random effects specified by
# the "~hour+PRES" formula.
mixed = smf.mixedlm("pm25_log ~ year+month+day+hour+DEWP+TEMP+PRES+Is+Ir",
                    AQ_new,
                    groups=AQ_new["cbwd"],
                    re_formula="~hour+PRES")
mixed_fit = mixed.fit()
print(mixed_fit.summary())
예제 #9
0
def create_df3(raw_data_location3):
    """Build the per-group response-time table and run the analyses on it.

    Reads a task-switching CSV, computes mean / SD / median / first-trial
    response time for every (auto_participant_id, type, occurence) group,
    adds a running average over the first trials of each group, labels each
    row with a switch type, and then fits a mixed linear model and three OLS
    ANOVAs, runs t-tests, and saves figures.

    Parameters
    ----------
    raw_data_location3 : any
        NOTE(review): the argument is ignored. It is rebound on the first
        line of the body to a file handle opened from a hard-coded path.

    Returns
    -------
    df_behavstats : pandas.DataFrame
        The statistics table after the final ``reset_index``.

    Side effects: prints results to stdout, writes 'WithSwitchType.csv' to
    the working directory and image files under 'Figures/'.
    """
    
    # NOTE(review): overwrites the parameter with a hard-coded path; the
    # file handle opened here is never closed explicitly.
    raw_data_location3 = open(r'C:\Users\danie\Documents\SURREY\Project_1\TaskSwitchingParadigm\online_TSP\second_online_cohort\pilot2_withoccurence.csv')
    # `path` is only referenced by the commented-out CSV export at the end.
    path = (r'C:\Users\danie\Documents\SURREY\Project_1\TaskSwitchingParadigm\online_TSP\second_online_cohort')

    df = pd.read_csv(raw_data_location3, header = 0)

    # Accumulators. df_behavstats2 and df_behavstats3 are never used again;
    # df_switch_type is rebound to `df` below.
    df_behavstats = pd.DataFrame()
    df_behavstats1 = pd.DataFrame()   
    df_behavstats2 = pd.DataFrame()
    df_behavstats3 = pd.DataFrame()
    df_switch_type = pd.DataFrame()

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # LOOP WHICH CALCULATES AND CONCATS SD, MRT, MED
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    df.set_index(['auto_participant_id', 'type', 'occurence'], inplace = True)
    # Both names below are aliases of `df`, not copies.
    df_switch_type = df
    df_rt = df

    for group_i, group_v in df_rt.groupby(level=[0, 1, 2]):
        # Coerce every column to numeric (non-numeric values become NaN) and
        # drop rows that are entirely NaN.
        group_v = group_v.apply(pd.to_numeric, errors = 'coerce').dropna(how = 'all')
        mask = group_v.index.get_level_values(2)  # unused

        mrt = group_v['response_time'].mean()
        SD = group_v['response_time'].std()
        med = group_v['response_time'].median()
        # Response time of the first row of the group.
        switchtrial0 = group_v['response_time'].iloc[0]

        ## The below can be used if you want to use more than the 1st switch trial to calculate switch cost
        # switchtrial1 = group_v['response_time'].iloc[1]
        # if n > 2:
        #     switchtrial2 = group_v['response_time'].iloc[2]

        # Store the group statistics in new columns, addressed by the group
        # key.
        group_v.at[group_i, 'mean_rt'] = mrt
        group_v.at[group_i, 'SD_rt'] = SD
        group_v.at[group_i, 'median_rt'] = med
        group_v.at[group_i, 'first_switch_rt'] = switchtrial0

        group_v.reset_index(drop = False, inplace = True)
        df_behavstats1 = pd.concat([df_behavstats1, group_v], sort=False) 

    df_behavstats1.set_index(['auto_participant_id', 'type', 'occurence'], inplace = True)
    # Drop any column whose name contains 'unnamed' (case-insensitive).
    df_behavstats1.drop(df_behavstats1.columns[df_behavstats1.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)



    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # LOOP WHICH CALCULATES AND CONCATS SWITCH RT
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    for group_i, group_v in df_behavstats1.groupby(level=[0, 1, 2]):

        # n = number of rows (trials) in this group.
        n = 0
        for index, row in group_v.iterrows():
            n =  n + 1
    
        # This dictates over how many trials the RT is averaged (m), depending on how many
        # trials are in the overall group (n).
        ##
        # E.g., when the number of overall trials in the group is less than 3 (if n < 3), then
        # the number of trials to average over is 0 (m = 0), and the rows are left empty (np.nan).
        if n < 3:
            m = 0
            for index, row in group_v.iterrows():
                group_v.at[index, 'average_switch_rt'] = np.nan

        elif n >= 3 and n < 5:
            m = 2
        elif n >= 5:
            m = 3
    
        number_of_trials = 0
        overall_rt = 0
        # 'islice' limits iterrows to the first 'm' rows, where 'm' is
        # dictated above and depends on the overall number of trials, 'n', in the group.
        # Each visited row receives the running mean of response_time up to
        # and including that row.
        for index, row in islice(group_v.iterrows(), m):
            number_of_trials = number_of_trials + 1    
            overall_rt = overall_rt + row['response_time']     
            j = (overall_rt/number_of_trials)
            group_v.at[index, 'average_switch_rt'] = j
            
        # NOTE(review): with inplace=False the result is discarded, so this
        # statement has no effect on group_v.
        group_v.reset_index(drop = True, inplace = False)
        df_behavstats = pd.concat([df_behavstats, group_v], sort=True)

    # Append the original columns side by side, then drop the raw response
    # time and keep only the first row for each distinct mean_rt value.
    df_behavstats = pd.concat([df_behavstats, df_switch_type.reindex(columns=df.columns)], axis=1)
    df_behavstats = df_behavstats.drop(columns=['response_time'])
    df_behavstats.drop_duplicates(subset="mean_rt", keep='first', inplace=True)

    df_behavstats.drop(df_behavstats.columns[df_behavstats.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
 
    # When a group has fewer than 3 trials in it, the switch_rt is not calculated (m = 0).
    # If there are NaN values in any of the rows of a column, that column returns NaN as a t-test
    # value for any t-test calculations it is involved in. Therefore those rows are excluded below.
    # The printed index lists rows with a NaN in ANY column, while the filter
    # that follows only removes rows whose average_switch_rt is null.
    print("")
    print("")
    print('BELOW DISPLAYS THE GROUP(S) WHICH HAVE BEEN EXCLUDED AS THERE WERE LESS THAN')
    print('3 TRIALS IN THE GROUP, CAUSING A NaN VALUE FOR THE T-TEST CALCULATIONS:')
    print("")
    print(df_behavstats[df_behavstats.isna().any(axis=1)].index)
    df_behavstats = df_behavstats[pd.notnull(df_behavstats['average_switch_rt'])]
    print("")
    print("")

    df_behavstats.reset_index(drop=False, inplace=True)


# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# SWITCH-TYPE COLUMN
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # Remove duplicated column names (keeping the first), then re-index with
    # 'occurence' as the second level.
    df_behavstats = df_behavstats.loc[:,~df_behavstats.columns.duplicated()]
    df_behavstats.set_index(['auto_participant_id', 'occurence', 'type'], inplace = True)

    

    # One pass per (auto_participant_id, occurence) group.
    for group_index, group_value in df_behavstats.groupby(level=[0, 1]):
        group_value.reset_index(drop = False, inplace = True)
        # `row_iterator` is advanced past the first row here; the second loop
        # below continues from the second row.
        row_iterator = group_value.iterrows()
        _, previous = next(row_iterator)
        for index, row in group_value.iterrows():

            # Label the first row of the group when it is marked as changed.
            if np.logical_and(row['changed'] == 1, index == 0):
                if row['type'] == 'ts-trial-digit-span':
                    j = 'NONE-DS'
                if row['type'] == 'ts-trial-spatial-span':
                    j = 'NONE-SS'
                if row['type'] == 'ts-trial-spatial-rotation':
                    j = 'NONE-SR'
                if row['type'] == '':
                    pass
            # NOTE(review): this runs on every iteration and always writes to
            # row 0. If none of the branches above assigned `j`, it still
            # holds whatever an earlier statement left in it (e.g. the
            # running mean from the previous section).
            group_value.at[0, 'switch_type'] = j

        for index, row in row_iterator:
            j = 'none'
            if row['changed'] == 1:
                if row['type'] == 'ts-trial-digit-span' and previous['type'] == 'ts-trial-spatial-span':
                    j = 'SS-DS'
                if row['type'] == 'ts-trial-digit-span' and previous['type'] == 'ts-trial-spatial-rotation':
                    j = 'SR-DS'
                if row['type'] == 'ts-trial-spatial-span' and previous['type'] == 'ts-trial-digit-span':
                    j = 'DS-SS'
                if row['type'] == 'ts-trial-spatial-span' and previous['type'] == 'ts-trial-spatial-rotation':
                    j = 'SR-SS'
                if row['type'] == 'ts-trial-spatial-rotation' and previous['type'] == 'ts-trial-digit-span':
                    j = 'DS-SR'
                if row['type'] == 'ts-trial-spatial-rotation' and previous['type'] == 'ts-trial-spatial-span':
                    j = 'SS-SR'
                if row['type'] == '' and previous['type'] == 'ts-trial-spatial-span':
                    pass
                if row['type'] == '' and previous['type'] == 'ts-trial-spatial-rotation':
                    pass
                if row['type'] == '' and previous['type'] == 'ts-trial-digit-span':
                    pass

                # `previous` only advances on rows marked as changed.
                previous = row

            # group_value.reset_index(drop = True, inplace = True)
            group_value.at[index, 'switch_type'] = j
        
        # NOTE(review): df_behavstats is rebound (grown and filtered) while
        # the groupby created from its earlier value is still being iterated,
        # and the CSV is rewritten on every group.
        df_behavstats = pd.concat([df_behavstats, group_value], sort=True)
        df_behavstats = df_behavstats.dropna(subset=['switch_type'])
        df_behavstats.to_csv('WithSwitchType.csv')


    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # LMEM
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    # Mixed model: first_switch_rt on auto_participant_id, grouped by 'type'.
    md1 = smf.mixedlm("first_switch_rt ~ auto_participant_id ", df_behavstats, groups=df_behavstats["type"])
    mdf1 = md1.fit()
    print('*************************************************************************************')
    print('LINEAR MIXED EFFECTS MODELS')
    print(mdf1.summary())

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # ANOVAs
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    # Three type-2 ANOVAs with the same right-hand side (four categorical
    # main effects plus all six pairwise interactions); only the dependent
    # variable differs.
    model = ols(
        'first_switch_rt ~ +C(type)+C(auto_participant_id)+C(switch_type)+C(occurence)+C(occurence):C(type)+C(occurence):C(auto_participant_id)+C(occurence):C(switch_type)+C(type):C(switch_type)+C(auto_participant_id):C(switch_type)+C(type):C(auto_participant_id)',
        data=df_behavstats
        ).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    print('*************************************************************************************')
    print('ANOVA TABLE FIRST SWITCH RT')
    print(anova_table)


    model1 = ols(
        'mean_rt  ~ +C(type)+C(auto_participant_id)+C(switch_type)+C(occurence)+C(occurence):C(type)+C(occurence):C(auto_participant_id)+C(occurence):C(switch_type)+C(type):C(switch_type)+C(auto_participant_id):C(switch_type)+C(type):C(auto_participant_id)',
        data=df_behavstats
        ).fit()

    anova_table1 = sm.stats.anova_lm(model1, typ=2)
    print('*************************************************************************************')
    print('ANOVA TABLE MEAN RT')
    print(anova_table1)

    model2 = ols(
        'median_rt  ~ +C(type)+C(auto_participant_id)+C(switch_type)+C(occurence)+C(occurence):C(type)+C(occurence):C(auto_participant_id)+C(occurence):C(switch_type)+C(type):C(switch_type)+C(auto_participant_id):C(switch_type)+C(type):C(auto_participant_id)',
        data=df_behavstats
        ).fit()

    anova_table2 = sm.stats.anova_lm(model2, typ=2)
    print('*************************************************************************************')
    print('ANOVA TABLE MEDIAN SWITCH RT')
    print(anova_table2)


    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # T-TESTS
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    mean   = df_behavstats['mean_rt'] 
    SD     = df_behavstats['SD_rt']  # not used below
    median = df_behavstats['median_rt'] 

    # Check here whether mean and median differ from one another; if so, decide which to use. If not, move ahead with one or the other.
    # equal_var=False selects the unequal-variance form of the independent t-test.
    g1 = stats.ttest_ind(median, mean, equal_var = False) 
    print('*************************************************************************************')
    print('TTEST for difference between mean and median rt: All tasks, all occurences =', g1)
    
    rt1    = df_behavstats['first_switch_rt']
    rt123  = df_behavstats['average_switch_rt']

    # Paired t-test between first-trial RT and the averaged switch RT.
    f1 = stats.ttest_rel(rt1, rt123)
    print('*************************************************************************************')
    print('TTEST for difference between first and average rt: All tasks, all occurences =', f1)



    df_behavstats.set_index(['auto_participant_id', 'type', 'occurence'], inplace = True)
    
    # One paired t-test and one figure per (type, occurence) group.
    for group_i, group_v in df_behavstats.groupby(level=[1,2]):
        group_v.reset_index(drop = False, inplace = True)
        # NOTE(review): the loop body does not depend on `row`, so the same
        # values are recomputed for every row. `.loc[1]` reads the row
        # labelled 1, which presumably requires at least two rows per group
        # -- confirm.
        for index, row in group_v.iterrows():
                task = group_v['type'].loc[1]
                occurence = group_v['occurence'].loc[1]
                SRT = group_v['first_switch_rt']
                MRT = group_v['average_switch_rt']
                n = len(MRT)
                x = range(0,n,1)
                ttest = stats.ttest_rel(MRT, SRT)
        print('*************************************************************************************')
        print('TASK TYPE=', task, 'OCCURENCE =', occurence)
        print('TTEST BETWEEN FIRST AND AVERAGE RT=', ttest)
        
        # Two y-axes over a shared x-axis: average RT in red, first-switch
        # RT in blue.
        fig, axMRT = plt.subplots()
        color = 'tab:red'
        axMRT.set_xlabel('Number of trials')
        axMRT.set_ylabel('Mean RT', color=color)
        axMRT.plot(x, MRT, color=color)
        axMRT.set_ylim([0,3000])
        axMRT.tick_params(axis='y')

        axSRT = axMRT.twinx()  # instantiate a second axes that shares the same x-axis
        color= 'tab:blue'
        axSRT.set_ylabel('Switch RT', color=color)  # the x-label was already set on axMRT
        axSRT.plot(x, SRT, color=color)
        axSRT.set_ylim([0,3000])
        axSRT.tick_params(axis='y')

        t = str(task)
        o = str(occurence)
        # The output file name doubles as the text drawn on the figure.
        name = 'Figures/ScatterPlot for' + t + 'Occurence=' + o +'.jpeg'
        plt.legend(loc='upper left');
        axSRT.text(0, 10, name, bbox={'facecolor': 'wheat', 'alpha': 0.5, 'pad': 10})
        fig.tight_layout()  # otherwise the right y-label is slightly clipped
        # NOTE(review): the figure is saved but not closed in this loop.
        fig.savefig(name, dpi=400)








    df_behavstats.reset_index(drop = False, inplace = True)



    # # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # # Testing for learning effects
    # # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    # df_behavstats.set_index(['auto_participant_id', 'type', 'occurence'], inplace = True)
    
    # for group_i, group_v in df_behavstats.groupby(level=[1,2]):
    #     group_v.reset_index(drop = False, inplace = True)
    #     last_occ = []
    #     first_occ = []
    #     for index, row in group_v.iterrows():
    #             if group_v['occurence'].loc[1] == 8 :
    #                 last_occ = group_v['average_switch_rt']
    #                 print('last_occ',last_occ)
    #             elif group_v['occurence'].loc[1] == 0 :
    #                 first_occ = group_v['average_switch_rt']
    #                 print('first_occ',first_occ)
    #             else:
    #                 continue
    #             task = group_v['type'].loc[1]
    #             occurence = group_v['occurence'].loc[1]
    #             ttest = stats.ttest_rel(last_occ, first_occ)
    #     print('*************************************************************************************')
    #     print('TASK TYPE=', task, 'OCCURENCE =', occurence)
    #     print('TTEST BETWEEN FIRST LAST AVERAGE RT=', ttest)
    # df_behavstats.reset_index(drop = False, inplace = True)


    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # Plots!
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    # Box plots of mean RT and first-switch RT per task type, with either
    # the individual data points overlaid or a split by occurence.
    ax1 = sns.boxplot(x='type', y='mean_rt', data=df_behavstats)
    ax1 = sns.swarmplot(x='type', y='mean_rt', data=df_behavstats, color=".25")
    figure1 = ax1.get_figure()    
    figure1.savefig('Figures/boxplot_Mean_ShowDataPoints.png', dpi=400)
    plt.close()

    ax2 = sns.boxplot(x='type', y='mean_rt', hue='occurence', data=df_behavstats)
    figure2 = ax2.get_figure()    
    figure2.savefig('Figures/boxplot_Mean_byTaskType.png', dpi=400)
    plt.close()

    ax3 = sns.boxplot(x='type', y='first_switch_rt', data=df_behavstats)
    ax3 = sns.swarmplot(x='type', y='first_switch_rt', data=df_behavstats, color=".25")
    figure3 = ax3.get_figure()    
    figure3.savefig('Figures/boxplot_Switch_ShowDataPoints.png', dpi=400)
    plt.close()

    ax4 = sns.boxplot(x='type', y='first_switch_rt', hue='occurence', data=df_behavstats)
    figure4 = ax4.get_figure()    
    figure4.savefig('Figures/boxplot_Switch_byTaskType.png', dpi=400)
    plt.close()

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # Write ttests to a .csv file
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    # a = stats.ttest_ind(mean, rt1)
    # MEANvsAVRT = stats.ttest_ind(mean, rt123)

    # d = stats.ttest_ind(median, rt1)
    # MEDvsAVRT = stats.ttest_ind(median, rt123) 

    # standard_t_tests = [a,MEANvsAVRT,d,MEDvsAVRT]

    # a1 = stats.ttest_ind(mean, rt1, equal_var = False)
    # MEANvsAVRT1 = stats.ttest_ind(mean, rt123, equal_var = False)

    # d1 = stats.ttest_ind(median, rt1, equal_var = False)
    # MEDvsAVRT1 = stats.ttest_ind(median, rt123, equal_var = False) 

    # welchs_t_tests = [a1,MEANvsAVRT1,d1,MEDvsAVRT1]


    # t_data = {'standard':standard_t_tests, 'welchs':welchs_t_tests}
    # t_rows = ['mean_vs_rt1', 'mean_vs_rt123', 'med_vs_rt1', 'med_vs_rt123']

    # df_t_tests = pd.DataFrame(data=t_data, index=t_rows)
    # name='TTests.csv'
    # dest = os.path.join(path, name)
    # df_t_tests.to_csv(dest)


    return df_behavstats
예제 #10
0
# Drop any other NaNs
data = data.dropna()

# Let's model some stuff! Treat the different CASA versions as random mixed
# effect

# Recode the CASA version numbers to small integers. A single .loc
# assignment writes into `data` itself; the previous chained form
# (data["CASAVer"][mask] = ...) assigns through an intermediate object and is
# not guaranteed to update the frame.
casa_codes = {440: 0, 453: 1, 460: 2}
# casa_codes[470] = 3
for casa_version, casa_code in casa_codes.items():
    data.loc[data["CASAVer"] == casa_version, "CASAVer"] = casa_code

# Create a version without any diverging cleans
good_data = data[data["peak_res"] < 0.01]

# Sum
sum_model = sm.mixedlm("sum ~ Tclean*AllFields*MScale*Mask*Model", data=data,
                       groups=data["CASAVer"]).fit(reml=False)
print(sum_model.summary())

# Can't use Tclean. Makes matrix singular.
sum_model_good = sm.mixedlm("sum ~ AllFields*MScale*Mask*Model", data=good_data,
                            groups=good_data["CASAVer"]).fit(reml=False)
print(sum_model_good.summary())
# Dominated by model (duh)

# Median
median_model = \
    sm.mixedlm("median ~ Tclean*AllFields*MScale*Mask*Model", data=data,
               groups=data["CASAVer"]).fit(reml=False)
print(median_model.summary())

# Can't use Tclean. Makes matrix singular.
예제 #11
0
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 30 16:28:13 2016

@author: emg
"""

import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Load the ranked data and drop rows written by placeholder or bot accounts.
excluded_authors = [
    '[deleted]',
    '#NAME?',
    'AutoModerator',
    'AskScienceModerator',
]
df = pd.read_csv(
    '/Users/emg/Programmming/GitHub/dissertation/data_handling/ranked_data.csv',
    index_col=0,
)
df = df[~df['author'].isin(excluded_authors)]
df['count'] = 1

# Single-subreddit subset (not used by the model below).
sample = df[df['subreddit'] == 'AskAnthropology']

# Mixed model of score on rank, grouped by author, with the random-effects
# structure given by the '~rank' formula.
md = smf.mixedlm(
    'score ~ rank',
    df,
    groups=df['author'],
    re_formula='~rank',
)

mdf = md.fit()

print(mdf.summary())
예제 #12
0
    # 03 Build linear models
    if Group == 'Healthy':
        HealthySystem = LinearSystem.copy()
    elif Group == 'OI':
        OISystem = LinearSystem.copy()

## Fixed-effects formula (no intercept) shared by all four fits
_FIXED_FORMULA = "LogSxy ~ Sii + Sij + Sjj + LogBVTV + Logmxy - 1"
## Variance-component specification used by the mixed models
_VC_FORMULA = {"IF": "IF-1"}

## Standard linear models
Healthy_LM = smf.ols(_FIXED_FORMULA, data=HealthySystem).fit()
OI_LM = smf.ols(_FIXED_FORMULA, data=OISystem).fit()

## Linear mixed-effect models, grouped by scan (each call gets its own copy
## of the variance-component dict)
Healthy_LMM = smf.mixedlm(
    _FIXED_FORMULA,
    data=HealthySystem,
    groups=HealthySystem['Scan'],
    vc_formula=dict(_VC_FORMULA),
).fit(reml=True)
OI_LMM = smf.mixedlm(
    _FIXED_FORMULA,
    data=OISystem,
    groups=OISystem['Scan'],
    vc_formula=dict(_VC_FORMULA),
).fit(reml=True)

## Likelihood ratio test between each linear model and its mixed counterpart
Healthy_p = LikelihoodRatioTest(Healthy_LM, Healthy_LMM, 1)
print('p value of LRT for healthy group: ' + str(Healthy_p))
OI_p = LikelihoodRatioTest(OI_LM, OI_LMM, 1)
print('p value of LRT for OI group: ' + str(OI_p))
예제 #13
0
import numpy as np
import pandas as pd
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Fixed seed so the simulated data set is reproducible. The order of the
# random draws below (x, group effects, noise) must not change.
np.random.seed(1234)
N = 1000
nGroups = 40
nPerGroup = N // nGroups

x = np.random.normal(size=N)
# Group label for every observation: 0,0,...,1,1,...
group = np.repeat(np.arange(nGroups), nPerGroup)
ranEff = np.random.normal(scale=.5, size=nGroups)
# Column vector holding [intercept, slope].
coefs = np.array([[2], [.2]])
# Per-observation intercept = fixed intercept + that group's random effect.
randInts = coefs[0] + ranEff[group]

X = np.vstack((randInts, x)).T

y = randInts + coefs[1] * x + np.random.normal(scale=.75, size=N)
y.shape

df = pd.DataFrame({'x': x, 'group': group, 'y': y})

### Model: y on x with a random intercept per group
mod0 = smf.mixedlm(formula='y ~ x', data=df, groups=df['group'])
mod = mod0.fit()
print(mod.summary())  # intercept RE is in sd

# QQ-plot of the estimated random intercepts against a normal distribution.
reModel = mod.random_effects['Intercept'][:]
sm.qqplot(reModel, sp.stats.norm, line='45', fit=True, scale=.25)
예제 #14
0
def coefplot(formula, data, intercept=False, ci=95, min_tvalue=2,
             mixed_effect="Cube", sig_color='r', nonsig_color='k',
             add_legend=False, figsize=None):
    """Plot the coefficients from a linear (mixed) model.

    Each coefficient is drawn as a marker at its estimate with a horizontal
    bar spanning its confidence interval; marker and colour depend on
    whether the absolute t-value reaches ``min_tvalue``.

    Parameters
    ----------
    formula : string
        patsy formula for the model
    data : dataframe
        data for the model; formula terms must appear in columns
    intercept : bool, optional
        if False, strips the "Intercept" term (and, for mixed models, the
        "Intercept RE" term) before plotting
    ci : float, optional
        size of confidence intervals, in percent
    min_tvalue : float, optional
        absolute t-value at or above which a term is drawn as significant
    mixed_effect : string or None, optional
        column of ``data`` used as the grouping variable of a mixed model
        fitted by maximum likelihood; if None an OLS model is fitted
    sig_color, nonsig_color : matplotlib colors, optional
        colours for terms above / below ``min_tvalue``
    add_legend : bool, optional
        if True, add a legend explaining the two marker styles
    figsize : tuple, optional
        figure size; derived from the matplotlib defaults when None

    Raises
    ------
    ImportError
        if statsmodels is not installed

    """

    try:
        import statsmodels.formula.api as sf
    except ImportError:
        raise ImportError("The `coefplot` function requires statsmodels")

    import pandas as pd

    # Float division so that integer `ci` also works under Python 2.
    alpha = 1 - ci / 100.0
    if mixed_effect is None:
        model = sf.ols(formula, data).fit()
    else:
        model = sf.mixedlm(formula, data,
                           groups=data[mixed_effect]).fit(reml=False)
    coefs = model.params

    # Order by term order (main effects first, then interactions by the
    # number of ":" they contain). `sorted` is stable, like `list.sort`.
    ind = pd.Index(sorted(coefs.index, key=lambda name: name.count(":")))
    coefs = coefs[ind]

    cis = model.conf_int(alpha).loc[ind]
    tvals = np.abs(model.tvalues)[ind]
    model_effects = ind

    # Possibly ignore the intercept. `.loc` with a boolean mask replaces the
    # `.ix` indexer, which no longer exists in pandas >= 1.0.
    if not intercept:
        dropped = ["Intercept"]
        if mixed_effect is not None:
            dropped.append("Intercept RE")
        keep = ~model_effects.isin(dropped)
        coefs = coefs.loc[keep]
        cis = cis.loc[keep]
        tvals = tvals.loc[keep]
        model_effects = model_effects[keep]

    n_terms = len(coefs)

    rep_name = {'fc': "F", "pb": r'$\beta$', 'm': r'$\mathcal{M}$',
                'k': r'$k$', 'sf': r'$\zeta$',
                'vp': r'$\alpha$'}

    if figsize is None:
        w, h = mpl.rcParams["figure.figsize"]
        # Same arithmetic as the original size helpers, written inline.
        figsize = (1.5 * (h / 2), n_terms * (w / (4 * (n_terms / 5))))

    fig, ax = p.subplots(1, 1, figsize=figsize)
    for i, term in enumerate(coefs.index):
        if tvals[term] < min_tvalue:
            color = nonsig_color
            symbol = '^'
        else:
            color = sig_color
            symbol = 'o'

        low, high = cis.loc[term]
        ax.plot([low, high], [i, i], c=color,
                solid_capstyle="round", lw=2.5)
        ax.plot(coefs.loc[term], i, symbol, c=color, ms=8)
    ax.set_ylim(-.5, n_terms - .5)
    ax.set_xlabel("Coefficent Values")
    ax.axvline(0, ls="--", c="dimgray")
    ax.grid(True)

    # Rotate x ticks labels
    for label in ax.get_xticklabels():
        label.set_rotation(90)

    # Change to the nice parameter labels. Terms without an entry in
    # `rep_name` keep their own name instead of raising KeyError.
    altered_labels = [
        ":".join(rep_name.get(param, param) for param in mod.split(":"))
        for mod in model_effects
    ]

    p.yticks(np.arange(len(model_effects)), altered_labels, rotation=0)

    if add_legend:
        # Add in a legend with the symbols wrt min_tval
        sig_artist = p.Line2D((0, 1), (0, 0), color=sig_color, marker='o',
                              linestyle='-')
        nonsig_artist = p.Line2D((0, 1), (0, 0), color=nonsig_color,
                                 marker='^',
                                 linestyle='-')

        ax.legend([sig_artist, nonsig_artist],
                  [r"$t$-value > {0:.2f}".format(min_tvalue),
                   r"$t$-value < {0:.2f}".format(min_tvalue)], frameon=True,
                  loc='best')

    p.tight_layout()
예제 #15
0
# comparison.
#
# Here are our import statements:

# ## Growth curves of pigs
#
# These are longitudinal data from a factorial experiment. The outcome
# variable is the weight of each pig, and the only predictor variable we
# will use here is "time".  First we fit a model that expresses the mean
# weight as a linear function of time, with a random intercept for each pig.
# The model is specified using formulas. Since the random effects structure
# is not specified, the default random effects structure (a random intercept
# for each group) is automatically used.

# Fetch the "dietox" data set of the R package "geepack".
data = sm.datasets.get_rdataset(dataname='dietox', package='geepack').data
# Weight as a linear function of Time, grouped by pig; no random-effects
# formula is given, so the default random intercept per group applies.
md = smf.mixedlm(formula="Weight ~ Time", data=data, groups=data["Pig"])
mdf = md.fit()
print(mdf.summary())

# Here is the same model fit in R using LMER:

# Note that in the Statsmodels summary of results, the fixed effects and
# random effects parameter estimates are shown in a single table.  The
# random effect for animal is labeled "Intercept RE" in the Statsmodels
# output above.  In the LME4 output, this effect is the pig intercept under
# the random effects section.
#
# There has been a lot of debate about whether the standard errors for
# random effect variance and covariance parameters are useful.  In LME4,
# these standard errors are not displayed, because the authors of the
# package believe they are not very informative.  While there is good reason
### Only kelz vs lisu
# AQ_new = AQ_new[np.logical_and(np.isin(AQ_new['system1'],['kelz','lisu']),np.isin(AQ_new['system2'],['kelz','lisu']))]

### Only musicians
# goldmsi_med = np.median(AQ_new['goldmsi'])
# AQ_new = AQ_new[AQ_new['goldmsi']>goldmsi_med]

## Co-dependent variables: *
## Random variables: +(var/user_id) --> /user_id
## Also check multiple regression (simple)
## Also check multiple logistic regression

## Mixed Effects Model
# Difficulty rating modelled on the listed predictors; the grouping
# variable is given by column name.
mixed = smf.mixedlm(
    formula="difficulty ~ f_diff+f_system2+f_1_2+goldmsi+recognised+answer",
    data=AQ_new,
    groups='question_id',
)
mixed_fit = mixed.fit()
print(mixed_fit.summary())

### logistic regression
# feature_columns = ["recognised","difficulty","f_system1","f_system2","goldmsi"]
# X = AQ_new.loc[:, feature_columns].values
#
# y=AQ_new.answer
# clf = LogisticRegression().fit(X, y)
# print clf.coef_
# for c,feat in zip(clf.coef_[0],feature_columns):
#     print feat, 'coef', c

###############################################################
예제 #17
0
        renames={
                'Session':{
                        'ofM':'naïve',
                        'ofMaF':'acute',
                        'ofMcF1':'chronic (2w)',
                        'ofMcF2':'chronic (4w)',
                        'ofMpF':'post',
                        },
                },
        )
plt.savefig('drs_activity_full.pdf')

import statsmodels.formula.api as smf
import numpy as np

# Session-by-treatment mixed model with `subject` as the grouping variable.
model = smf.mixedlm(formula="t ~ Session * treatment", data=subjectdf,
                    groups=subjectdf["subject"])
fit = model.fit()
report = fit.summary()

print(report)
print(fit.params)

# Contrast matrix: start from identity rows (first and last parameter
# dropped), keep the first four rows, and put -1 in columns 6..9 so that
# row k contrasts the parameter picked by its identity row with parameter k+6.
omnibus_tests = np.eye(len(fit.params))[1:-1][:4]
omnibus_tests[np.arange(4), np.arange(6, 10)] = -1
print(omnibus_tests)
anova = fit.f_test(omnibus_tests)
print(anova)
예제 #18
0
# the dPTE label connections to look at
columns = ["Subject", "Emo", "Connection", "dPTE"]
emos = ["neg", "pos"]
conxs = ["LO-IP", "LO-IT", "LO-SM", "LO-TT", "IP-IT",
         "IP-SM", "IP-TT", "IT-SM", "IT-TT", "SM-TT"]

# separate LMMs for each connection

# setup LMM
# for Alpha band
print("Doing stats on ALPHA dPTE connections")
for conx in conxs:
    print("Calculating LMM for connection:  {}".format(conx))
    df = df_NEM_dPTE_alpha[df_NEM_dPTE_alpha.Connection == conx].infer_objects()
    # Intercept-only (null) model, fitted by maximum likelihood. Any failure
    # here skips the connection; `except Exception` (instead of a bare
    # `except`) lets Ctrl-C / SystemExit still abort the run.
    try:
        res_0 = smf.mixedlm('dPTE ~ 1', data=df, groups=df['Subject']).fit(reml=False)
        print("Null model:")
        print(res_0.params)
        print(res_0.summary())
        print("AIC: {}".format(res_0.aic))
    except Exception:
        print("Null Model could not converge...")
        continue
    # Model with emotion as the fixed effect.
    try:
        res_emo = smf.mixedlm('dPTE ~ Emo', data=df, groups=df['Subject']).fit(reml=False)
    except Exception:
        print("Emotion Model could not converge...")
        continue
    print("Emotion model:")
    print(res_emo.params)
    print(res_emo.summary())

# In[19]:

# Parenthesised single-argument print works on both Python 2 and Python 3;
# the original print statement is a SyntaxError on Python 3.
print('Dimensions with missing dependent variable removed: %d rows, %d columns' % (child_study_data_no_na.shape[0], child_study_data_no_na.shape[1]))


# In[20]:

# fitting a model without repeated measures (i.e., no random slopes)
# because I cannot entirely  figure out what Statsmodels is doing
# with the random effects structure

# also notice that Statsmodels does not fit quite as quickly as lme4

m100_lmm = smf.mixedlm('M100LatCorr ~ Hem + Cond + Case + Site + Age_Calc ',
                       data=child_study_data_no_na,
                       groups=child_study_data_no_na['Subject'])


m100_lmm_fit = m100_lmm.fit()


# And now to inspect the output:

# In[21]:

print(m100_lmm_fit.summary())


# ##Calling R from Python
# The `rpy2` package allows for calling R directly from Python. Let's use it to see if we get similar results for the coefficients and how the summary output differs. The cells below illustrate how to do this directly in Python (with or without a notebook).   
# 
            linewidth=2)

# One bar chart for all groups, bars split by condition.
Legend = sns.barplot(
    x='Group', y='Score', hue='Condition', data=MMMM, palette="colorblind")
# Place the legend outside the axes, to the right.
Legend.legend(
    loc='upper center', bbox_to_anchor=(1.45, 0.8), shadow=True, ncol=1)
plt.show()

# Each model below uses the same formula with `Group` as grouping variable;
# only the data frame and the optimiser passed to `fit` differ.

# Model for L1 Spanish
md = smf.mixedlm(formula="Score ~ Tense*Type", data=SMMM,
                 groups=SMMM["Group"])
mdf1 = md.fit(method="cg")
print(mdf1.summary())

# Model for L1 Croatian
md = smf.mixedlm(formula="Score ~ Tense*Type", data=CMMM,
                 groups=CMMM["Group"])
mdf2 = md.fit(method='nm')
print(mdf2.summary())

# Model for L1 German
md = smf.mixedlm(formula="Score ~ Tense*Type", data=GMMM,
                 groups=GMMM["Group"])
mdf3 = md.fit(method='nm')
print(mdf3.summary())

# Model for ALL (Group enters both as fixed factor and as grouping variable)
md = smf.mixedlm(formula="Score ~ Tense*Type*Group", data=MMMM,
                 groups=MMMM["Group"])
mdf = md.fit(method='cg')
print(mdf.summary())
예제 #21
0
            # generate the data
            if (root / f'data{n}_{J}.pkl').is_file():
                with open(root / f'data{n}_{J}.pkl', 'rb') as f:
                    data, xijs, yijs = pickle.load(f)
            else:
                yijs, xijs = gendata(n, J)
                data = None
            datadf = arrtodf(yijs, xijs)
            cutoffs1 = np.linspace(0, 2, 21)
            cutoffs2 = np.linspace(0.05, 1, 20)
            burnin = 5000
            num = 50000
            numboot = 1000

            # frequentist method, do the regression
            md = smf.mixedlm("Y~1+X", datadf, groups=datadf["group"])
            mdf = md.fit()
            tauhat, sigmahat = np.sqrt(mdf.cov_re.iloc[0,
                                                       0]), np.sqrt(mdf.scale)
            beta0hat, beta1hat = mdf.fe_params['Intercept'], mdf.fe_params['X']
            f1pvalues = frepvaluebeta(mdf, cutoffs1)

            # frequentist method, bootstrap, for tau
            if (root / f'bootstrap{n}_{J}_{numboot}.pkl').is_file():
                with open(root / f'bootstrap{n}_{J}_{numboot}.pkl', 'rb') as f:
                    taubtps = pickle.load(f)
            else:
                taubtps = tauparabootstrap(
                    n, J, [beta0hat, beta1hat, sigmahat, tauhat], xijs,
                    numboot)
            f2pvalues = frepvaluetau(taubtps, cutoffs2)
예제 #22
0
            row['MSI'] = MSI[i]
            row['sync'] = sync[i]
            row['hemi'] = hemis.index(hemi)
            row['condition'] = subset.index(condition)
            row['interaction'] = row['sync'] * row['condition']
            for tt in range(161):
                row['data_tt%s' % (tt)] = scores[i][tt]
            rows_list.append(row)

# One row per collected record; columns include data_tt0 ... data_tt160.
df = pd.DataFrame(rows_list)

# hemi * condition model fitted separately for each of the 161 time points;
# the first four parameters of every fit are stored row-wise.
coef_bin = np.zeros((161, 4))
for tt in range(161):
    md = smf.mixedlm(
        formula='data_tt{} ~ hemi * condition'.format(tt),
        data=df,
        groups=df['subject_number'],
        re_formula="~hemi*condition",
    )
    mdf = md.fit()
    # add intercept, and first 3 factor coefs to the bin
    coef_bin[tt] = mdf.params[:4]

# Plot every coefficient's time course on a shared axis from -200 to 600.
var_labels = ['intercept', 'hemi', 'condition', 'interaction']
times = np.linspace(-200, 600, 161)
for lab, coef in zip(var_labels, coef_bin.T):
    plt.plot(times, coef, label=lab)
plt.legend()
plt.show()

#____________________________________________________________
# y-preds compare accuracy
예제 #23
0
# Row/column counts of the two design matrices X and Z
# (presumably fixed- and random-effect designs, judging by the names -- TODO confirm).
N, nfixed = np.shape(X)
_, nrandm = np.shape(Z)
# generate data
# Six base weights perturbed by standard-normal noise.
w0 = [5.0, 1.0, 2.0, 8.0, 1.0, 1.0] + np.random.randn(6)
#w0 -= np.mean(w0)
#w0 = np.random.normal(size=(M,))
z0 = np.random.normal(size=(N1, )) * 10

# Simulated response: X @ w0 + Z @ z0 plus the flattened existing Y.
Pheno = np.dot(X, w0) + np.dot(Z, z0) + Y.flatten()
# Least-squares fit of Pheno on X; lstsq returns a tuple whose first item is the solution.
beta0 = np.linalg.lstsq(X, Pheno)

# Index of the largest entry in each row of X and Z.
fixedpred = np.argmax(X, axis=1)
randmpred = np.argmax(Z, axis=1)

# Store the simulated response in the table and fit the mixed model on it.
tbltest['Pheno'] = Pheno
md = smf.mixedlm("Pheno ~ Condi1*Condi2", tbltest, groups=tbltest["subj"])
mdf = md.fit()
# NOTE: Y is rebound here to Pheno as a column vector; it was read above.
Y = np.expand_dims(Pheno, axis=1)

fitted = mdf.fittedvalues

# Fixed-effect estimates as a one-column frame, re-indexed with `Terms`.
fe_params = pd.DataFrame(mdf.fe_params, columns=['LMM'])
fe_params.index = Terms
# Random effects, transposed, with the 'groups' column renamed to 'LMM'.
random_effects = pd.DataFrame(mdf.random_effects)
random_effects = random_effects.transpose()
random_effects = random_effects.rename(index=str, columns={'groups': 'LMM'})
#%% Real data
# NOTE: from here on tbltest refers to the behavioural data read from disk.
Tbl_beh = pd.read_csv('./behavioral_data.txt', delimiter='\t')
Tbl_beh["subj"] = Tbl_beh["subj"].astype('category')
tbltest = Tbl_beh
formula = "rt ~ group*orientation*identity"
예제 #24
0
# Per-family tables: column 0 is renamed to "ds", column 1 to "length",
# and a constant "family" label is added.
WRKY = pd.read_csv("ath_wrky_scores_length_list.txt", header=None)
WRKY = WRKY.rename(columns={0: "ds", 1: "length"})
WRKY["family"] = "WRKY"

MYB = pd.read_csv("ath_myb_scores_length_list.txt", header=None)
MYB = MYB.rename(columns={0: "ds", 1: "length"})
MYB["family"] = "MYB"

# Stack the three family tables. `DataFrame.append` was removed in
# pandas 2.0; `pd.concat` keeps the original row labels just as it did.
all_lengths = pd.concat([ap2, WRKY, MYB])

all_family_df = all_exonnumbers.merge(all_lengths, on=["ds", "family"])
all_family_df.to_csv("families_data.csv")

# All families together, grouped by family.
md = smf.mixedlm("ds ~ exon_number+length",
                 all_family_df,
                 groups=all_family_df["family"])

mdf = md.fit()
print(mdf.summary())

#---- ap2
# NOTE(review): the label queried here is lowercase 'ap2' while the labels
# assigned above are uppercase; confirm it matches the label used in `ap2`.
ap2_df = all_family_df.query("family == 'ap2'").copy()

md = smf.mixedlm("ds ~ exon_number+length", ap2_df, groups=ap2_df["family"])

mdf = md.fit()
print(mdf.summary())

#---- wrky
wrky_df = all_family_df.query("family == 'WRKY'").copy()
예제 #25
0
        'mep_category_percentile', 'mep_size'
    ],
                    axis=1), df_powers2['mep_category_absolute_binary'])
# Linear regression on the training split; its continuous prediction is
# thresholded at 0.5 to obtain a binary label.
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test) > 0.5
acc = dict(
    sub='all including sub variable',
    accuracy=accuracy_score(y_test, y_pred),
    r2=model.score(x_test, y_test),
)
accuracy_list.append(acc)
print(acc)
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True)
plt.xlabel('Target')
plt.ylabel('Prediction')

#%%
# Mixed model of the binary MEP category on beta power, grouped by `sub`.
# The model only receives the listed columns of df_powers.
md = smf.mixedlm(
    formula="mep_category_absolute_binary ~ power_beta",
    data=df_powers[[
        'power_alpha', 'power_beta', 'power_gamma',
        'power_beta_gamma', 'power_b2a', 'power_g2a',
        'power_bg2a', 'sub', 'mep_category_absolute_binary'
    ]],
    groups=df_powers["sub"],
)
mdf = md.fit()
print(mdf.summary())

#%%
    step = pm.NUTS(model.vars, scaling=start)

# Draw 3000 samples with the previously constructed step method and start point.
with model:
    trace = pm.sample(3000, step, start)

#%%
# Trace plot and the summary means of the 'group_effects' variable.
pm.traceplot(trace)
dftmp = pm.df_summary(trace, varnames=['group_effects'])
print(dftmp['mean'])
import statsmodels.formula.api as smf
# from patsy import dmatrices
import pandas as pd
# Same data as a table: three predictor columns, a categorical group, the response.
tbl = pd.DataFrame(predictors, columns=['C1', 'C2', 'C3'])
tbl['group'] = pd.Series(group, dtype="category")
tbl['yd'] = y
# Mixed model without intercept ("-1"), grouped by `group`.
md2 = smf.mixedlm("yd ~ -1 + C1 + C2 + C3", tbl, groups=tbl["group"])
mdf2 = md2.fit()
print(mdf2.summary())
#%%
# Least-squares fit on the predictors scaled by each observation's tiled group predictor.
X = np.tile(group_predictors[group], (1, 3)) * predictors
beta0 = np.linalg.lstsq(X, y)
fitted = np.dot(X, beta0[0])

import matplotlib.pyplot as plt
# Observed response (black), least-squares fit (green), sampled 'mu_est' mean (red).
plt.figure()
plt.plot(y, 'k')
plt.plot(fitted, 'g')

# Summary of 'mu_est' computed after discarding the first 1000 samples.
dftmp = pm.df_summary(trace[1000:], varnames=['mu_est'])
testdf = np.asarray(dftmp['mean'])
plt.plot(testdf, 'r')
예제 #27
0
    def _train(self, X, y):
        """Fit one mixed model per encoded column and collect the estimates.

        For every entry of ``self.ordinal_encoder.category_mapping`` a model
        with only an intercept and the column as grouping factor is fitted;
        the per-category estimates are returned as a ``pd.Series`` keyed by
        column name. Returns the dict ``{col: estimate}``.
        """
        # Use the configured target type, or guess it: a target with at most
        # two distinct values is treated as binomial.
        if self.binomial_target is None:
            binomial_target = len(y.unique()) <= 2
        else:
            binomial_target = self.binomial_target

        # The estimation does not have to converge -> at least converge to the same value.
        np.random.seed(2001)

        mapping = {}
        for switch in self.ordinal_encoder.category_mapping:
            col = switch.get('col')
            values = switch.get('mapping')
            data = self._rename_and_merge(X, y, col)

            try:
                # All warnings raised while fitting are silenced.
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore")
                    if binomial_target:
                        # Classification, returns (regularized) log odds per category as stored in vc_mean
                        # Note: md.predict() returns: output = fe_mean + vcp_mean + vc_mean[category]
                        md = bgmm.from_formula(
                            'target ~ 1', {'a': '0 + C(feature)'}, data
                        ).fit_vb()
                        # Recover the category code from names shaped like
                        # "C(feature)[<code>]".
                        index_names = []
                        for index_name in md.model.vc_names:
                            code = re.sub(r'C\(feature\)\[(\S+)\]', r'\1',
                                          index_name)
                            index_names.append(int(float(code)))
                        estimate = pd.Series(md.vc_mean, index=index_names)
                    else:
                        # Regression, returns (regularized) mean deviation of the observation's category from the global mean
                        md = smf.mixedlm(
                            'target ~ 1', data, groups=data['feature']
                        ).fit()
                        estimate = pd.Series({
                            key: value[0]
                            for key, value in md.random_effects.items()
                        })
            except np.linalg.LinAlgError:
                # Singular matrix -> just return all zeros
                estimate = pd.Series(np.zeros(len(values)), index=values)

            # Ignore unique columns. This helps to prevent overfitting on id-like columns
            if len(X[col].unique()) == len(y):
                estimate[:] = 0

            # Code -1 is the slot used for unknown categories.
            if self.handle_unknown == 'return_nan':
                estimate.loc[-1] = np.nan
            elif self.handle_unknown == 'value':
                estimate.loc[-1] = 0

            # Missing values: either the code mapped from NaN, or code -2.
            if self.handle_missing == 'return_nan':
                estimate.loc[values.loc[np.nan]] = np.nan
            elif self.handle_missing == 'value':
                estimate.loc[-2] = 0

            mapping[col] = estimate

        return mapping
예제 #28
0
def execute_glm(merneuro, int_cols, areas, formula, re_f):
    """Fit one mixed model per (interest column, suffix, area) combination.

    Input:
    --------
    merneuro: pd.DataFrame
        shape: linguistiq features (_conv, _part and _diff for each features) + brain areas as columns; sessions as rows
    int_cols: list
        list of strings, interest columns name, prgram argument "functions"
    areas: list
        list of areas in the neuro file. extracted before renaming occured
    formula: str
        raw formula for smf.mixedlm(); formatted with the zero-padded area
        and the suffixed interest column
    re_f: str
        re_formula for smf.mixedlm()

    Output:
    --------
    pvalues: dict
        contains models pvalues, shape {'int_col': {'formula': pd.DataFrame}}
    estimates: dict
        contains models estimates, shape {'int_col': {'formula': pd.DataFrame}}
    metadata: dict
        column names of the last p-value / estimate tables, with the last
        suffix replaced by '{}'; both lists are empty when ``int_cols`` is
        empty (previously this case raised a NameError)
    """
    import warnings

    pvalues = {}
    estimates = {}
    # Tables of the most recently processed interest column; they stay None
    # when there is nothing to process.
    p_c_dic = None
    e_c_dic = None
    formula_part = None
    has_agent = re.search('Agent', formula) is not None

    for c in int_cols:
        print(c)
        p_c_dic = {}
        e_c_dic = {}
        for formula_part in ['_part', '_conv', '_diff']:
            term = c + formula_part
            # Parameters whose p-values are kept (a separate name, so the
            # `int_cols` argument is no longer shadowed).
            terms = ['Intercept', term]
            if has_agent:
                terms += ['Agent[T.R]', term + ':Agent[T.R]']
            start_time = time.time()
            print('\t', formula_part)
            p_f_dic = []
            e_f_dic = []
            for ar in areas:
                formula_1 = formula.format(str(ar).zfill(3), term)
                print(formula_1)
                md = smf.mixedlm(formula_1,
                                 merneuro,
                                 groups=merneuro["locutor"],
                                 re_formula=re_f)
                with warnings.catch_warnings(record=True) as w:
                    mdf = md.fit()
                # Class name of the last warning raised during the fit, if any
                warning_name = w[-1].category.__name__ if w else None
                # Copies, so the result object's own series stay untouched
                p_to_dic = mdf.pvalues[terms].copy()
                p_to_dic['Warning'] = warning_name
                e_to_dic = mdf.fe_params.copy()
                e_to_dic['Warning'] = warning_name
                # Add to dic - no need to add "area" bc continuous set of areas, starting at 0 (control)
                p_f_dic.append(p_to_dic)
                e_f_dic.append(e_to_dic)
            p_c_dic[formula_part] = pd.DataFrame(p_f_dic)
            e_c_dic[formula_part] = pd.DataFrame(e_f_dic)
            print("\tElapsed: {0:4.2f}".format(time.time() - start_time))
        pvalues[c] = p_c_dic
        estimates[c] = e_c_dic

    if p_c_dic is None:
        metadata = {'pvalues': [], 'estimates': []}
    else:
        metadata = {
            'pvalues': [
                col.replace(formula_part, '{}')
                for col in p_c_dic[formula_part].columns
            ],
            'estimates': [
                col.replace(formula_part, '{}')
                for col in e_c_dic[formula_part].columns
            ],
        }

    return pvalues, estimates, metadata
예제 #29
0
#        for treatment in treatment_order[1:]:
#            comparison_group = data_to_compare[data_to_compare.treatment==treatment]
#            pvalue = scipy.stats.ttest_ind(control_group[var], comparison_group[var]).pvalue
#            result_table.append(dict(genotype=geno, variable=var, control_vs=treatment, pvalue=pvalue))
#result_table=pd.DataFrame(result_table)
#result_table['pvalue_adjusted']=sm.stats.multipletests(result_table.pvalue, method='hommel')[1]
#print(result_table)
#%% More advanced stats
import statsmodels.formula.api as smf
import statsmodels.api as sm

# Poisson GLM of the per-fish long_bouts counts on treatment x genotype,
# with the first entry of each ordering as the reference level.
lm = smf.glm(
    formula=(
        'long_bouts ~ C(treatment, Treatment(reference="{}"))'
        '*C(genotype, Treatment(reference="{}"))'
    ).format(treatment_order[0], genotype_order[0]),
    data=fishmeans,
    family=sm.families.Poisson(),
)
lm = lm.fit()

print(lm.summary())

#%% Special GLM for ZX1+PTZ
#bdf['PTZ'] = bdf.treatment.str.contains('PTZ')
#bdf['ZX1'] = bdf.treatment.str.contains('ZX1')
#lm=smf.glm(formula='boutlength ~ PTZ*ZX1', data=bdf, family=sm.families.Gamma(link=sm.families.links.log)).fit()
#print(lm.summary())
#%% Mixed effects model
# Bout length on treatment x genotype (same reference levels as above in
# the orderings), with fish as the grouping variable.
lme = smf.mixedlm(
    (
        'boutlength ~ C(treatment, Treatment(reference="{}"))'
        '*C(genotype, Treatment(reference="{}"))'
    ).format(treatment_order[0], genotype_order[0]),
    data=bdf,
    groups=bdf.fish,
)
lme = lme.fit()
print(lme.summary())
예제 #30
0
# %% LME modelling
#===============================================================================
def chisqprob(chisq, df):
    """Return the chi-square survival function (upper tail) at `chisq`."""
    return scipy.stats.chi2.sf(chisq, df)


def lrtest(llmin, llmax):
    """Likelihood-ratio test between two nested models.

    `llmin` / `llmax` are the log-likelihoods of the smaller and the larger
    model. Returns the statistic 2 * (llmax - llmin) and its p-value under a
    chi-square distribution with one degree of freedom.
    """
    statistic = 2 * (llmax - llmin)
    # llmax has 1 dof more than llmin
    return statistic, chisqprob(statistic, 1)

# Work on a copy and z-score every column except the identifier/factor ones.
ss = long_df.copy()
for cname in [col for col in ss.columns
              if col not in ('sID', 'side', 'task')]:
    ss[cname] = scipy.stats.zscore(ss[cname])
ss.to_csv(ROOTPATH / '03_Derivatives' / 'allData_long_zscored.tsv',
          sep='\t', index=False)

# Kappa values: a sequence of increasingly complex models, each grouped by
# subject and fitted by maximum likelihood (reml=False), in the listed order.
lme_K = [
    smf.mixedlm(kappa_formula, data=ss, groups=ss['sID']).fit(reml=False)
    for kappa_formula in (
        'K ~ 1',
        'K ~ task',
        'K ~ task + side',
        'K ~ task + side + SLF1_FA_contra',
        'K ~ task + side + task * SLF1_FA_contra',
        'K ~ task + side + task * SLF1_FA_contra + SLF1_FA_ipsi',
        'K ~ task + side + task * SLF1_FA_contra + task * SLF1_FA_ipsi',
        'K ~ task + side + task * SLF1_FA_contra + task * SLF1_FA_ipsi + SLF2_FA_contra',
        'K ~ task + side + task * SLF1_FA_contra + task * SLF1_FA_ipsi + task * SLF2_FA_contra',
        'K ~ task + side + task * SLF1_FA_contra + task * SLF1_FA_ipsi + task * SLF2_FA_contra + SLF2_FA_ipsi',
        'K ~ task + side + task * SLF1_FA_contra + task * SLF1_FA_ipsi + task * SLF2_FA_contra + task * SLF2_FA_ipsi',
        'K ~ task + side + task * SLF1_FA_contra + task * SLF1_FA_ipsi + task * SLF2_FA_contra + task * SLF2_FA_ipsi + SLF1_MD_contra',
        'K ~ task + side + task * SLF1_FA_contra + task * SLF1_FA_ipsi + task * SLF2_FA_contra + task * SLF2_FA_ipsi + task * SLF1_MD_contra',
        'K ~ task + side + task * SLF1_FA_contra + task * SLF1_FA_ipsi + task * SLF2_FA_contra + task * SLF2_FA_ipsi + task * SLF1_MD_contra + SLF2_MD_ipsi',
        'K ~ task + side + task * SLF1_FA_contra + task * SLF1_FA_ipsi + task * SLF2_FA_contra + task * SLF2_FA_ipsi + task * SLF1_MD_contra + task * SLF2_MD_ipsi',
    )
]
# Log-likelihood of every fitted model, in the same order.
llf_K = [mdl.llf for mdl in lme_K]
예제 #31
0
def _build_split_model(formula, data, mixedlm):
    """Build (unfitted) the model for one data half, falling back to plain OLS."""
    try:
        if mixedlm:
            return smf.mixedlm(formula, data=data, groups=data["trID"])
        # NOTE(review): 'trID' is appended without a separator; presumably
        # the formula is expected to end with "+" -- confirm with getFormula.
        return smf.ols(formula + 'trID', data=data)
    except Exception:
        return smf.ols(formula, data=data)


def getModel_testR2(dat, formula='', params=None, mixedlm=True, verbose=False):
    '''
    Obtains the test R2 based on even/odd splits of the data

    A model is fitted on the even trials and used to predict the odd trials,
    and vice versa; the combined out-of-sample predictions are scored
    against the 'zFR' column.

    Parameters
    ----------
    dat : dataframe with (at least) the columns 'EvenTrial', 'trID' and
        'zFR'. A 'Pred' column holding the predictions is added in place.
    formula : model formula. When empty, it is built from `params` with
        getFormula. An explicit formula is used as given (previously it was
        rejected as if no method had been provided).
    params : parameters handed to getFormula when `formula` is empty.
    mixedlm : if True fit a mixed model grouped by 'trID', otherwise OLS.
    verbose : if True print train/test performance of both splits.

    Returns
    -------
    The overall test R2, or np.nan if anything fails while fitting or
    scoring. When neither `formula` nor `params` is given the historical
    triple (np.nan, np.nan, []) is returned.
    '''

    if not formula:
        if params is None or len(params) == 0:
            print('No Method of selecting parameters provided.')
            return np.nan, np.nan, []
        formula = getFormula(params)
    print('\nComputing mixedlm with formula: {}'.format(formula))

    try:
        is_even = dat['EvenTrial'] == True
        is_odd = dat['EvenTrial'] == False
        dat_even = dat[is_even]
        dat_odd = dat[is_odd]

        # Train on even trials, predict the odd ones...
        mdf_even = _build_split_model(formula, dat_even, mixedlm).fit()
        pred_odd = mdf_even.predict(dat_odd)

        # ...and the other way round.
        mdf_odd = _build_split_model(formula, dat_odd, mixedlm).fit()
        pred_even = mdf_odd.predict(dat_even)

        if verbose:
            print('\nPerformance Train-Even:Test-Odd')
            print("Train_aR2 = {0:.3f}".format(aR2(mdf_even, dat_even['zFR'])))
            print("Model_AICc = {0:.3f}".format(AICc(mdf_even)))
            print("Test_R2 = {0:.3f}".format(R2(pred_odd, dat_odd['zFR'])))
            print('\nPerformance Train-Odd:Test-Even')
            print("Train_aR2 = {0:.3f}".format(aR2(mdf_odd, dat_odd['zFR'])))
            print("Model_AICc = {0:.3f}".format(AICc(mdf_odd)))
            print("Test_R2 = {0:.3f}".format(R2(pred_even, dat_even['zFR'])))

        # Each trial receives the prediction of the model that did not see it.
        dat['Pred'] = np.zeros(dat.shape[0])
        dat.loc[is_even, 'Pred'] = pred_even
        dat.loc[is_odd, 'Pred'] = pred_odd

        # NOTE(review): argument order differs from the R2 calls in the
        # verbose branch -- confirm against the R2 helper.
        r2 = R2(dat['zFR'], dat['Pred'])
        print('\nOverall test R2: {0:.3f}'.format(r2))
        return r2
    except Exception as err:
        # Best effort: report exception type, message and line, then give up.
        print("Error",
              type(err),
              err,
              err.__traceback__.tb_lineno)
        return np.nan
예제 #32
0
# To simplify this example we will only look at the right hand tapping
# condition so we now remove the left tapping conditions from the
# design matrix and GLM results
# Positions of the design-matrix columns whose name contains "Right";
# only those columns are kept.
dm_cols_not_left = np.flatnonzero(["Right" in c for c in dm.columns])
dm = dm[list(dm.columns[dm_cols_not_left])]


# %%
# Run group-level model
# ---------------------------------------------------------------------
#
# A linear mixed effects (LME) model is used to determine the effect
# of FIR delay for each chromophore on the evoked response with participant
# (ID) as a random variable.

lme = smf.mixedlm(
    formula='theta ~ -1 + delay:TidyCond:Chroma',
    data=df,
    groups=df["ID"],
).fit()

# The model is summarised below, and is not displayed here.
# You can display the model output using: lme.summary()


# %%
# Summarise group-level findings
# ---------------------------------------------------------------------
#
# Next the values from the model above are extracted into a dataframe for
# more convenient analysis below.
# A subset of the results is displayed, illustrating the estimated coefficients
# for oxyhaemoglobin (HbO) for the right hand tapping condition.

# Create a dataframe from LME model for plotting below
# comparison.
#
# Here are our import statements:

# ## Growth curves of pigs
#
# These are longitudinal data from a factorial experiment. The outcome
# variable is the weight of each pig, and the only predictor variable we
# will use here is "time".  First we fit a model that expresses the mean
# weight as a linear function of time, with a random intercept for each pig.
# The model is specified using formulas. Since the random effects structure
# is not specified, the default random effects structure (a random intercept
# for each group) is automatically used.

# Fetch the "dietox" data set of the R package "geepack".
data = sm.datasets.get_rdataset(dataname='dietox', package='geepack').data
# Weight as a linear function of Time, grouped by pig; no random-effects
# formula is given, so the default random intercept per group applies.
md = smf.mixedlm(formula="Weight ~ Time", data=data, groups=data["Pig"])
mdf = md.fit()
print(mdf.summary())

# Here is the same model fit in R using LMER:

# ```ipython
# %%R
# data(dietox, package='geepack')
# ```

# ```ipython
# %R print(summary(lmer('Weight ~ Time + (1|Pig)', data=dietox)))
# ```

# ```
예제 #34
0
    def run(self, h5_files, no_disks=100, out_dir=''):
        """Workflow of linear Mixed Models.

        Applies linear Mixed Models on bundles of subjects and saves the
        results in a directory specified by ``out_dir``.

        For every input file one model ``<metric> ~ group`` (grouped by
        subject) is fitted per disk; the p-value of the second model
        parameter is collected, and the p-values, their negative log10 and a
        plot are written to ``out_dir``.

        Parameters
        ----------

        h5_files : string
            Path to the input metric files. This path may
            contain wildcards to process multiple inputs at once.

        no_disks : integer, optional
            Number of disks used for dividing bundle into disks. (Default 100)

        out_dir : string, optional
            Output directory (default input file directory)

        Raises
        ------
        ValueError
            If a disk has fewer than 10 rows.

        """

        io_it = self.get_io_iterator()

        for file_path in io_it:

            logging.info('Applying metric {0}'.format(file_path))

            file_name, bundle_name, save_name = self.get_metric_name(file_path)
            logging.info(" file name = " + file_name)
            logging.info("file path = " + file_path)

            pvalues = np.zeros(no_disks)
            # Silence warnings only while this file is processed, instead of
            # switching them off for the rest of the process.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                # run mixed linear model for every disk
                for i in range(no_disks):
                    # The `where` expression below refers to this local
                    # variable by name, so it must keep the name `disk_count`.
                    disk_count = i + 1
                    df = pd.read_hdf(file_path, where='disk=disk_count')

                    logging.info("read the dataframe for disk number " +
                                 str(disk_count))
                    # check if data has significant data to perform LMM
                    if len(df) < 10:
                        raise ValueError(
                            "Dataset for Linear Mixed Model is too small")

                    criteria = file_name + " ~ group"
                    md = smf.mixedlm(criteria, df, groups=df["subject"])

                    mdf = md.fit()

                    # Second parameter by position; explicit `.iloc` avoids
                    # the deprecated integer fallback on a labelled Series.
                    pvalues[i] = mdf.pvalues.iloc[1]

                x = list(range(1, len(pvalues) + 1))
                y = -1 * np.log10(pvalues)

            save_file = os.path.join(out_dir, save_name + "_pvalues.npy")
            np.save(save_file, pvalues)

            save_file = os.path.join(out_dir, save_name + "_pvalues_log.npy")
            np.save(save_file, y)

            save_file = os.path.join(out_dir, save_name + ".png")
            self.save_lmm_plot(save_file, file_name, bundle_name, x, y)
예제 #35
0
import scipy.stats as st
import seaborn as sns
import matplotlib

# Analysis settings: time window (start, end), channel group and channel type.
time_window = (0.4, 0.8)
ch_group = "parietal"
ch_type = "grad"

# Load the epochs without baseline correction and build the per-observation
# table `df` plus the matching array `data` (both helpers are defined
# elsewhere).
X, meta, times, info = assemble_epochs_new(ch_type=ch_type, baseline=None)
df, data = prepare_erp(times, time_window, ch_group, meta, X, info)
# Rescale the response column by a channel-type specific factor
# (presumably a unit conversion for magnetometers/gradiometers -- TODO
# confirm).
if ch_type == "mag":
    df.data *= 1e14
elif ch_type == "grad":
    df.data *= 1e12

# Mixed linear model: `data` explained by `cond`, grouped by the "subj"
# column of `df`.
md = smf.mixedlm("data ~ cond", data=df, groups="subj")
mdf = md.fit()
print(mdf.summary())

legend = []
# Time points inside the half-open window [time_window[0], time_window[1]).
# Not used in the visible part of the script.
times_sel = times[np.logical_and(times >= time_window[0],
                                 times < time_window[1])]
# One curve per condition: select that condition's entries along the first
# axis of `data` and average over the first two axes.
for cond in np.unique(df.cond):
    plt.plot(times, data[df.cond == cond, :, :].mean(axis=(0, 1)))
    legend.append(cond)
plt.legend(legend)
plt.show()

# NOTE(review): the font settings are applied after the figure above has
# been shown, so they are presumably meant for later figures -- confirm.
font = {'family': 'normal', 'weight': 'bold', 'size': 22}
matplotlib.rc('font', **font)
# QQ-plot shows that our data are not normal. Something to think about
예제 #36
0
# Show which columns are available in the auction data frame.
print(auction_data.columns)

# %%
# Violin plot of the chosen auction amount per stimulus value, split by
# auction condition. The automatic legend is disabled and added manually
# below.
viol_plot = sns.catplot(x='auctionStimValue',
                        y='chosenAuctionAmount',
                        hue='auctionCondition',
                        data=auction_data,
                        palette="colorblind",
                        kind='violin',
                        height=6,
                        aspect=1.5,
                        legend=False)

# Fix the y-range and place the legend (matplotlib location code 2).
plt.ylim([0, 500])
viol_plot.ax.legend(loc=2, fontsize=18)

# %%
### Fit model to data

# Mixed linear model with main effects and interaction of stimulus value
# and auction condition, grouped by the "subcode" column.
md = smf.mixedlm("chosenAuctionAmount ~ auctionStimValue*auctionCondition",
                 auction_data,
                 groups=auction_data["subcode"])
mdf = md.fit()
print(mdf.summary())

# %%
## Diagnostic plots

# Store the model residuals as a new column so that the pair plot includes
# them next to the other variables.
auction_data['mixedlm_resid'] = mdf.resid
sns.pairplot(auction_data)
예제 #37
0
                             info, psd_params)

print(freqs)
# Rescale the response column by the squared channel-type factor
# (presumably because the values are power rather than amplitude -- TODO
# confirm).
if ch_type == "mag":
    df.data *= 1e14**2
elif ch_type == "grad":
    df.data *= 1e12**2

# Alternative transforms / model specifications that were tried, kept for
# reference:
# df.data = np.log(df.data)  #
# df.data = stats.boxcox(df.data)[0]
# md = smf.mixedlm("data ~ C(condition, Treatment('HIGH HIT'))", data=df, groups="subject",
#                  # re_formula="~condition"
#                  )
# Mixed linear model: `data` explained by `condition`, grouped by the
# "subject" column of `df`.
md = smf.mixedlm(
    "data ~ condition",
    data=df,
    groups="subject",
    # re_formula="~condition"
)
mdf = md.fit()
print(mdf.summary())

# df = df[df.data < 8]

# md = smf.mixedlm("data ~ confidence * is_correct", data=df,
#                  # re_formula="~ 0 + confidence",
#                  groups="subject")
# mdf = md.fit(method="powell")
# print(mdf.summary())

# Work on a copy of the table; average `psd` over its first two axes for
# the entries whose condition is "LOW HIT".
dd = df.copy()
d_low = psd[dd.condition == "LOW HIT", :, :].mean(axis=(0, 1))
예제 #38
0
]

# Stat prep
# Name of the dependent variable; it is substituted as the response into the
# model formulas built further down.
d_var = "TonAng"


# aicd_thresh = 5
def aic_pval(a, b):
    """Return the evidence ratio ``exp((a - b) / 2)`` for two AIC values.

    Parameters
    ----------
    a, b : float
        AIC values of the two models being compared.

    Returns
    -------
    float
        ``np.exp((a - b) / 2)``. The value is 1 when both AICs are equal
        and smaller than 1 when ``a`` is lower than ``b``.
    """
    half_delta = (a - b) / 2
    return np.exp(half_delta)


# Null model
# Intercept-only mixed model grouped by subject. It is fitted with
# reml=False (maximum likelihood) like the model below, and its AIC is the
# reference the other model is compared against.
print("Analyses for {}".format(d_var))
model = "{dv} ~ 1".format(dv=d_var)
res_0 = smf.mixedlm('{}'.format(model), data=NEMO,
                    groups=NEMO['Subject']).fit(reml=False)
print("Null model AIC =  ", res_0.aic)
null_aic = res_0.aic
last_aic = res_0.aic  # for deltas and comparisons

# Experimental variables
# Add the predictor named by `ton` (defined elsewhere) and report the AIC,
# the AIC improvement over the null model and the evidence ratio.
print("Testing Experiment Variables..")
print("TON")
model = "{dv} ~ {t}".format(dv=d_var, t=ton)
res_ton = smf.mixedlm('{}'.format(model), data=NEMO,
                      groups=NEMO['Subject']).fit(reml=False)
ton_aic = res_ton.aic
print("Ton model results -- AIC = ", ton_aic, ", AIC_delta = ",
      null_aic - ton_aic, ", AIC_p = ", aic_pval(ton_aic, null_aic))
# Same comparison for the predictor named by `emo` (defined elsewhere).
print("EMO")
model = "{dv} ~ {e}".format(dv=d_var, e=emo)
예제 #39
0
# Build a long-format table with one row per subject x task x condition
# combination, filled with example data.
df_2way_rm = pd.DataFrame(columns=["sub_id", "task", "condition", "my_value"])
my_row = 0
# unique subject-ID as additional factor
sub_id = 0
for sub in subs_list:
    sub_id = sub_id + 1
    for ind_t, task in enumerate(task_list):
        for ind_c, con in enumerate(condition_list):
            # generate random value here as example
            # (normal draw with mean ind_t + ind_c and standard deviation 1)
            my_val = np.random.normal(ind_t + ind_c, 1, 1)[0]
            df_2way_rm.loc[my_row] = [sub_id, task, con, my_val]
            my_row = my_row + 1

# conduct ANOVA using mixedlm
# (task, condition and their interaction; grouped by subject ID)
my_model_fit = smf.mixedlm("my_value ~ task * condition",
                           df_2way_rm,
                           groups=df_2way_rm["sub_id"]).fit()
# get fixed effects
# NOTE(review): the return values of the next two expressions are neither
# stored nor printed; they are only displayed in an interactive session.
my_model_fit.summary()
# get random effects
my_model_fit.random_effects

# conduct ANOVA using AnovaRM
# (dependent variable 'my_value', subject column 'sub_id', both factors
# within-subject); this rebinds `my_model_fit`.
my_model_fit = AnovaRM(df_2way_rm,
                       'my_value',
                       'sub_id',
                       within=['task', 'condition']).fit()
print(my_model_fit.anova_table)

# --------
# 4-way ANOVA with between-group and within-group factors (repeated measures)
예제 #40
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 10 11:29:35 2021

@author: zachz
"""
#%% Imports

import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.io as spio
import pandas as pd

#%% Load in data

# use .spydata file

#%% make into one big dataframe

# One mixed linear model over the pooled data frame: `tau` explained by
# `fr`, `brain_area` and `species`, grouped by the "dataset" column.
big_model = smf.mixedlm("tau ~ fr + brain_area + species",
                        df,
                        groups=df['dataset'])

fit = big_model.fit()

# The summary table belongs to the fitted results object; the unfitted
# model returned by smf.mixedlm does not provide summary().
print(fit.summary())
예제 #41
0
# pip install "plotnine==0.6.0" to use ggplot in python
###
# Animal weights, 8 animals for each of 5 sires (one row of the literal per
# sire).
weight = [61, 100,  56, 113,  99, 103,  75,  62,  ## sire 1
            75, 102,  95, 103,  98, 115,  98,  94,  ## sire 2
            58,  60,  60,  57,  57,  59,  54, 100,  ## sire 3
            57,  56,  67,  59,  58, 121, 101, 101,  ## sire 4
            59,  46, 120, 115, 115,  93, 105,  75 ] ## sire 5
# Sire label per animal: every sire id repeated 8 times in a row so that it
# lines up with `weight`.
sire=np.array([1,2,3,4,5])
sire=np.repeat(sire,8, axis=0)
animals = {'weight': weight, 'sire': pd.Categorical(sire)}
animals = pd.DataFrame(data=animals)
animals.info()
# plot
sb.stripplot(x="sire", y="weight" ,data=animals, size=10, edgecolor='red', linewidth=0.5, ax=None, dodge=True, hue="sire")
##
# Mixed linear model grouped by sire.
# NOTE(review): the formula "weight ~(1-sire)" is unusual -- confirm the
# intended fixed-effects design.
md = smf.mixedlm("weight ~(1-sire)", animals, groups="sire" )
mdf = md.fit()
# The results of summary() and conf_int() are not stored or printed here;
# they are only displayed in an interactive session.
mdf.summary()
mdf.conf_int(alpha=0.025)
from patsy.contrasts import Sum
levels = [1,2,3,4,5]
# Sum (deviation) contrast matrix for the five levels; not used in the
# visible part of the script.
contrast = Sum().code_without_intercept(levels)
# Ordinary least squares fit with sum-coded sire as a fixed effect, followed
# by a type 2 ANOVA table.
aov = ols('weight ~ C(sire, Sum)',data=animals).fit()
table = sm.stats.anova_lm(aov, typ=2) # Type 2 ANOVA DataFrame
print(table)
aov.conf_int(alpha=0.05)
randomeffect=mdf.random_effects
randomeffect
###  TA plot
# Fitted values (x) and residuals (y) of the mixed model as plain arrays.
x=np.array(mdf.fittedvalues)
y=np.array(mdf.resid)
예제 #42
0
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 14 15:34:16 2015

@author: d
"""

import statsmodels.api as sm 
import statsmodels.formula.api as smf

#data = sm.datasets.get_rdataset("dietox", "geepack").data

# Mixed linear model: `i` explained by `deprivation`, grouped by the "Genre"
# column of `meta` (defined elsewhere).
md = smf.mixedlm("i ~ deprivation", meta, groups=meta["Genre"])
mdf = md.fit()

# print() call form: valid on Python 3 and, for a single argument, prints
# the same text on Python 2.
print(mdf.summary())
예제 #43
0
# Upper outlier fence on the control group's CV: third quartile plus 1.5
# times the interquartile range.
Threshold = (ControlData.quantile(0.75)['CV']
             - ControlData.quantile(0.25)['CV']) * 1.5 \
             + ControlData.quantile(0.75)['CV']
# Lower bound on BVTV (0.1 / 3 * 2).
MinBVTV = 0.1 / 3 * 2

# 04 Filter data
# Keep only rows whose CV is below the fence and whose BVTV is above the
# minimum.
CVFilter = Data['CV'] < Threshold
Windowing = Data['BVTV'] > MinBVTV
FilteredData = Data[Windowing & CVFilter]

# 05 Build and fit the model
# Mixed linear model without intercept (trailing "-1"), grouped by subject,
# with one variance component each for Sii, Sij and Sjj.
Model = smf.mixedlm(
    "LogSxy ~ Sii + Sij + Sjj + LogBVTV + Logmxy + LogCV:(Sii+Sij+Sjj) -1",
    data=FilteredData,
    groups=FilteredData['Subject id'],
    vc_formula={
        "Sii": "Sii-1",
        "Sij": "Sij-1",
        "Sjj": "Sjj-1"
    })

Model_Fit = Model.fit()
print(Model_Fit.summary())

# 06 Verify residuals
# QQPlot is a helper defined elsewhere; it receives the residuals as a
# plain array.
QQPlot(Model_Fit.resid.values)

Figure, Axes = plt.subplots(1, 1, figsize=(5.5, 4.5), dpi=100)
Axes.plot(np.exp(Model_Fit.fittedvalues),
          Model_Fit.resid,
          linestyle='none',
                # Convert to dataframes
                df = pd.DataFrame(distances[face][fid]).T
                df, param_names = append_design(design_file, df)
                individ_dfs.append(df)

            df_params = pd.concat(individ_dfs, ignore_index=True)
            save_name = \
                os.path.join(output_path,
                             "{0}_face{1}_params.csv".format(set_name, face))
            df_params.to_csv(save_name)

            model = "*".join(param_names)

            result_tflux = sm.mixedlm(formula="dist_tflux ~ {}".format(model),
                                      data=df_params,
                                      groups=df_params["Cube"]).fit()
            save_name = \
                os.path.join(output_path,
                             "{0}_face{1}_tflux_fit.pkl".format(set_name, face))
            result_tflux.save(save_name)

            result_pflux = sm.mixedlm(formula="dist_pflux ~ {}".format(model),
                                      data=df_params,
                                      groups=df_params["Cube"]).fit()
            save_name = \
                os.path.join(output_path,
                             "{0}_face{1}_pflux_fit.pkl".format(set_name, face))
            result_pflux.save(save_name)

            result_sigma = sm.mixedlm(formula="dist_sigma ~ {}".format(model),
예제 #45
0
# Residual columns to analyse, one per ground-motion model listed below.
res_types = ['Trav_res', 'FP_res', 'Liu_res', 'Liu_pred_res']

# Specify models.
models = ['Travasarou', 'Foulser-Piggott', 'Liu', 'Liu_&_GMPE']

# Per-model results collected by the loop below.
Avg_Bias = []
Tau = []
Phi = []

for res_type in res_types:

    # Select event IDs.
    df2 = df[['USGS_eventID', res_type]]

    # Intercept-only mixed model of the residual column, grouped by event ID.
    mod = smf.mixedlm(" ".join([res_type, '~', '1']),
                      df2,
                      groups=df["USGS_eventID"])
    mod_fit = mod.fit()

    # Summary
    # NOTE(review): this binds the summary method without calling it --
    # confirm whether `mod_fit.summary()` was intended.
    summary = mod_fit.summary

    # Random effects
    rdm_effects = mod_fit.random_effects

    # Fixed effects
    # The fixed-effect intercept is recorded as the average bias.
    fxd_effects = mod_fit.fe_params.Intercept
    Avg_Bias.append(fxd_effects)

    # Random effect standard deviation (tau)
    # (element-wise square root of the fitted `cov_re`)
    tau = np.array(np.sqrt(mod_fit.cov_re))
예제 #46
0
파일: _mixedlm.py 프로젝트: biocore/gneiss
def mixedlm(formula, table, metadata, groups, **kwargs):
    """ Set up one linear mixed effects model per balance.

    Linear mixed effects (LME) models estimate the parameters of a linear
    regression with mixed effects and are commonly used for repeated
    measures, where several samples come from the same source.  Here the
    response is a matrix of balances (`table`), the covariates come from
    `metadata`, and the sample sources are given by `groups`.

    For every column of `table` a separate statsmodels model is created
    with that column as the response and `formula` as the right-hand side.
    The models are only constructed here, not fitted; they are returned
    inside an `LMEModel` container.

    Parameters
    ----------
    formula : str
        Right-hand side of the statistical equation, written the way
        equations are handled in R.  The dependent variable must not be
        included, because the formula is applied to each balance in turn.
        See `patsy` [1]_ for more details.
    table : pd.DataFrame
        Table where samples correspond to rows and balances correspond to
        columns.
    metadata : pd.DataFrame
        Metadata about the samples in `table`.  Samples correspond to rows
        and covariates correspond to columns.  A copy is passed through
        `_type_cast_to_float` before use, so the caller's frame is not
        modified by this function.
    groups : str
        Column name in `metadata` that specifies the groups, typically the
        individuals that were sampled repeatedly.
    **kwargs : dict
        Other arguments forwarded to `statsmodels.formula.api.mixedlm`.

    Returns
    -------
    LMEModel
        Container created with ``Y=table`` and ``Xs=None``.  Its
        `submodels` attribute holds one unfitted model per balance (in
        column order) and its `balances` attribute holds `table`.

    Raises
    ------
    ValueError
        If no sample is shared between the indexes of `table` and
        `metadata`.

    References
    ----------
    .. [1] https://patsy.readthedocs.io/en/latest/

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from gneiss.regression import mixedlm

    Define a table of balances with features `Y1`, `Y2` across 12 samples.

    >>> table = pd.DataFrame({
    ...   'u1': [ 1.00000053,  6.09924644],
    ...   'u2': [ 0.99999843,  7.0000045 ],
    ...   'u3': [ 1.09999884,  8.08474053],
    ...   'x1': [ 1.09999758,  1.10000349],
    ...   'x2': [ 0.99999902,  2.00000027],
    ...   'x3': [ 1.09999862,  2.99998318],
    ...   'y1': [ 1.00000084,  2.10001257],
    ...   'y2': [ 0.9999991 ,  3.09998418],
    ...   'y3': [ 0.99999899,  3.9999742 ],
    ...   'z1': [ 1.10000124,  5.0001796 ],
    ...   'z2': [ 1.00000053,  6.09924644],
    ...   'z3': [ 1.10000173,  6.99693644]},
    ...     index=['Y1', 'Y2']).T

    Define the external variables to test for.  This is a hypothetical
    longitudinal study across 3 time points with 4 patients `x`, `y`, `z`
    and `u`, where `x` and `y` were given treatment `1` and `z` and `u`
    were given treatment `2`.

    >>> metadata = pd.DataFrame({
    ...         'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
    ...         'treatment': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
    ...         'time': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
    ...     }, index=['x1', 'x2', 'x3', 'y1', 'y2', 'y3',
    ...               'z1', 'z2', 'z3', 'u1', 'u2', 'u3'])

    Since each patient was sampled repeatedly, the patients are used as
    groups, and `time` and `treatment` are tested simultaneously with
    respect to the balances.

    >>> res = mixedlm('time + treatment', table, metadata,
    ...               groups='patient')

    See Also
    --------
    statsmodels.regression.mixed_linear_model.MixedLM
    ols

    """
    covariates = _type_cast_to_float(metadata.copy())
    # Inner join on the sample index: only samples present in both frames
    # are kept.
    merged = pd.merge(table, covariates, left_index=True, right_index=True)
    if not len(merged):
        raise ValueError(("No more samples left.  Check to make sure that "
                          "the sample names between `metadata` and `table` "
                          "are consistent"))

    # One unfitted model per balance; mixed effects code is obtained here:
    # http://stackoverflow.com/a/22439820/1167475
    per_balance = [
        smf.mixedlm('%s ~ %s' % (balance, formula), data=merged,
                    groups=merged[groups],
                    **kwargs)
        for balance in table.columns
    ]

    # The container is created without design matrices (Xs=None) and the
    # statsmodels objects are attached afterwards.
    container = LMEModel(Y=table, Xs=None)
    container.submodels = per_balance
    container.balances = table
    return container
예제 #47
0
def mix_strain(data, feature, print_opt=True,
               nstrain=3, search_range=(3, 12), degree=1):
    """
    Test whether strains differ in their time pattern for one feature.

    Two linear mixed models are fitted to the aggregate data, both grouped
    by mouse. The first model contains hour, the strain dummy variables
    `strain0` and `strain1`, and the interactions between strain and hour;
    the second model contains no interaction terms. The two fits are
    compared with a likelihood ratio test.

    Parameters
    ----------
        data: pandas data frame, e.g. the output of the aggregate_data
            function. It must contain the columns "strain", "hour",
            "mouse" and the column named by `feature`. Strains are matched
            against 0 .. nstrain - 1, and three dummy columns are read, so
            three strains are expected. The frame is not modified; the
            function works on a copy.
        feature: member of ALL_FEATURES, e.g.
            {"AS", "F", "IS", "M_AS", "M_IS", "W", "Distance"}
        print_opt: True or False; if True the summaries of both fitted
            models are printed
        nstrain: positive integer, number of strains for which a cycle is
            looked up with find_cycle
        search_range: array contains two elements, forwarded to find_cycle
            as `search_range_find`
        degree: positive integer; `hour ** degree` is stored as "hour2".
            For degree == 1 the models use hour and cycle, otherwise they
            also use hour2.

    Returns
    -------
    float
        Likelihood ratio test p value: the upper tail probability of a
        chi-square distribution with 2 degrees of freedom at
        2 * |llf_1 - llf_2|. If it is below the significance level, we can
        conclude that the different strains have significantly different
        time patterns.

    Raises
    ------
    ValueError
        If `data` is not a pandas data frame or `feature` is not in
        ALL_FEATURES.

    Examples
    --------
    >>> result = mix_strain(data = aggregate_data("F",30), feature = "F",
    >>>          print_opt = False, degree = 2)
    >>> print(result)  # a probability between 0 and 1

    """
    if not isinstance(data, pd.DataFrame):
        raise ValueError(
            'Data must be a pandas data frame')
    if feature not in ALL_FEATURES:
        raise ValueError(
            'Input value must in {"AS", "F", "M_AS", "M_IS", "W", "Distance"}')

    # Work on a copy so that the helper columns added below never show up
    # in the caller's data frame.
    data = data.copy()

    # The cycle is always looked up for feature "W", independent of the
    # feature that is being modelled.
    data["cycle"] = 0
    for i in range(nstrain):
        result = find_cycle(feature="W", strain=i, plot=False,
                            search_range_find=search_range)
        cycle = result[0][0]
        data.loc[data["strain"] == i, "cycle"] = cycle

    # Positional access with .iloc: DataFrame.ix no longer exists in
    # current pandas.
    dummies = pd.get_dummies(data["strain"])
    data["strain0"] = dummies.iloc[:, 0]
    data["strain1"] = dummies.iloc[:, 1]
    data["strain2"] = dummies.iloc[:, 2]
    data["hour2"] = np.array(data["hour"].values)**degree
    data = data.drop('strain', axis=1)
    # Rename the modelled column to a fixed name so the formulas below do
    # not depend on the feature.
    names = data.columns.tolist()
    names[names.index(feature)] = 'feature'
    data.columns = names

    # NOTE(review): for degree != 1 the interaction model has no `cycle`
    # term although the reduced model does, so the two models are not
    # nested -- confirm this is intended.
    if degree == 1:
        full_formula = ("feature ~ hour + strain0 + strain1 + cycle"
                        " + strain0*hour + strain1*hour")
        reduced_formula = "feature ~ hour + cycle + strain0 + strain1"
    else:
        full_formula = ("feature ~ hour + hour2 + strain0 + strain1"
                        " + strain0*hour + strain1*hour"
                        " + strain0*hour2 + strain1*hour2")
        reduced_formula = ("feature ~ hour + hour2 + cycle + strain0"
                           " + strain1")

    # NOTE(review): both models are fitted with the statsmodels default
    # estimation method; confirm that comparing their log-likelihoods is
    # appropriate for models with different fixed effects.
    md1 = smf.mixedlm(full_formula, data, groups=data["mouse"])
    mdf1 = md1.fit()
    like1 = mdf1.llf
    if print_opt:
        print(mdf1.summary())

    md2 = smf.mixedlm(reduced_formula, data, groups=data["mouse"])
    mdf2 = md2.fit()
    like2 = mdf2.llf
    if print_opt:
        print(mdf2.summary())

    lr_stat = 2 * abs(like1 - like2)
    # A p value is the upper tail probability of the reference
    # distribution (survival function), not its density.
    p_v = chi2.sf(lr_stat, df=2)

    return p_v