def run(self, h5_files, no_disks=100, out_dir=''):
    """Workflow of linear Mixed Models.

    Applies linear Mixed Models on bundles of subjects and saves the
    results in a directory specified by ``out_dir``.

    Parameters
    ----------
    h5_files : string
        Path to the input metric files. This path may
        contain wildcards to process multiple inputs at once.
    no_disks : integer, optional
        Number of disks used for dividing bundle into disks. (Default 100)
    out_dir : string, optional
        Output directory (default input file directory)

    """
    io_it = self.get_io_iterator()

    for file_path in io_it:
        logging.info('Applying metric {0}'.format(file_path))

        # The file name minus its last three characters is used both as the
        # response column of the model formula and in the plot labels.
        file_name = os.path.basename(file_path)[:-3]
        df = pd.read_hdf(file_path)

        if len(df) < 100:
            raise ValueError("Dataset for Linear Mixed Model is too small")

        # The formula is the same for every bundle and disk of this file.
        criteria = file_name + " ~ group"

        for bundle in df.bundle.unique():
            sub_af = df[df['bundle'] == bundle]  # rows of this bundle only

            # Disks without any rows keep a p-value of 0, which turns into
            # +inf after the -log10 transform below.
            pvalues = np.zeros(no_disks)

            # run mixed linear model for every disk
            for i in range(no_disks):
                # disk numbers are matched against i + 1
                sub = sub_af[sub_af['disk#'] == (i + 1)]
                if len(sub) == 0:
                    continue

                md = smf.mixedlm(criteria, sub, groups=sub["subject"])
                mdf = md.fit()

                # Take the second p-value strictly by position. Going
                # through an array avoids integer-key ``Series[...]``
                # lookups, which pandas no longer treats as positional.
                pvalues[i] = np.asarray(mdf.pvalues)[1]

            x = list(range(1, len(pvalues) + 1))
            y = -1 * np.log10(pvalues)

            title = bundle + " on " + file_name + " Values"
            plot_file = os.path.join(out_dir,
                                     bundle + "_" + file_name + ".png")

            simple_plot(plot_file, title, x, y, "disk no",
                        "-log10(pvalues)")
def test_formulas(self):
    """Array-based, formula-based and ``formula.api`` MixedLM must agree.

    Simulates 100 groups of three observations with four fixed effects and
    one random slope, then checks parameter names and fitted parameters
    across the different ways of constructing the model.
    """
    np.random.seed(2410)
    fixed = np.random.normal(size=(300, 4))
    slope = np.random.normal(size=300)
    labels = np.kron(np.arange(100), [1, 1, 1])
    group_noise = slope * np.kron(np.random.normal(size=100), [1, 1, 1])
    response = fixed.sum(1) + group_noise + np.random.normal(size=300)

    array_model = MixedLM(response, fixed, labels, slope)

    # test the names
    array_data = array_model.data
    assert_(array_data.xnames == ["x1", "x2", "x3", "x4"])
    assert_(array_data.exog_re_names == ["x_re1"])
    assert_(array_data.exog_re_names_full == ["x_re1 RE"])

    array_fit = array_model.fit()

    # Fit with a formula, passing groups as the actual values.
    frame = pd.DataFrame({"endog": response})
    for k, column in enumerate(fixed.T):
        frame["exog%d" % k] = column
    frame["exog_re"] = slope
    fe_formula = "endog ~ 0 + exog0 + exog1 + exog2 + exog3"
    re_formula = "0 + exog_re"

    by_values = MixedLM.from_formula(fe_formula, frame,
                                     re_formula=re_formula, groups=labels)
    assert_(by_values.data.xnames == ["exog0", "exog1", "exog2", "exog3"])
    assert_(by_values.data.exog_re_names == ["exog_re"])
    assert_(by_values.data.exog_re_names_full == ["exog_re RE"])

    by_values_fit = by_values.fit()
    assert_almost_equal(array_fit.params, by_values_fit.params)

    # Fit with a formula, passing groups as the variable name.
    frame["groups"] = labels
    by_name = MixedLM.from_formula(fe_formula, frame,
                                   re_formula=re_formula, groups="groups")
    assert_(by_name.data.xnames == ["exog0", "exog1", "exog2", "exog3"])
    assert_(by_name.data.exog_re_names == ["exog_re"])
    assert_(by_name.data.exog_re_names_full == ["exog_re RE"])

    by_name_fit = by_name.fit(start_params=by_values_fit.params)
    assert_allclose(array_fit.params, by_name_fit.params, rtol=1e-4)

    # Check default variance structure with non-formula model
    # creation, also use different exog_re that produces a zero
    # estimated variance parameter.
    slope = np.ones(len(response), dtype=np.float64)
    intercept_model = MixedLM(response, fixed, labels, slope)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        intercept_fit = intercept_model.fit()

    from statsmodels.formula.api import mixedlm
    api_model = mixedlm(fe_formula, frame, groups="groups")
    assert_(api_model.data.exog_re_names == ["groups"])
    assert_(api_model.data.exog_re_names_full == ["groups RE"])
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        api_fit = api_model.fit()

    assert_almost_equal(intercept_fit.params, api_fit.params)
def test_formulas(self):
    """Formula-built MixedLM fits must match the array-built ones.

    Covers an explicit random-slope formula and the default variance
    structure obtained through ``statsmodels.formula.api.mixedlm``.
    """
    np.random.seed(2410)
    fixed = np.random.normal(size=(300, 4))
    slope = np.random.normal(size=300)
    labels = np.kron(np.arange(100), [1, 1, 1])
    group_noise = slope * np.kron(np.random.normal(size=100), [1, 1, 1])
    response = fixed.sum(1) + group_noise + np.random.normal(size=300)

    array_fit = MixedLM(response, fixed, labels, slope).fit()

    # Same data as a frame, one named column per regressor.
    frame = pd.DataFrame({"endog": response})
    for k, column in enumerate(fixed.T):
        frame["exog%d" % k] = column
    frame["exog_re"] = slope
    fe_formula = "endog ~ 0 + exog0 + exog1 + exog2 + exog3"
    re_formula = "0 + exog_re"

    formula_fit = MixedLM.from_formula(
        fe_formula, frame, re_formula=re_formula, groups=labels).fit()
    assert_almost_equal(array_fit.params, formula_fit.params)

    # Check default variance structure, with formula.api
    slope = np.ones(len(response), dtype=np.float64)
    intercept_fit = MixedLM(response, fixed, labels, slope).fit()

    from statsmodels.formula.api import mixedlm
    api_fit = mixedlm(fe_formula, frame, groups=labels).fit()
    assert_almost_equal(intercept_fit.params, api_fit.params)
def analyze_mixed_effects(data, effects, cov_name='group', data_name='data',
                          **kwargs):
    """Fit one sum-coded model per effect and collect per-level statistics.

    For every name in ``effects`` a ``mixedlm`` model is fitted with that
    effect as the leading term (``Sum`` coding, omitting its first level)
    and every other effect as an additional ``Sum``-coded term. Sum coding
    leaves one level without a coefficient, so the first level is recovered
    from a second fit that omits the second level instead.

    Parameters
    ----------
    data :
        Forwarded to ``_categorize_data`` together with the column names and
        ``kwargs``; the result is indexed by column name below.
    effects : iterable of str
        Names of the categorical effects to analyze. Each needs at least
        two distinct values.
    cov_name : str
        Column whose values define the groups of the mixed model.
    data_name : str
        Name of the dependent variable column.
    **kwargs
        Passed through to ``_categorize_data``.

    Returns
    -------
    betas, pvalues, conf_int : dict
        Each maps ``effect -> {level: value}``. ``conf_int`` holds the
        upper confidence bound minus the estimate.
    """
    betas = {}
    pvalues = {}
    conf_int = {}
    effects = list(effects)  # private copy: it is reordered below

    # Build data
    data = _categorize_data(data, [data_name] + effects, **kwargs)

    # Iterate over a snapshot so reordering ``effects`` cannot disturb the
    # iteration itself.
    for effect in list(effects):
        # Move the current effect to the first position
        effects.remove(effect)
        effects.insert(0, effect)

        # "+ C(b, Sum) + C(c, Sum) " for the remaining effects, or an empty
        # string when there are none, so the formula never ends in a
        # dangling "+".
        other_terms = "+".join(
            [''] + [" C(%s, Sum) " % other for other in effects[1:]])

        # Take 2 different values as ref.
        targets = np.unique(data[effect].values)

        model = mixedlm(
            "%s ~ C(%s, Sum('%s')) " % (data_name, effect, targets[0])
            + other_terms,
            data, groups=data[cov_name]).fit()
        upper_bound = model.conf_int()[1]

        # Retrieve the corresponding estimates
        this_betas = {}
        betas[effect] = this_betas
        this_pvalues = {}
        pvalues[effect] = this_pvalues
        this_conf_int = {}
        conf_int[effect] = this_conf_int

        # Parameter names of the leading term look like
        # "C(<effect>, Sum('<ref>'))[S.<level>]". Matching on "C(<effect>,"
        # keeps out other effects whose name merely starts with this one.
        prefix = "C(%s," % effect
        for k in model.params.keys():
            if not k.startswith(prefix):
                continue
            ename = k.split('[')[1][2:-1]  # strip "S." and the closing "]"
            this_betas[ename] = model.params[k]
            this_pvalues[ename] = model.pvalues[k]
            this_conf_int[ename] = upper_bound[k] - model.params[k]

        # Refit to get last target
        # NOTE(review): this refit uses ``ols`` rather than ``mixedlm``, so
        # the first level's statistics come from a different model than the
        # other levels - confirm this is intended.
        refit = ols(
            "%s ~ C(%s, Sum('%s')) " % (data_name, effect, targets[1])
            + other_terms,
            data).fit()
        key = "C(%s, Sum('%s'))[S.%s]" % (effect, targets[1], targets[0])
        this_betas[targets[0]] = refit.params[key]
        this_pvalues[targets[0]] = refit.pvalues[key]
        this_conf_int[targets[0]] = (refit.conf_int()[1][key]
                                     - refit.params[key])

    return betas, pvalues, conf_int
def analyze_effects(data, formula, model='ols', groups=None):
    """Fit a formula-based model to ``data`` and return the fitted result.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding every variable named in ``formula``.
    formula : str
        Model formula, handed unchanged to the selected statsmodels
        formula function.
    model : str, {'ols', 'mixedlm'}
        Which formula function to use.
    groups : str
        Forwarded as the ``groups`` keyword of ``mixedlm``; only used when
        ``model`` is 'mixedlm'.

    Returns
    -------
    model_fit
        Result of calling ``.fit()`` on the selected model. Use
        ``print(model_fit.summary())`` for an overview; parameters and
        p-values are available as ``model_fit.params`` and
        ``model_fit.pvalues``.

    Raises
    ------
    ValueError
        If ``data`` is not a DataFrame or ``model`` is not supported.
    """
    if not isinstance(data, pd.DataFrame):
        message = ("Given input 'data' should be like pandas Data frame."
                   " You provided {0}")
        raise ValueError(message.format(data))

    if model not in ('ols', 'mixedlm'):
        message = ("model={0} you specified is not implemented. "
                   "Choose between 'ols' or 'mixedlm'")
        raise ValueError(message.format(model))

    if model == 'ols':
        return ols(formula=formula, data=data).fit()

    # Only 'mixedlm' is left after the validation above.
    return mixedlm(formula=formula, data=data, groups=groups).fit()
def test_formulas(self):
    """MixedLM fits must agree across array and formula construction.

    Groups are supplied to the formula interface both as an array of values
    and as a column name, and the default variance structure of
    ``formula.api.mixedlm`` is compared with an explicit all-ones random
    effects design.
    """
    np.random.seed(2410)
    fixed = np.random.normal(size=(300, 4))
    slope = np.random.normal(size=300)
    labels = np.kron(np.arange(100), [1, 1, 1])
    group_noise = slope * np.kron(np.random.normal(size=100), [1, 1, 1])
    response = fixed.sum(1) + group_noise + np.random.normal(size=300)

    array_fit = MixedLM(response, fixed, labels, slope).fit()

    # Fit with a formula, passing groups as the actual values.
    frame = pd.DataFrame({"endog": response})
    for k, column in enumerate(fixed.T):
        frame["exog%d" % k] = column
    frame["exog_re"] = slope
    fe_formula = "endog ~ 0 + exog0 + exog1 + exog2 + exog3"
    re_formula = "0 + exog_re"
    by_values_fit = MixedLM.from_formula(
        fe_formula, frame, re_formula=re_formula, groups=labels).fit()
    assert_almost_equal(array_fit.params, by_values_fit.params)

    # Fit with a formula, passing groups as the variable name.
    frame["groups"] = labels
    by_name = MixedLM.from_formula(
        fe_formula, frame, re_formula=re_formula, groups="groups")
    by_name_fit = by_name.fit(start_params=by_values_fit.params)
    assert_allclose(array_fit.params, by_name_fit.params, rtol=1e-4)

    # Check default variance structure with non-formula model
    # creation.
    slope = np.ones(len(response), dtype=np.float64)
    intercept_model = MixedLM(response, fixed, labels, slope)
    intercept_fit = intercept_model.fit(start_params=by_values_fit.params)

    from statsmodels.formula.api import mixedlm
    api_model = mixedlm(fe_formula, frame, groups="groups")
    api_fit = api_model.fit(start_params=by_values_fit.params)
    assert_almost_equal(intercept_fit.params, api_fit.params)
renames={
    'Session':{
        'ofM':'naïve',
        'ofMaF':'acute',
        'ofMcF1':'chronic (2w)',
        'ofMcF2':'chronic (4w)',
        'ofMpF':'post',
        },
    },
)
plt.savefig('dr_activity.pdf')

import statsmodels.formula.api as smf
import numpy as np

# Mixed model of ``t`` on Session, treatment and their interaction, with the
# rows grouped by subject.
model = smf.mixedlm("t ~ Session * treatment", subjectdf, groups=subjectdf["subject"])
fit = model.fit()
report = fit.summary()
print(report)
print(fit.params)

# Build a 4-row contrast matrix for a joint F-test: start from the identity
# with its first and last rows dropped and keep the first four rows, so row
# i has a 1 in column i + 1.
omnibus_tests = np.eye(len(fit.params))[1:-1]
omnibus_tests = omnibus_tests[:4]
# Put -1 in columns 6-9 so each row tests (parameter i + 1) - (parameter
# i + 6) = 0. This requires at least 10 fitted parameters.
# NOTE(review): which terms these hard-coded columns pair up depends on the
# order of ``fit.params`` printed above - confirm against that output.
omnibus_tests[0,6] = -1
omnibus_tests[1,7] = -1
omnibus_tests[2,8] = -1
omnibus_tests[3,9] = -1
print(omnibus_tests)
anova = fit.f_test(omnibus_tests)
print(anova)
11: "Fall", 12: "Winter" } AQ['season'] = AQ['month'].map(dic) AQ = AQ.dropna() AQ = AQ[AQ['pm2.5'] > 0] AQ['pm25_log'] = np.log(AQ['pm2.5']) AQ_cv = AQ[AQ['cbwd'] == 'cv'] AQ_cv = AQ_cv[(AQ_cv['pm25_log'] > 2.2) & (AQ_cv['pm25_log'] < 6.8)] AQ_NE = AQ[AQ['cbwd'] == 'NE'] AQ_NE = AQ_NE[(AQ_NE['pm25_log'] > 0.5)] AQ_NW = AQ[AQ['cbwd'] == 'NW'] AQ_NW = AQ_NW[(AQ_NW['pm25_log'] > 0.5)] AQ_SE = AQ[AQ['cbwd'] == 'SE'] AQ_SE.sort_values(['pm25_log'], ascending=[False]) AQ_SE = AQ_SE[(AQ_SE['pm25_log'] > 0.5) & (AQ_SE['pm25_log'] < 6.291569)] AQ_new = pd.concat([AQ_cv, AQ_NE, AQ_NW, AQ_SE]) mixed = smf.mixedlm("pm25_log ~ year+month+day+hour+DEWP+TEMP+PRES+Is+Ir", AQ_new, groups=AQ_new["cbwd"], re_formula="~hour+PRES") mixed_fit = mixed.fit() print(mixed_fit.summary())
def create_df3(raw_data_location3):
    """Build per-group response-time statistics, then run models and plots.

    Reads the trial-level CSV, computes mean/SD/median/first-trial response
    times per (participant, type, occurence) group, adds an averaged switch
    RT and a switch-type label, fits a mixed model and three OLS/ANOVA
    models, runs t-tests and saves figures.

    Side effects visible in the body: prints to stdout, writes
    'WithSwitchType.csv' and several images under 'Figures/'.

    NOTE(review): the original indentation of this function was lost. The
    nesting below is a reconstruction; places where it could not be settled
    from the code alone are marked individually.

    Parameters
    ----------
    raw_data_location3 :
        Currently ignored: the first statement rebinds it to a hard-coded
        file opened from disk.

    Returns
    -------
    df_behavstats :
        The frame built up by the steps below, with its index reset.
    """
    # NOTE(review): this overwrites the argument with a hard-coded path; the
    # file object is never explicitly closed in this function.
    raw_data_location3 = open(r'C:\Users\danie\Documents\SURREY\Project_1\TaskSwitchingParadigm\online_TSP\second_online_cohort\pilot2_withoccurence.csv')
    # ``path`` is only referenced by the commented-out CSV export at the end.
    path = (r'C:\Users\danie\Documents\SURREY\Project_1\TaskSwitchingParadigm\online_TSP\second_online_cohort')
    df = pd.read_csv(raw_data_location3, header = 0)
    df_behavstats = pd.DataFrame()
    df_behavstats1 = pd.DataFrame()
    # df_behavstats2 and df_behavstats3 are not used again in this function.
    df_behavstats2 = pd.DataFrame()
    df_behavstats3 = pd.DataFrame()
    df_switch_type = pd.DataFrame()

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # LOOP WHICH CALCULATES AND CONCATS MAD, SD, MRT, MED
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    df.set_index(['auto_participant_id', 'type', 'occurence'], inplace = True)
    # Both names below refer to the same object as ``df`` (no copy is made).
    df_switch_type = df
    df_rt = df

    for group_i, group_v in df_rt.groupby(level=[0, 1, 2]):
        # Non-numeric cells become NaN; rows that are entirely NaN are dropped.
        group_v = group_v.apply(pd.to_numeric, errors = 'coerce').dropna(how = 'all')
        mask = group_v.index.get_level_values(2)  # not used afterwards
        mrt = group_v['response_time'].mean()
        SD = group_v['response_time'].std()
        med = group_v['response_time'].median()
        switchtrial0 = group_v['response_time'].iloc[0]

        ## The below can be used if you want to use more than the 1st switch trial to calculate switch cost
        # switchtrial1 = group_v['response_time'].iloc[1]
        # if n > 2:
        #     switchtrial2 = group_v['response_time'].iloc[2]

        # The statistics are written at the group's own index key.
        group_v.at[group_i, 'mean_rt'] = mrt
        group_v.at[group_i, 'SD_rt'] = SD
        group_v.at[group_i, 'median_rt'] = med
        group_v.at[group_i, 'first_switch_rt'] = switchtrial0
        group_v.reset_index(drop = False, inplace = True)

        df_behavstats1 = pd.concat([df_behavstats1, group_v], sort=False)

    df_behavstats1.set_index(['auto_participant_id', 'type', 'occurence'], inplace = True)
    # Drop columns whose name contains 'unnamed' (case-insensitive).
    df_behavstats1.drop(df_behavstats1.columns[df_behavstats1.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # LOOP WHICH CALCULATES AND CONCATS SWITCH RT
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    for group_i, group_v in df_behavstats1.groupby(level=[0, 1, 2]):
        # Count the rows (trials) in this group.
        n = 0
        for index, row in group_v.iterrows():
            n = n + 1
        # This dictates how many trials the RT is averaged over (m),
        # depending on how many trials are in the overall group (n).
        ##
        # e.g. when the number of overall trials in the group is less than 3
        # (if n < 3), then the number of trials to average over is 0
        # (m = 0), and the rows are left empty (np.nan).
        if n < 3:
            m = 0
            for index, row in group_v.iterrows():
                group_v.at[index, 'average_switch_rt'] = np.nan
        elif n >= 3 and n < 5:
            m = 2
        elif n >= 5:
            m = 3

        number_of_trials = 0
        overall_rt = 0
        # the 'islice' tells pandas to iterate with iterrows over the first
        # 'm' rows, where 'm' is dictated above and depends on the overall
        # number of trials, 'n', in the group.
        for index, row in islice(group_v.iterrows(), m):
            number_of_trials = number_of_trials + 1
            overall_rt = overall_rt + row['response_time']
            # NOTE(review): the division and assignment are placed inside
            # the loop because with m == 0 they would otherwise divide by
            # zero - confirm against the original layout.
            j = (overall_rt/number_of_trials)
            group_v.at[index, 'average_switch_rt'] = j
        # With inplace = False the result of this call is discarded.
        group_v.reset_index(drop = True, inplace = False)
        df_behavstats = pd.concat([df_behavstats, group_v], sort=True)

    # NOTE(review): assumed to run once after the loop above, not per group.
    df_behavstats = pd.concat([df_behavstats, df_switch_type.reindex(columns=df.columns)], axis=1)
    df_behavstats = df_behavstats.drop(columns=['response_time'])
    # Keep only the first row for each distinct mean_rt value.
    df_behavstats.drop_duplicates(subset="mean_rt", keep='first', inplace=True)
    df_behavstats.drop(df_behavstats.columns[df_behavstats.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)

    # when a group has less than 3 trials in it, the switch_rt is not
    # calculated (m = 0). if there are NaN values in any of the rows of a
    # column, that column returns NaN as a t-test value for any t-test
    # calculations it is involved in. therefore i have excluded those rows
    # below:
    print("")
    print("")
    print('BELOW DISPLAYS THE GROUP(S) WHICH HAVE BEEN EXCLUDED AS THERE WERE LESS THAN')
    print('3 TRIALS IN THE GROUP, CAUSING A NaN VALUE FOR THE T-TEST CALCULATIONS:')
    print("")
    # The printout lists rows with a NaN in any column, while the filter on
    # the next line only removes rows whose average_switch_rt is null.
    print(df_behavstats[df_behavstats.isna().any(axis=1)].index)
    df_behavstats = df_behavstats[pd.notnull(df_behavstats['average_switch_rt'])]
    print("")
    print("")
    df_behavstats.reset_index(drop=False, inplace=True)

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # SWITCH-TYPE COLUMN
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    # Keep only the first of any identically named columns.
    df_behavstats = df_behavstats.loc[:,~df_behavstats.columns.duplicated()]
    df_behavstats.set_index(['auto_participant_id', 'occurence', 'type'], inplace = True)

    for group_index, group_value in df_behavstats.groupby(level=[0, 1]):
        group_value.reset_index(drop = False, inplace = True)
        # ``row_iterator`` is advanced once here, so the second loop below
        # starts at the group's second row with ``previous`` holding the
        # first.
        row_iterator = group_value.iterrows()
        _, previous = next(row_iterator)
        # First row of the group: label it by its own task type.
        for index, row in group_value.iterrows():
            if np.logical_and(row['changed'] == 1, index == 0):
                if row['type'] == 'ts-trial-digit-span':
                    j = 'NONE-DS'
                if row['type'] == 'ts-trial-spatial-span':
                    j = 'NONE-SS'
                if row['type'] == 'ts-trial-spatial-rotation':
                    j = 'NONE-SR'
                if row['type'] == '':
                    pass
                # NOTE(review): assumed to sit inside the condition above.
                group_value.at[0, 'switch_type'] = j
        # Remaining rows: label by (previous type)-(current type).
        # NOTE(review): assumed to be a sibling of the loop above rather
        # than nested in it.
        for index, row in row_iterator:
            j = 'none'
            if row['changed'] == 1:
                if row['type'] == 'ts-trial-digit-span' and previous['type'] == 'ts-trial-spatial-span':
                    j = 'SS-DS'
                if row['type'] == 'ts-trial-digit-span' and previous['type'] == 'ts-trial-spatial-rotation':
                    j = 'SR-DS'
                if row['type'] == 'ts-trial-spatial-span' and previous['type'] == 'ts-trial-digit-span':
                    j = 'DS-SS'
                if row['type'] == 'ts-trial-spatial-span' and previous['type'] == 'ts-trial-spatial-rotation':
                    j = 'SR-SS'
                if row['type'] == 'ts-trial-spatial-rotation' and previous['type'] == 'ts-trial-digit-span':
                    j = 'DS-SR'
                if row['type'] == 'ts-trial-spatial-rotation' and previous['type'] == 'ts-trial-spatial-span':
                    j = 'SS-SR'
                if row['type'] == '' and previous['type'] == 'ts-trial-spatial-span':
                    pass
                if row['type'] == '' and previous['type'] == 'ts-trial-spatial-rotation':
                    pass
                if row['type'] == '' and previous['type'] == 'ts-trial-digit-span':
                    pass
            # NOTE(review): the nesting level of the next two statements
            # (per row vs. only for changed rows) is assumed - confirm.
            previous = row
            # group_value.reset_index(drop = True, inplace = True)
            group_value.at[index, 'switch_type'] = j
        df_behavstats = pd.concat([df_behavstats, group_value], sort=True)

    # Rows without a switch_type value are removed here.
    df_behavstats = df_behavstats.dropna(subset=['switch_type'])
    df_behavstats.to_csv('WithSwitchType.csv')

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # LMEM
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    # Mixed model of first_switch_rt on auto_participant_id, grouped by type.
    md1 = smf.mixedlm("first_switch_rt ~ auto_participant_id ", df_behavstats, groups=df_behavstats["type"])
    mdf1 = md1.fit()
    print('*************************************************************************************')
    print('LINEAR MIXED EFFECTS MODELS')
    print(mdf1.summary())

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # ANOVAs
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    # The three models below share the same right-hand side and differ only
    # in the response (first_switch_rt, mean_rt, median_rt).
    model = ols(
        'first_switch_rt ~ +C(type)+C(auto_participant_id)+C(switch_type)+C(occurence)+C(occurence):C(type)+C(occurence):C(auto_participant_id)+C(occurence):C(switch_type)+C(type):C(switch_type)+C(auto_participant_id):C(switch_type)+C(type):C(auto_participant_id)',
        data=df_behavstats
    ).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    print('*************************************************************************************')
    print('ANOVA TABLE FIRST SWITCH RT')
    print(anova_table)

    model1 = ols(
        'mean_rt ~ +C(type)+C(auto_participant_id)+C(switch_type)+C(occurence)+C(occurence):C(type)+C(occurence):C(auto_participant_id)+C(occurence):C(switch_type)+C(type):C(switch_type)+C(auto_participant_id):C(switch_type)+C(type):C(auto_participant_id)',
        data=df_behavstats
    ).fit()
    anova_table1 = sm.stats.anova_lm(model1, typ=2)
    print('*************************************************************************************')
    print('ANOVA TABLE MEAN RT')
    print(anova_table1)

    model2 = ols(
        'median_rt ~ +C(type)+C(auto_participant_id)+C(switch_type)+C(occurence)+C(occurence):C(type)+C(occurence):C(auto_participant_id)+C(occurence):C(switch_type)+C(type):C(switch_type)+C(auto_participant_id):C(switch_type)+C(type):C(auto_participant_id)',
        data=df_behavstats
    ).fit()
    anova_table2 = sm.stats.anova_lm(model2, typ=2)
    print('*************************************************************************************')
    print('ANOVA TABLE MEDIAN SWITCH RT')
    print(anova_table2)

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # T-TESTS
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    mean = df_behavstats['mean_rt']
    SD = df_behavstats['SD_rt']
    median = df_behavstats['median_rt']

    # Check here whether mean and median differ from one another; if so,
    # decide which to use. If not, move ahead with one or the other.
    g1 = stats.ttest_ind(median, mean, equal_var = False)
    print('*************************************************************************************')
    print('TTEST for difference between mean and median rt: All tasks, all occurences =', g1)

    rt1 = df_behavstats['first_switch_rt']
    rt123 = df_behavstats['average_switch_rt']
    f1 = stats.ttest_rel(rt1, rt123)
    print('*************************************************************************************')
    print('TTEST for difference between first and average rt: All tasks, all occurences =', f1)

    df_behavstats.set_index(['auto_participant_id', 'type', 'occurence'], inplace = True)

    # One paired t-test and one figure per (type, occurence) group.
    for group_i, group_v in df_behavstats.groupby(level=[1,2]):
        group_v.reset_index(drop = False, inplace = True)
        # NOTE(review): how much of the code below belonged to this inner
        # loop is not recoverable; only the two label lookups are kept in
        # it here. Both read the row labelled 1, so a group with a single
        # row would not have that label.
        for index, row in group_v.iterrows():
            task = group_v['type'].loc[1]
            occurence = group_v['occurence'].loc[1]
        SRT = group_v['first_switch_rt']
        MRT = group_v['average_switch_rt']
        n = len(MRT)
        x = range(0,n,1)
        ttest = stats.ttest_rel(MRT, SRT)
        print('*************************************************************************************')
        print('TASK TYPE=', task, 'OCCURENCE =', occurence)
        print('TTEST BETWEEN FIRST AND AVERAGE RT=', ttest)

        fig, axMRT = plt.subplots()
        color = 'tab:red'
        axMRT.set_xlabel('Number of trials')
        axMRT.set_ylabel('Mean RT', color=color)
        axMRT.plot(x, MRT, color=color)
        axMRT.set_ylim([0,3000])
        axMRT.tick_params(axis='y')
        axSRT = axMRT.twinx()  # instantiate a second axes that shares the same x-axis
        color= 'tab:blue'
        axSRT.set_ylabel('Switch RT', color=color)  # we already handled the x-label with ax1
        axSRT.plot(x, SRT, color=color)
        axSRT.set_ylim([0,3000])
        axSRT.tick_params(axis='y')
        t = str(task)
        o = str(occurence)
        name = 'Figures/ScatterPlot for' + t + 'Occurence=' + o +'.jpeg'
        plt.legend(loc='upper left');
        axSRT.text(0, 10, name, bbox={'facecolor': 'wheat', 'alpha': 0.5, 'pad': 10})
        fig.tight_layout()  # otherwise the right y-label is slightly clipped
        fig.savefig(name, dpi=400)

    df_behavstats.reset_index(drop = False, inplace = True)

    # # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # # Testing for learning effects
    # # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # df_behavstats.set_index(['auto_participant_id', 'type', 'occurence'], inplace = True)
    # for group_i, group_v in df_behavstats.groupby(level=[1,2]):
    #     group_v.reset_index(drop = False, inplace = True)
    #     last_occ = []
    #     first_occ = []
    #     for index, row in group_v.iterrows():
    #         if group_v['occurence'].loc[1] == 8 :
    #             last_occ = group_v['average_switch_rt']
    #             print('last_occ',last_occ)
    #         elif group_v['occurence'].loc[1] == 0 :
    #             first_occ = group_v['average_switch_rt']
    #             print('first_occ',first_occ)
    #         else:
    #             continue
    #     task = group_v['type'].loc[1]
    #     occurence = group_v['occurence'].loc[1]
    #     ttest = stats.ttest_rel(last_occ, first_occ)
    #     print('*************************************************************************************')
    #     print('TASK TYPE=', task, 'OCCURENCE =', occurence)
    #     print('TTEST BETWEEN FIRST LAST AVERAGE RT=', ttest)
    # df_behavstats.reset_index(drop = False, inplace = True)

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # Plots!
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    # Each figure is saved and then closed before the next one is drawn.
    ax1 = sns.boxplot(x='type', y='mean_rt', data=df_behavstats)
    ax1 = sns.swarmplot(x='type', y='mean_rt', data=df_behavstats, color=".25")
    figure1 = ax1.get_figure()
    figure1.savefig('Figures/boxplot_Mean_ShowDataPoints.png', dpi=400)
    plt.close()

    ax2 = sns.boxplot(x='type', y='mean_rt', hue='occurence', data=df_behavstats)
    figure2 = ax2.get_figure()
    figure2.savefig('Figures/boxplot_Mean_byTaskType.png', dpi=400)
    plt.close()

    ax3 = sns.boxplot(x='type', y='first_switch_rt', data=df_behavstats)
    ax3 = sns.swarmplot(x='type', y='first_switch_rt', data=df_behavstats, color=".25")
    figure3 = ax3.get_figure()
    figure3.savefig('Figures/boxplot_Switch_ShowDataPoints.png', dpi=400)
    plt.close()

    ax4 = sns.boxplot(x='type', y='first_switch_rt', hue='occurence', data=df_behavstats)
    figure4 = ax4.get_figure()
    figure4.savefig('Figures/boxplot_Switch_byTaskType.png', dpi=400)
    plt.close()

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # Write ttests to a .csv file
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    # a = stats.ttest_ind(mean, rt1)
    # MEANvsAVRT = stats.ttest_ind(mean, rt123)
    # d = stats.ttest_ind(median, rt1)
    # MEDvsAVRT = stats.ttest_ind(median, rt123)
    # standard_t_tests = [a,MEANvsAVRT,d,MEDvsAVRT]

    # a1 = stats.ttest_ind(mean, rt1, equal_var = False)
    # MEANvsAVRT1 = stats.ttest_ind(mean, rt123, equal_var = False)
    # d1 = stats.ttest_ind(median, rt1, equal_var = False)
    # MEDvsAVRT1 = stats.ttest_ind(median, rt123, equal_var = False)
    # welchs_t_tests = [a1,MEANvsAVRT1,d1,MEDvsAVRT1]

    # t_data = {'standard':standard_t_tests, 'welchs':welchs_t_tests}
    # t_rows = ['mean_vs_rt1', 'mean_vs_rt123', 'med_vs_rt1', 'med_vs_rt123']
    # df_t_tests = pd.DataFrame(data=t_data, index=t_rows)
    # name='TTests.csv'
    # dest = os.path.join(path, name)
    # df_t_tests.to_csv(dest)

    return df_behavstats
# Drop any other NaNs
data = data.dropna()

# Let's model some stuff! Treat the different CASA versions as random mixed
# effect
# Recode the version numbers as small integer labels. The assignment goes
# through a single ``.loc`` call: writing via ``data["CASAVer"][mask] = ...``
# is chained indexing, which may act on a temporary copy instead of ``data``.
for label, version in enumerate((440, 453, 460)):
    data.loc[data["CASAVer"] == version, "CASAVer"] = label
# data.loc[data["CASAVer"] == 470, "CASAVer"] = 3

# Create a version without any diverging cleans
good_data = data[data["peak_res"] < 0.01]

# Sum
sum_model = sm.mixedlm("sum ~ Tclean*AllFields*MScale*Mask*Model",
                       data=data,
                       groups=data["CASAVer"]).fit(reml=False)
print(sum_model.summary())

# Can't use Tclean. Makes matrix singular.
sum_model_good = sm.mixedlm("sum ~ AllFields*MScale*Mask*Model",
                            data=good_data,
                            groups=good_data["CASAVer"]).fit(reml=False)
print(sum_model_good.summary())
# Dominated by model (duh)

# Median
median_model = \
    sm.mixedlm("median ~ Tclean*AllFields*MScale*Mask*Model",
               data=data,
               groups=data["CASAVer"]).fit(reml=False)
print(median_model.summary())

# Can't use Tclean. Makes matrix singular.
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 30 16:28:13 2016

@author: emg
"""

import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

df = pd.read_csv(
    '/Users/emg/Programmming/GitHub/dissertation/data_handling/ranked_data.csv',
    index_col=0)

# Remove placeholder and moderator accounts before modelling.
df = df[~df['author'].isin(
    ['[deleted]', '#NAME?', 'AutoModerator', 'AskScienceModerator'])]
df['count'] = 1

# Rows of a single subreddit; the model below is fitted on the full frame.
sample = df[df['subreddit'] == 'AskAnthropology']

# Score as a function of rank, with a random intercept and a random rank
# slope for every author.
md = smf.mixedlm('score ~ rank', df, groups=df['author'],
                 re_formula='~rank')
mdf = md.fit()
print(mdf.summary())
# 03 Build linear models if Group == 'Healthy': HealthySystem = LinearSystem.copy() elif Group == 'OI': OISystem = LinearSystem.copy() ## Standard linear models Healthy_LM = smf.ols("LogSxy ~ Sii + Sij + Sjj + LogBVTV + Logmxy - 1", data=HealthySystem).fit() OI_LM = smf.ols("LogSxy ~ Sii + Sij + Sjj + LogBVTV + Logmxy - 1", data=OISystem).fit() ## Linear mixed-effect models Healthy_LMM = smf.mixedlm("LogSxy ~ Sii + Sij + Sjj + LogBVTV + Logmxy - 1", data=HealthySystem, groups=HealthySystem['Scan'], vc_formula={ "IF": "IF-1" }).fit(reml=True) OI_LMM = smf.mixedlm("LogSxy ~ Sii + Sij + Sjj + LogBVTV + Logmxy - 1", data=OISystem, groups=OISystem['Scan'], vc_formula={ "IF": "IF-1" }).fit(reml=True) ## Likelihood ratio test Healthy_p = LikelihoodRatioTest(Healthy_LM, Healthy_LMM, 1) print('p value of LRT for healthy group: ' + str(Healthy_p)) OI_p = LikelihoodRatioTest(OI_LM, OI_LMM, 1) print('p value of LRT for OI group: ' + str(OI_p))
import numpy as np
import pandas as pd
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Simulate N observations in equally sized groups, each group with its own
# random intercept, then fit a random-intercept model and inspect the
# estimated random effects.
np.random.seed(1234)

N = 1000
nGroups = 40
nPerGroup = N//nGroups

# The order of the random draws is part of the simulation: x, then the
# group effects, then (further down) the observation noise.
x = np.random.normal(size=N)
group = np.repeat(np.arange(nGroups), nPerGroup)
ranEff = np.random.normal(scale=.5, size=nGroups)

# Column vector: coefs[0] is the fixed intercept, coefs[1] the slope of x.
coefs = np.array([[2], [.2]])

randInts = coefs[0] + ranEff[group]
X = np.array([randInts, x]).T
y = randInts + x*coefs[1] + np.random.normal(scale=.75, size=N)
y.shape  # bare expression; only displays something in an interactive session

df = pd.DataFrame({'x': x, 'group': group, 'y': y})

### Model
mod0 = smf.mixedlm('y ~ x', data=df, groups=df['group'])
mod = mod0.fit()
print(mod.summary())  # intercept RE is in sd

# Q-Q plot of the estimated random intercepts against a normal distribution.
reModel = mod.random_effects['Intercept'][:]
sm.qqplot(reModel, sp.stats.norm, line='45', fit=True, scale=.25)
def coefplot(formula, data, intercept=False, ci=95, min_tvalue=2,
             mixed_effect="Cube", sig_color='r', nonsig_color='k',
             add_legend=False, figsize=None):
    """Plot the coefficients from a linear model.

    Each term is drawn as a horizontal confidence interval with a marker at
    the point estimate. Terms whose absolute t-value is below `min_tvalue`
    use `nonsig_color` and a '^' marker, the others `sig_color` and 'o'.

    Parameters
    ----------
    formula : string
        patsy formula for the model
    data : dataframe
        data for the plot; formula terms must appear in columns
    intercept : bool, optional
        if False, strips the intercept term before plotting
    ci : float, optional
        size of confidence intervals, in percent
    min_tvalue : float, optional
        absolute t-value below which a term is drawn as non-significant
    mixed_effect : string or None, optional
        column of `data` used as the grouping variable of a mixed model
        (fitted by ML); if None an OLS model is fitted instead
    sig_color, nonsig_color : matplotlib colors, optional
        colors for terms at/above and below `min_tvalue`
    add_legend : bool, optional
        if True, adds a legend explaining the two marker styles
    figsize : tuple, optional
        figure size; derived from the matplotlib defaults when None

    Raises
    ------
    ImportError
        if statsmodels is not installed
    """
    try:
        import statsmodels.formula.api as sf
    except ImportError:
        raise ImportError("The `coefplot` function requires statsmodels")
    import pandas as pd

    # Float division: with an integer `ci`, `ci / 100` truncates to 0 under
    # Python 2 and would silently turn alpha into 1.
    alpha = 1 - ci / 100.0

    if mixed_effect is None:
        model = sf.ols(formula, data).fit()
    else:
        model = sf.mixedlm(formula, data,
                           groups=data[mixed_effect]).fit(reml=False)

    # Order by term order (main effects first, then by interaction depth).
    # This should be a default in statsmodels IMO.
    ind = pd.Index(sorted(model.params.index, key=lambda x: x.count(":")))
    coefs = model.params.loc[ind]
    cis = model.conf_int(alpha).loc[ind]
    tvals = np.abs(model.tvalues).loc[ind]

    # Terms that are not plotted: optionally the intercept, and for mixed
    # models the random-intercept variance parameter. The latter is called
    # "Intercept RE" in older statsmodels releases and "Group Var" in newer.
    dropped = []
    if not intercept:
        dropped.append("Intercept")
    if mixed_effect is not None:
        dropped.extend(["Intercept RE", "Group Var"])
    keep = ~ind.isin(dropped)

    # `.loc` replaces the `.ix` indexer, which no longer exists in pandas.
    coefs = coefs.loc[keep]
    cis = cis.loc[keep]
    tvals = tvals.loc[keep]
    model_effects = ind[keep]

    n_terms = len(coefs)

    rep_name = {'fc': "F", "pb": r'$\beta$', 'm': r'$\mathcal{M}$',
                'k': r'$k$', 'sf': r'$\zeta$', 'vp': r'$\alpha$'}

    if figsize is None:
        w, h = mpl.rcParams["figure.figsize"]
        # Same size the original lambdas produced (1.5 * h / 2 and
        # n * w / (4 * n / 5)), without dividing by the number of terms.
        figsize = (0.75 * h, 1.25 * w)

    fig, ax = p.subplots(1, 1, figsize=figsize)

    for i, term in enumerate(coefs.index):
        if tvals.loc[term] < min_tvalue:
            color = nonsig_color
            symbol = '^'
        else:
            color = sig_color
            symbol = 'o'

        low, high = cis.loc[term]
        ax.plot([low, high], [i, i], c=color,
                solid_capstyle="round", lw=2.5)
        ax.plot(coefs.loc[term], i, symbol, c=color, ms=8)

    ax.set_ylim(-.5, n_terms - .5)
    ax.set_xlabel("Coefficent Values")
    ax.axvline(0, ls="--", c="dimgray")
    ax.grid(True)

    # Rotate x ticks labels
    for label in ax.get_xticklabels():
        label.set_rotation(90)

    # Change to the nice parameter labels; terms without a nice label keep
    # their own name instead of raising KeyError.
    altered_labels = [
        ":".join(rep_name.get(param, param) for param in mod.split(":"))
        for mod in model_effects
    ]

    p.yticks(np.arange(len(model_effects)), altered_labels, rotation=0)

    if add_legend:
        # Add in a legend with the symbols wrt min_tval
        sig_artist = p.Line2D((0, 1), (0, 0), color=sig_color,
                              marker='o', linestyle='-')
        nonsig_artist = p.Line2D((0, 1), (0, 0), color=nonsig_color,
                                 marker='^', linestyle='-')
        ax.legend([sig_artist, nonsig_artist],
                  [r"$t$-value > {0:.2f}".format(min_tvalue),
                   r"$t$-value < {0:.2f}".format(min_tvalue)],
                  frameon=True, loc='best')

    p.tight_layout()
# comparison. # # Here are our import statements: # ## Growth curves of pigs # # These are longitudinal data from a factorial experiment. The outcome # variable is the weight of each pig, and the only predictor variable we # will use here is "time". First we fit a model that expresses the mean # weight as a linear function of time, with a random intercept for each pig. # The model is specified using formulas. Since the random effects structure # is not specified, the default random effects structure (a random intercept # for each group) is automatically used. data = sm.datasets.get_rdataset('dietox', 'geepack').data md = smf.mixedlm("Weight ~ Time", data, groups=data["Pig"]) mdf = md.fit() print(mdf.summary()) # Here is the same model fit in R using LMER: # Note that in the Statsmodels summary of results, the fixed effects and # random effects parameter estimates are shown in a single table. The # random effect for animal is labeled "Intercept RE" in the Statmodels # output above. In the LME4 output, this effect is the pig intercept under # the random effects section. # # There has been a lot of debate about whether the standard errors for # random effect variance and covariance parameters are useful. In LME4, # these standard errors are not displayed, because the authors of the # package believe they are not very informative. While there is good reason
### Only kelz vs lisu # AQ_new = AQ_new[np.logical_and(np.isin(AQ_new['system1'],['kelz','lisu']),np.isin(AQ_new['system2'],['kelz','lisu']))] ### Only musicians # goldmsi_med = np.median(AQ_new['goldmsi']) # AQ_new = AQ_new[AQ_new['goldmsi']>goldmsi_med] ## Co-dependent variables: * ## Random variables: +(var/user_id) --> /user_id ## Also check multiple regression (simple) ## Also check multiple logistic regression ## Mixed Effects Model mixed = smf.mixedlm( "difficulty ~ f_diff+f_system2+f_1_2+goldmsi+recognised+answer", AQ_new, groups='question_id') mixed_fit = mixed.fit() print(mixed_fit.summary()) ### logistic regression # feature_columns = ["recognised","difficulty","f_system1","f_system2","goldmsi"] # X = AQ_new.loc[:, feature_columns].values # # y=AQ_new.answer # clf = LogisticRegression().fit(X, y) # print clf.coef_ # for c,feat in zip(clf.coef_[0],feature_columns): # print feat, 'coef', c ###############################################################
renames={ 'Session':{ 'ofM':'naïve', 'ofMaF':'acute', 'ofMcF1':'chronic (2w)', 'ofMcF2':'chronic (4w)', 'ofMpF':'post', }, }, ) plt.savefig('drs_activity_full.pdf') import statsmodels.formula.api as smf import numpy as np model = smf.mixedlm("t ~ Session * treatment", subjectdf, groups=subjectdf["subject"]) fit = model.fit() report = fit.summary() print(report) print(fit.params) omnibus_tests = np.eye(len(fit.params))[1:-1] omnibus_tests = omnibus_tests[:4] omnibus_tests[0,6] = -1 omnibus_tests[1,7] = -1 omnibus_tests[2,8] = -1 omnibus_tests[3,9] = -1 print(omnibus_tests) anova = fit.f_test(omnibus_tests) print(anova)
# the dPTE label connections to look at
columns = ["Subject", "Emo", "Connection", "dPTE"]
emos = ["neg", "pos"]
conxs = ["LO-IP", "LO-IT", "LO-SM", "LO-TT", "IP-IT",
         "IP-SM", "IP-TT", "IT-SM", "IT-TT", "SM-TT"]

# separate LMMs for each connection
# setup LMM
# for Alpha band
print("Doing stats on ALPHA dPTE connections")
for conx in conxs:
    print("Calculating LMM for connection: {}".format(conx))
    df = df_NEM_dPTE_alpha[df_NEM_dPTE_alpha.Connection == conx].infer_objects()

    # Intercept-only model with a random intercept per subject (ML fit so the
    # AIC values of the two models are comparable).
    try:
        res_0 = smf.mixedlm('dPTE ~ 1', data=df, groups=df['Subject']).fit(reml=False)
        print("Null model:")
        print(res_0.params)
        print(res_0.summary())
        print("AIC: {}".format(res_0.aic))
    except Exception:
        # Best effort: skip this connection, but let KeyboardInterrupt and
        # SystemExit through (a bare `except:` would swallow them too).
        print("Null Model could not converge...")
        continue

    # Same model with emotion as a fixed effect.
    try:
        res_emo = smf.mixedlm('dPTE ~ Emo', data=df, groups=df['Subject']).fit(reml=False)
    except Exception:
        print("Emotion Model could not converge...")
        continue
    print("Emotion model:")
    print(res_emo.params)
    print(res_emo.summary())
# In[19]:

# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the print statement is a SyntaxError on Python 3.
print('Dimensions with missing dependent variable removed: %d rows, %d columns' % (child_study_data_no_na.shape[0], child_study_data_no_na.shape[1]))

# In[20]:

# fitting a model without repeated measures (i.e., no random slopes)
# because I cannot entirely figure out what Statsmodels is doing
# with the random effects structure
# also notice that Statsmodels does not fit quite as quickly as lme4
m100_lmm = smf.mixedlm('M100LatCorr ~ Hem + Cond + Case + Site + Age_Calc ',
                       data=child_study_data_no_na,
                       groups=child_study_data_no_na['Subject'])
m100_lmm_fit = m100_lmm.fit()

# And now to inspect the output:

# In[21]:

print(m100_lmm_fit.summary())

# ##Calling R from Python
# The `rpy2` package allows for calling R directly from Python. Let's use it to see if we get similar results for the coefficients and how the summary output differs. The cells below illustrate how to do this directly in Python (with or without a notebook).
#
linewidth=2) # Plotting all Groups in one graph Legend = sns.barplot(y='Score', x='Group', data=MMMM, palette="colorblind", hue='Condition') Legend.legend(loc='upper center', bbox_to_anchor=(1.45, 0.8), shadow=True, ncol=1) plt.show() # Model for L1 Spanish md = smf.mixedlm("Score ~ Tense*Type", SMMM, groups=SMMM["Group"]) mdf1 = md.fit(method="cg") print(mdf1.summary()) # Model for L1 Croatian md = smf.mixedlm("Score ~ Tense*Type", CMMM, groups=CMMM["Group"]) mdf2 = md.fit(method='nm') print(mdf2.summary()) # Model for L1 German md = smf.mixedlm("Score ~ Tense*Type", GMMM, groups=GMMM["Group"]) mdf3 = md.fit(method='nm') print(mdf3.summary()) # Model for ALL md = smf.mixedlm("Score ~ Tense*Type*Group", MMMM, groups=MMMM["Group"]) mdf = md.fit(method='cg') print(mdf.summary())
# generate the data if (root / f'data{n}_{J}.pkl').is_file(): with open(root / f'data{n}_{J}.pkl', 'rb') as f: data, xijs, yijs = pickle.load(f) else: yijs, xijs = gendata(n, J) data = None datadf = arrtodf(yijs, xijs) cutoffs1 = np.linspace(0, 2, 21) cutoffs2 = np.linspace(0.05, 1, 20) burnin = 5000 num = 50000 numboot = 1000 # frequentist method, do the regression md = smf.mixedlm("Y~1+X", datadf, groups=datadf["group"]) mdf = md.fit() tauhat, sigmahat = np.sqrt(mdf.cov_re.iloc[0, 0]), np.sqrt(mdf.scale) beta0hat, beta1hat = mdf.fe_params['Intercept'], mdf.fe_params['X'] f1pvalues = frepvaluebeta(mdf, cutoffs1) # frequentist method, bootstrap, for tau if (root / f'bootstrap{n}_{J}_{numboot}.pkl').is_file(): with open(root / f'bootstrap{n}_{J}_{numboot}.pkl', 'rb') as f: taubtps = pickle.load(f) else: taubtps = tauparabootstrap( n, J, [beta0hat, beta1hat, sigmahat, tauhat], xijs, numboot) f2pvalues = frepvaluetau(taubtps, cutoffs2)
row['MSI'] = MSI[i] row['sync'] = sync[i] row['hemi'] = hemis.index(hemi) row['condition'] = subset.index(condition) row['interaction'] = row['sync'] * row['condition'] for tt in range(161): row['data_tt%s' % (tt)] = scores[i][tt] rows_list.append(row) df = pd.DataFrame(rows_list) # hemi % condition coef_bin = np.zeros([161, 4]) for tt in range(161): md = smf.mixedlm('data_tt%s ~ hemi * condition' % (tt), df, groups=df['subject_number'], re_formula="~hemi*condition") mdf = md.fit() # add intercept, and first 3 factor coefs to the bin coef_bin[tt, :] = mdf.params[0:4] var_labels = ['intercept', 'hemi', 'condition', 'interaction'] times = np.linspace(-200, 600, 161) for coef, lab in zip(coef_bin.T, var_labels): plt.plot(times, coef, label=lab) plt.legend() plt.show() #____________________________________________________________ # y-preds compare accuracy
# Row count and number of columns of the fixed (X) and random (Z) matrices.
N, nfixed = np.shape(X)
_, nrandm = np.shape(Z)
# generate data
# Six fixed-effect weights: the listed values plus standard-normal noise.
w0 = [5.0, 1.0, 2.0, 8.0, 1.0, 1.0] + np.random.randn(6)
#w0 -= np.mean(w0)
#w0 = np.random.normal(size=(M,))
z0 = np.random.normal(size=(N1, )) * 10
Pheno = np.dot(X, w0) + np.dot(Z, z0) + Y.flatten()
# np.linalg.lstsq returns a tuple; the solution is its first element.
beta0 = np.linalg.lstsq(X, Pheno)
fixedpred = np.argmax(X, axis=1)
randmpred = np.argmax(Z, axis=1)

tbltest['Pheno'] = Pheno
md = smf.mixedlm("Pheno ~ Condi1*Condi2", tbltest, groups=tbltest["subj"])
mdf = md.fit()
# From here on Y is the simulated phenotype as a column vector.
Y = np.expand_dims(Pheno, axis=1)

fitted = mdf.fittedvalues

fe_params = pd.DataFrame(mdf.fe_params, columns=['LMM'])
fe_params.index = Terms
random_effects = pd.DataFrame(mdf.random_effects)
random_effects = random_effects.transpose()
random_effects = random_effects.rename(index=str, columns={'groups': 'LMM'})

#%% Real data
Tbl_beh = pd.read_csv('./behavioral_data.txt', delimiter='\t')
Tbl_beh["subj"] = Tbl_beh["subj"].astype('category')
tbltest = Tbl_beh
formula = "rt ~ group*orientation*identity"
WRKY = pd.read_csv("ath_wrky_scores_length_list.txt", header=None)
WRKY.rename(columns={0: "ds", 1: "length"}, inplace=True)
WRKY["family"] = "WRKY"

MYB = pd.read_csv("ath_myb_scores_length_list.txt", header=None)
MYB.rename(columns={0: "ds", 1: "length"}, inplace=True)
MYB["family"] = "MYB"

# Stack the three family tables. DataFrame.append was removed in pandas 2.0;
# pd.concat with default arguments yields the same result (original row
# labels kept, rows in ap2 / WRKY / MYB order).
all_lengths = pd.concat([ap2, WRKY, MYB])

all_family_df = all_exonnumbers.merge(all_lengths, on=["ds", "family"])
all_family_df.to_csv("families_data.csv")

# All families together, with a random intercept per family.
md = smf.mixedlm("ds ~ exon_number+length", all_family_df, groups=all_family_df["family"])
mdf = md.fit()
print(mdf.summary())

#---- ap2
# NOTE(review): after this query `family` holds a single value, so the model
# below has only one group -- confirm that is intended.
ap2_df = all_family_df.query("family == 'ap2'").copy()
md = smf.mixedlm("ds ~ exon_number+length", ap2_df, groups=ap2_df["family"])
mdf = md.fit()
print(mdf.summary())

#---- wrky
wrky_df = all_family_df.query("family == 'WRKY'").copy()
'mep_category_percentile', 'mep_size' ], axis=1), df_powers2['mep_category_absolute_binary']) model = LinearRegression().fit(x_train, y_train) y_pred = model.predict(x_test) y_pred = y_pred > 0.5 acc = accuracy_score(y_test, y_pred) acc = { 'sub': 'all including sub variable', 'accuracy': acc, 'r2': model.score(x_test, y_test) } accuracy_list.append(acc) print(acc) sns.heatmap(confusion_matrix(y_test, y_pred), annot=True) plt.xlabel('Target') plt.ylabel('Prediction') #%% md = smf.mixedlm("mep_category_absolute_binary ~ power_beta", df_powers[[ 'power_alpha', 'power_beta', 'power_gamma', 'power_beta_gamma', 'power_b2a', 'power_g2a', 'power_bg2a', 'sub', 'mep_category_absolute_binary' ]], groups=df_powers["sub"]) mdf = md.fit() print(mdf.summary()) #%%
step = pm.NUTS(model.vars, scaling=start) with model: trace = pm.sample(3000, step, start) #%% pm.traceplot(trace) dftmp = pm.df_summary(trace, varnames=['group_effects']) print(dftmp['mean']) import statsmodels.formula.api as smf # from patsy import dmatrices import pandas as pd tbl = pd.DataFrame(predictors, columns=['C1', 'C2', 'C3']) tbl['group'] = pd.Series(group, dtype="category") tbl['yd'] = y md2 = smf.mixedlm("yd ~ -1 + C1 + C2 + C3", tbl, groups=tbl["group"]) mdf2 = md2.fit() print(mdf2.summary()) #%% X = np.tile(group_predictors[group], (1, 3)) * predictors beta0 = np.linalg.lstsq(X, y) fitted = np.dot(X, beta0[0]) import matplotlib.pyplot as plt plt.figure() plt.plot(y, 'k') plt.plot(fitted, 'g') dftmp = pm.df_summary(trace[1000:], varnames=['mu_est']) testdf = np.asarray(dftmp['mean']) plt.plot(testdf, 'r')
def _train(self, X, y): # Initialize the output mapping = {} # Estimate target type, if necessary if self.binomial_target is None: if len(y.unique()) <= 2: binomial_target = True else: binomial_target = False else: binomial_target = self.binomial_target # The estimation does not have to converge -> at least converge to the same value. np.random.seed(2001) for switch in self.ordinal_encoder.category_mapping: col = switch.get('col') values = switch.get('mapping') data = self._rename_and_merge(X, y, col) try: with warnings.catch_warnings(): warnings.filterwarnings("ignore") if binomial_target: # Classification, returns (regularized) log odds per category as stored in vc_mean # Note: md.predict() returns: output = fe_mean + vcp_mean + vc_mean[category] md = bgmm.from_formula('target ~ 1', { 'a': '0 + C(feature)' }, data).fit_vb() index_names = [ int( float( re.sub(r'C\(feature\)\[(\S+)\]', r'\1', index_name))) for index_name in md.model.vc_names ] estimate = pd.Series(md.vc_mean, index=index_names) else: # Regression, returns (regularized) mean deviation of the observation's category from the global mean md = smf.mixedlm('target ~ 1', data, groups=data['feature']).fit() tmp = dict() for key, value in md.random_effects.items(): tmp[key] = value[0] estimate = pd.Series(tmp) except np.linalg.LinAlgError: # Singular matrix -> just return all zeros estimate = pd.Series(np.zeros(len(values)), index=values) # Ignore unique columns. This helps to prevent overfitting on id-like columns if len(X[col].unique()) == len(y): estimate[:] = 0 if self.handle_unknown == 'return_nan': estimate.loc[-1] = np.nan elif self.handle_unknown == 'value': estimate.loc[-1] = 0 if self.handle_missing == 'return_nan': estimate.loc[values.loc[np.nan]] = np.nan elif self.handle_missing == 'value': estimate.loc[-2] = 0 mapping[col] = estimate return mapping
def execute_glm(merneuro, int_cols, areas, formula, re_f):
    """Fit one linear mixed model per (feature, suffix, area) combination.

    For each feature in ``int_cols`` and each suffix in ``_part``, ``_conv``,
    ``_diff``, the formula template is filled with the zero-padded area and
    the suffixed feature name, and fitted with ``merneuro["locutor"]`` as
    grouping variable.

    Input:
    --------
    merneuro: pd.DataFrame
        shape: linguistic features (_conv, _part and _diff for each feature)
        + brain areas as columns; sessions as rows
    int_cols: list
        list of strings, interest columns name, program argument "functions"
    areas: list
        list of areas in the neuro file. extracted before renaming occurred
    formula: str
        raw formula for smf.mixedlm(); a ``str.format`` template receiving the
        zero-padded area and the suffixed feature name, in that order
    re_f: str
        re_formula for smf.mixedlm()

    Output:
    --------
    pvalues: dict
        contains models pvalues, shape {'int_col': {'suffix': pd.DataFrame}}
        with one row per area
    estimates: dict
        contains models estimates, same shape as ``pvalues``
    metadata: dict
        column names of the last feature's ``_diff`` tables with the suffix
        replaced by ``'{}'``; empty lists when nothing was fitted
    """
    import warnings

    suffixes = ['_part', '_conv', '_diff']
    has_agent = re.search('Agent', formula) is not None

    pvalues = {}
    estimates = {}
    # Defined up front so the metadata step also works when int_cols is empty.
    p_c_dic = {}
    e_c_dic = {}
    for c in int_cols:
        print(c)
        p_c_dic = {}
        e_c_dic = {}
        for formula_part in suffixes:
            term = c + formula_part
            # Model terms whose p-values are kept (a separate name, so the
            # `int_cols` argument being iterated is not overwritten).
            terms = ['Intercept', term]
            if has_agent:
                terms += ['Agent[T.R]', term + ':Agent[T.R]']
            start_time = time.time()
            print('\t', formula_part)
            p_f_dic = []
            e_f_dic = []
            for ar in areas:
                formula_1 = formula.format(str(ar).zfill(3), term)
                print(formula_1)
                md = smf.mixedlm(formula_1, merneuro,
                                 groups=merneuro["locutor"], re_formula=re_f)
                with warnings.catch_warnings(record=True) as w:
                    # Without "always", a warning already issued from the same
                    # code location may not be recorded again for later fits.
                    warnings.simplefilter("always")
                    mdf = md.fit()
                # Class name of the last warning raised during the fit, if any
                # (e.g. ConvergenceWarning or RuntimeWarning).
                warning_name = w[-1].category.__name__ if w else None

                p_to_dic = mdf.pvalues[terms].copy()
                p_to_dic['Warning'] = warning_name
                # Copy so the result object's own fe_params is not modified.
                e_to_dic = mdf.fe_params.copy()
                e_to_dic['Warning'] = warning_name
                # Add to dic - no need to add "area" bc continuous set of
                # areas, starting at 0 (control)
                p_f_dic.append(p_to_dic)
                e_f_dic.append(e_to_dic)
            p_c_dic[formula_part] = pd.DataFrame(p_f_dic)
            e_c_dic[formula_part] = pd.DataFrame(e_f_dic)
            print("\tElapsed: {0:4.2f}".format(time.time() - start_time))
        pvalues[c] = p_c_dic
        estimates[c] = e_c_dic

    # Column templates taken from the last feature's last suffix, as before,
    # but without relying on loop variables that may never have been bound.
    last = suffixes[-1]
    metadata = {
        'pvalues': ([col.replace(last, '{}') for col in p_c_dic[last].columns]
                    if last in p_c_dic else []),
        'estimates': ([col.replace(last, '{}') for col in e_c_dic[last].columns]
                      if last in e_c_dic else []),
    }
    return pvalues, estimates, metadata
# for treatment in treatment_order[1:]: # comparison_group = data_to_compare[data_to_compare.treatment==treatment] # pvalue = scipy.stats.ttest_ind(control_group[var], comparison_group[var]).pvalue # result_table.append(dict(genotype=geno, variable=var, control_vs=treatment, pvalue=pvalue)) #result_table=pd.DataFrame(result_table) #result_table['pvalue_adjusted']=sm.stats.multipletests(result_table.pvalue, method='hommel')[1] #print(result_table) #%% More advanced stats import statsmodels.formula.api as smf import statsmodels.api as sm # Test the counts of long_bouts per fish in each treatment group lm = smf.glm( formula= f'long_bouts ~ C(treatment, Treatment(reference="{treatment_order[0]}"))*C(genotype, Treatment(reference="{genotype_order[0]}"))', data=fishmeans, family=sm.families.Poisson()).fit() print(lm.summary()) #%% Special GLM for ZX1+PTZ #bdf['PTZ'] = bdf.treatment.str.contains('PTZ') #bdf['ZX1'] = bdf.treatment.str.contains('ZX1') #lm=smf.glm(formula='boutlength ~ PTZ*ZX1', data=bdf, family=sm.families.Gamma(link=sm.families.links.log)).fit() #print(lm.summary()) #%% Mixed effects model lme = smf.mixedlm( f'boutlength ~ C(treatment, Treatment(reference="{treatment_order[0]}"))*C(genotype, Treatment(reference="{genotype_order[0]}"))', data=bdf, groups=bdf.fish).fit() print(lme.summary())
# %% LME modelling
#===============================================================================
def chisqprob(chisq, df):
    """Upper-tail probability of a chi-square statistic with `df` degrees of freedom."""
    return scipy.stats.chi2.sf(chisq, df)


def lrtest(llmin, llmax):
    """Likelihood-ratio statistic and p-value for two nested log-likelihoods."""
    lr = 2 * (llmax - llmin)
    # llmax has 1 dof more than llmin
    return lr, chisqprob(lr, 1)


ss = long_df.copy()

# z-scoring of every column except the identifier / factor columns
for cname in [col for col in ss.columns if col not in ['sID', 'side', 'task']]:
    ss[cname] = scipy.stats.zscore(ss[cname])

ss.to_csv(ROOTPATH / '03_Derivatives' / 'allData_long_zscored.tsv',
          sep='\t', index=False)

# Kappa values
# Ladder of increasingly complex models, all with a random intercept per
# subject and fitted by ML (reml=False); each step adds one term to the
# previous formula.
lme_K = [
    smf.mixedlm(fml, data=ss, groups=ss['sID']).fit(reml=False)
    for fml in (
        'K ~ 1',
        'K ~ task',
        'K ~ task + side',
        'K ~ task + side + SLF1_FA_contra',
        'K ~ task + side + task * SLF1_FA_contra',
        'K ~ task + side + task * SLF1_FA_contra + SLF1_FA_ipsi',
        'K ~ task + side + task * SLF1_FA_contra + task * SLF1_FA_ipsi',
        'K ~ task + side + task * SLF1_FA_contra + task * SLF1_FA_ipsi + SLF2_FA_contra',
        'K ~ task + side + task * SLF1_FA_contra + task * SLF1_FA_ipsi + task * SLF2_FA_contra',
        'K ~ task + side + task * SLF1_FA_contra + task * SLF1_FA_ipsi + task * SLF2_FA_contra + SLF2_FA_ipsi',
        'K ~ task + side + task * SLF1_FA_contra + task * SLF1_FA_ipsi + task * SLF2_FA_contra + task * SLF2_FA_ipsi',
        'K ~ task + side + task * SLF1_FA_contra + task * SLF1_FA_ipsi + task * SLF2_FA_contra + task * SLF2_FA_ipsi + SLF1_MD_contra',
        'K ~ task + side + task * SLF1_FA_contra + task * SLF1_FA_ipsi + task * SLF2_FA_contra + task * SLF2_FA_ipsi + task * SLF1_MD_contra',
        'K ~ task + side + task * SLF1_FA_contra + task * SLF1_FA_ipsi + task * SLF2_FA_contra + task * SLF2_FA_ipsi + task * SLF1_MD_contra + SLF2_MD_ipsi',
        'K ~ task + side + task * SLF1_FA_contra + task * SLF1_FA_ipsi + task * SLF2_FA_contra + task * SLF2_FA_ipsi + task * SLF1_MD_contra + task * SLF2_MD_ipsi',
    )
]
llf_K = [mdl.llf for mdl in lme_K]
def getModel_testR2(dat, formula='', params=None, mixedlm=True, verbose=False):
    '''
    Obtains the test R2 based on even/odd splits of the data

    A model is trained on the even trials and used to predict the odd trials,
    and vice versa; the two sets of out-of-sample predictions are written to
    ``dat['Pred']`` (the caller's frame is modified) and scored against
    ``dat['zFR']``.

    Parameters
    ----------
    dat : DataFrame with at least the columns 'EvenTrial', 'trID' and 'zFR'
    formula : model formula; takes precedence over ``params`` when given
    params : sequence used to build the formula via ``getFormula`` when no
        formula is given
    mixedlm : if True fit a mixed model grouped by 'trID', otherwise OLS
    verbose : if True print train/test scores for both split directions

    Returns
    -------
    The overall test R2, or ``np.nan`` if anything fails while fitting or
    scoring. When neither ``formula`` nor ``params`` is provided the legacy
    value ``(np.nan, np.nan, [])`` is returned.
    '''
    if params is None:
        params = []

    # Only the absence of BOTH a formula and params is an error; previously a
    # caller-supplied formula also fell into the "no method" branch.
    if len(formula) == 0:
        if len(params) == 0:
            print('No Method of selecting parameters provided.')
            return np.nan, np.nan, []
        formula = getFormula(params)

    print('\nComputing mixedlm with formula: {}'.format(formula))

    def _fit(train):
        """Build the requested model on `train`, falling back to plain OLS."""
        try:
            if mixedlm:
                model = smf.mixedlm(formula, data=train, groups=train["trID"])
            else:
                # NOTE(review): this appends 'trID' directly to the formula
                # text, which only forms a valid term if the formula ends
                # with an operator -- confirm against getFormula.
                model = smf.ols(formula + 'trID', data=train)
        except Exception:
            model = smf.ols(formula, data=train)
        return model.fit()

    try:
        dat_even = dat[dat['EvenTrial'] == True]
        dat_odd = dat[dat['EvenTrial'] == False]

        mdf_even = _fit(dat_even)
        pred_odd = mdf_even.predict(dat_odd)

        mdf_odd = _fit(dat_odd)
        pred_even = mdf_odd.predict(dat_even)

        if verbose:
            # NOTE(review): R2 is called as (prediction, observed) here but as
            # (observed, prediction) for the overall score below -- confirm
            # that R2 is symmetric in its arguments.
            print('\nPerformance Train-Even:Test-Odd')
            print("Train_aR2 = {0:.3f}".format(aR2(mdf_even, dat_even['zFR'])))
            print("Model_AICc = {0:.3f}".format(AICc(mdf_even)))
            print("Test_R2 = {0:.3f}".format(R2(pred_odd, dat_odd['zFR'])))
            print('\nPerformance Train-Odd:Test-Even')
            print("Train_aR2 = {0:.3f}".format(aR2(mdf_odd, dat_odd['zFR'])))
            print("Model_AICc = {0:.3f}".format(AICc(mdf_odd)))
            print("Test_R2 = {0:.3f}".format(R2(pred_even, dat_even['zFR'])))

        # Each trial receives the prediction of the model that did not see it.
        dat['Pred'] = np.zeros(dat.shape[0])
        dat.loc[dat['EvenTrial'] == True, 'Pred'] = pred_even
        dat.loc[dat['EvenTrial'] == False, 'Pred'] = pred_odd

        r2 = R2(dat['zFR'], dat['Pred'])
        print('\nOverall test R2: {0:.3f}'.format(r2))
        return r2
    except Exception:
        # Deliberate best effort: report the failure and return NaN.
        print("Error", sys.exc_info()[0], sys.exc_info()[1],
              sys.exc_info()[2].tb_lineno)
        return np.nan
# To simplify this example we will only look at the right hand tapping # condition so we now remove the left tapping conditions from the # design matrix and GLM results dm_cols_not_left = np.where(["Right" in c for c in dm.columns])[0] dm = dm[[dm.columns[i] for i in dm_cols_not_left]] # %% # Run group-level model # --------------------------------------------------------------------- # # A linear mixed effects (LME) model is used to determine the effect # of FIR delay for each chromophore on the evoked response with participant # (ID) as a random variable. lme = smf.mixedlm('theta ~ -1 + delay:TidyCond:Chroma', df, groups=df["ID"]).fit() # The model is summarised below, and is not displayed here. # You can display the model output using: lme.summary() # %% # Summarise group-level findings # --------------------------------------------------------------------- # # Next the values from the model above are extracted into a dataframe for # more convenient analysis below. # A subset of the results is displayed, illustrating the estimated coefficients # for oxyhaemoglobin (HbO) for the right hand tapping condition. # Create a dataframe from LME model for plotting below
# comparison. # # Here are our import statements: # ## Growth curves of pigs # # These are longitudinal data from a factorial experiment. The outcome # variable is the weight of each pig, and the only predictor variable we # will use here is "time". First we fit a model that expresses the mean # weight as a linear function of time, with a random intercept for each pig. # The model is specified using formulas. Since the random effects structure # is not specified, the default random effects structure (a random intercept # for each group) is automatically used. data = sm.datasets.get_rdataset('dietox', 'geepack').data md = smf.mixedlm("Weight ~ Time", data, groups=data["Pig"]) mdf = md.fit() print(mdf.summary()) # Here is the same model fit in R using LMER: # ```ipython # %%R # data(dietox, package='geepack') # ``` # ```ipython # %R print(summary(lmer('Weight ~ Time + (1|Pig)', data=dietox))) # ``` # ```
def run(self, h5_files, no_disks=100, out_dir=''):
    """Workflow of linear Mixed Models.

    Applies linear Mixed Models on bundles of subjects and saves the
    results in a directory specified by ``out_dir``.

    For every input file one model ``<metric> ~ group`` with ``subject`` as
    grouping variable is fitted per disk. The p-value at position 1 of the
    fit result is stored for each disk, and the p-values, their negative
    log10 and a plot are written to ``out_dir``. A ``ValueError`` is raised
    when a disk holds fewer than 10 rows.

    Parameters
    ----------
    h5_files : string
        Path to the input metric files. This path may
        contain wildcards to process multiple inputs at once.

    no_disks : integer, optional
        Number of disks used for dividing bundle into disks. (Default 100)

    out_dir : string, optional
        Output directory (default input file directory)

    """
    io_it = self.get_io_iterator()

    # Silence warnings only while this workflow runs; the caller's warning
    # filters are restored on exit instead of being changed for the whole
    # process.
    with warnings.catch_warnings():
        for file_path in io_it:

            logging.info('Applying metric {0}'.format(file_path))
            file_name, bundle_name, save_name = self.get_metric_name(file_path)
            logging.info(" file name = " + file_name)
            logging.info("file path = " + file_path)

            pvalues = np.zeros(no_disks)
            warnings.filterwarnings("ignore")
            # The model formula is the same for every disk of this file.
            criteria = file_name + " ~ group"

            # run mixed linear model for every disk
            for i in range(no_disks):
                # `disk_count` is referenced by name inside the `where`
                # expression below, so it must stay a local of this function.
                disk_count = i + 1
                df = pd.read_hdf(file_path, where='disk=disk_count')

                logging.info("read the dataframe for disk number " +
                             str(disk_count))
                # check if data has significant data to perform LMM
                if len(df) < 10:
                    raise ValueError(
                        "Dataset for Linear Mixed Model is too small")

                md = smf.mixedlm(criteria, df, groups=df["subject"])
                mdf = md.fit()

                # Positional access: with a formula the p-values are labelled,
                # and `[1]` on a labelled Series is ambiguous/deprecated.
                pvalues[i] = np.asarray(mdf.pvalues)[1]

            x = list(range(1, len(pvalues) + 1))
            y = -1 * np.log10(pvalues)

            save_file = os.path.join(out_dir, save_name + "_pvalues.npy")
            np.save(save_file, pvalues)

            save_file = os.path.join(out_dir, save_name + "_pvalues_log.npy")
            np.save(save_file, y)

            save_file = os.path.join(out_dir, save_name + ".png")
            self.save_lmm_plot(save_file, file_name, bundle_name, x, y)
import scipy.stats as st
import seaborn as sns
import matplotlib

# Analysis settings: time window, channel group and channel type.
time_window = (0.4, 0.8)
ch_group = "parietal"
ch_type = "grad"

X, meta, times, info = assemble_epochs_new(ch_type=ch_type, baseline=None)

df, data = prepare_erp(times, time_window, ch_group, meta, X, info)

# Rescale the dependent variable depending on the channel type.
# NOTE(review): presumably a unit conversion to keep values near 1 -- confirm.
if ch_type == "mag":
    df.data *= 1e14
elif ch_type == "grad":
    df.data *= 1e12

# Condition as fixed effect; `groups` is given as a column name of df.
md = smf.mixedlm("data ~ cond", data=df, groups="subj")
mdf = md.fit()
print(mdf.summary())

legend = []
times_sel = times[np.logical_and(times >= time_window[0],
                                 times < time_window[1])]
# One curve per condition: mean over the first two axes of `data`.
for cond in np.unique(df.cond):
    plt.plot(times, data[df.cond == cond, :, :].mean(axis=(0, 1)))
    legend.append(cond)
plt.legend(legend)
plt.show()

font = {'family': 'normal', 'weight': 'bold', 'size': 22}

matplotlib.rc('font', **font)

# QQ-plot shows that our data are not normal. Something to think about
print(auction_data.columns) # %% viol_plot = sns.catplot(x='auctionStimValue', y='chosenAuctionAmount', hue='auctionCondition', data=auction_data, palette="colorblind", kind='violin', height=6, aspect=1.5, legend=False) plt.ylim([0, 500]) viol_plot.ax.legend(loc=2, fontsize=18) # %% ### Fit model to data md = smf.mixedlm("chosenAuctionAmount ~ auctionStimValue*auctionCondition", auction_data, groups=auction_data["subcode"]) mdf = md.fit() print(mdf.summary()) # %% ## Diagnostic plots auction_data['mixedlm_resid'] = mdf.resid sns.pairplot(auction_data)
info, psd_params) print(freqs) if ch_type == "mag": df.data *= 1e14**2 elif ch_type == "grad": df.data *= 1e12**2 # df.data = np.log(df.data) # # df.data = stats.boxcox(df.data)[0] # md = smf.mixedlm("data ~ C(condition, Treatment('HIGH HIT'))", data=df, groups="subject", # # re_formula="~condition" # ) md = smf.mixedlm( "data ~ condition", data=df, groups="subject", # re_formula="~condition" ) mdf = md.fit() print(mdf.summary()) # df = df[df.data < 8] # md = smf.mixedlm("data ~ confidence * is_correct", data=df, # # re_formula="~ 0 + confidence", # groups="subject") # mdf = md.fit(method="powell") # print(mdf.summary()) dd = df.copy() d_low = psd[dd.condition == "LOW HIT", :, :].mean(axis=(0, 1))
] # Stat prep d_var = "TonAng" # aicd_thresh = 5 def aic_pval(a, b): return np.exp( (a - b) / 2) # calculates the evidence ratio for 2 aic values # Null model print("Analyses for {}".format(d_var)) model = "{dv} ~ 1".format(dv=d_var) res_0 = smf.mixedlm('{}'.format(model), data=NEMO, groups=NEMO['Subject']).fit(reml=False) print("Null model AIC = ", res_0.aic) null_aic = res_0.aic last_aic = res_0.aic # for deltas and comparisons # Experimental variables print("Testing Experiment Variables..") print("TON") model = "{dv} ~ {t}".format(dv=d_var, t=ton) res_ton = smf.mixedlm('{}'.format(model), data=NEMO, groups=NEMO['Subject']).fit(reml=False) ton_aic = res_ton.aic print("Ton model results -- AIC = ", ton_aic, ", AIC_delta = ", null_aic - ton_aic, ", AIC_p = ", aic_pval(ton_aic, null_aic)) print("EMO") model = "{dv} ~ {e}".format(dv=d_var, e=emo)
# Long-format table: one row per subject x task x condition.
df_2way_rm = pd.DataFrame(columns=["sub_id", "task", "condition", "my_value"])
my_row = 0
# unique subject-ID as additional factor
sub_id = 0
for sub in subs_list:
    sub_id = sub_id + 1
    for ind_t, task in enumerate(task_list):
        for ind_c, con in enumerate(condition_list):
            # generate random value here as example
            # (normal draw with mean ind_t + ind_c and standard deviation 1)
            my_val = np.random.normal(ind_t + ind_c, 1, 1)[0]
            df_2way_rm.loc[my_row] = [sub_id, task, con, my_val]
            my_row = my_row + 1

# conduct ANOVA using mixedlm
my_model_fit = smf.mixedlm("my_value ~ task * condition", df_2way_rm, groups=df_2way_rm["sub_id"]).fit()
# get fixed effects
# (bare expression: only displayed in an interactive session)
my_model_fit.summary()
# get random effects
my_model_fit.random_effects

# conduct ANOVA using AnovaRM
# NOTE: this rebinds my_model_fit, replacing the mixedlm result above.
my_model_fit = AnovaRM(df_2way_rm, 'my_value', 'sub_id', within=['task', 'condition']).fit()
print(my_model_fit.anova_table)

# --------
# 4-way ANOVA with between-group and within-group factors (repeated measures)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 10 11:29:35 2021

@author: zachz
"""
#%% Imports

import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.io as spio
import pandas as pd

#%% Load in data

# use .spydata file

#%% make into one big dataframe

# Firing rate, brain area and species as fixed effects, with a random
# intercept per dataset.
big_model = smf.mixedlm("tau ~ fr + brain_area + species", df, groups=df['dataset'])

fit = big_model.fit()

# The summary lives on the fitted result, not on the unfitted model object.
print(fit.summary())
#pip install "plotnine==0.6.0" tu use ggplot in python ### weight = [61, 100, 56, 113, 99, 103, 75, 62, ## sire 1 75, 102, 95, 103, 98, 115, 98, 94, ## sire 2 58, 60, 60, 57, 57, 59, 54, 100, ## sire 3 57, 56, 67, 59, 58, 121, 101, 101, ## sire 4 59, 46, 120, 115, 115, 93, 105, 75 ] ## sire 5 sire=np.array([1,2,3,4,5]) sire=np.repeat(sire,8, axis=0) animals = {'weight': weight, 'sire': pd.Categorical(sire)} animals = pd.DataFrame(data=animals) animals.info() # plot sb.stripplot(x="sire", y="weight" ,data=animals, size=10, edgecolor='red', linewidth=0.5, ax=None, dodge=True, hue="sire") ## md = smf.mixedlm("weight ~(1-sire)", animals, groups="sire" ) mdf = md.fit() mdf.summary() mdf.conf_int(alpha=0.025) from patsy.contrasts import Sum levels = [1,2,3,4,5] contrast = Sum().code_without_intercept(levels) aov = ols('weight ~ C(sire, Sum)',data=animals).fit() table = sm.stats.anova_lm(aov, typ=2) # Type 2 ANOVA DataFrame print(table) aov.conf_int(alpha=0.05) randomeffect=mdf.random_effects randomeffect ### TA plot x=np.array(mdf.fittedvalues) y=np.array(mdf.resid)
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 14 15:34:16 2015

@author: d
"""

import statsmodels.api as sm
import statsmodels.formula.api as smf

#data = sm.datasets.get_rdataset("dietox", "geepack").data

# NOTE(review): `meta` is not defined in the visible part of this script; it
# must provide the columns `i`, `deprivation` and `Genre`.
# Mixed model of `i` on `deprivation`, grouped by `Genre`.
md = smf.mixedlm("i ~ deprivation", meta, groups=meta["Genre"])
mdf = md.fit()
# Call form of print: valid on Python 3 and, with a single argument,
# equivalent to the old print statement on Python 2.
print(mdf.summary())
# Upper cut-off for CV: third quartile of the control CV values plus 1.5 times
# their inter-quartile range.
Threshold = (ControlData.quantile(0.75)['CV'] - ControlData.quantile(0.25)['CV']) * 1.5 \
            + ControlData.quantile(0.75)['CV']
# Lower cut-off for BVTV (evaluates to roughly 0.067).
MinBVTV = 0.1 / 3 * 2

# 04 Filter data
# Keep rows whose CV is below the threshold and whose BVTV is above the minimum.
CVFilter = Data['CV'] < Threshold
Windowing = Data['BVTV'] > MinBVTV
FilteredData = Data[Windowing & CVFilter]

# 05 Build and fit the model
# No intercept ("-1"); groups are taken from 'Subject id', with one variance
# component formula each for Sii, Sij and Sjj.
Model = smf.mixedlm(
    "LogSxy ~ Sii + Sij + Sjj + LogBVTV + Logmxy + LogCV:(Sii+Sij+Sjj) -1",
    data=FilteredData, groups=FilteredData['Subject id'],
    vc_formula={
        "Sii": "Sii-1",
        "Sij": "Sij-1",
        "Sjj": "Sjj-1"
    })
Model_Fit = Model.fit()
print(Model_Fit.summary())

# 06 Verify residuals
# NOTE(review): `QQPlot` is a helper defined outside this excerpt.
QQPlot(Model_Fit.resid.values)
Figure, Axes = plt.subplots(1, 1, figsize=(5.5, 4.5), dpi=100)
# Residuals against the exponentiated fitted values (the response is a log
# quantity, going by its name `LogSxy`).
Axes.plot(np.exp(Model_Fit.fittedvalues), Model_Fit.resid, linestyle='none',
# Convert to dataframes df = pd.DataFrame(distances[face][fid]).T df, param_names = append_design(design_file, df) individ_dfs.append(df) df_params = pd.concat(individ_dfs, ignore_index=True) save_name = \ os.path.join(output_path, "{0}_face{1}_params.csv".format(set_name, face)) df_params.to_csv(save_name) model = "*".join(param_names) result_tflux = sm.mixedlm(formula="dist_tflux ~ {}".format(model), data=df_params, groups=df_params["Cube"]).fit() save_name = \ os.path.join(output_path, "{0}_face{1}_tflux_fit.pkl".format(set_name, face)) result_tflux.save(save_name) result_pflux = sm.mixedlm(formula="dist_pflux ~ {}".format(model), data=df_params, groups=df_params["Cube"]).fit() save_name = \ os.path.join(output_path, "{0}_face{1}_pflux_fit.pkl".format(set_name, face)) result_pflux.save(save_name) result_sigma = sm.mixedlm(formula="dist_sigma ~ {}".format(model),
res_types = ['Trav_res', 'FP_res', 'Liu_res', 'Liu_pred_res'] # Specifiy models. models = ['Travasarou', 'Foulser-Piggott', 'Liu', 'Liu_&_GMPE'] Avg_Bias = [] Tau = [] Phi = [] for res_type in res_types: # Select event IDs. df2 = df[['USGS_eventID', res_type]] mod = smf.mixedlm(" ".join([res_type, '~', '1']), df2, groups=df["USGS_eventID"]) mod_fit = mod.fit() # Summary summary = mod_fit.summary # Random effects rdm_effects = mod_fit.random_effects # Fixed effects fxd_effects = mod_fit.fe_params.Intercept Avg_Bias.append(fxd_effects) # Random effect standard deviation (tau) tau = np.array(np.sqrt(mod_fit.cov_re))
def mixedlm(formula, table, metadata, groups, **kwargs):
    """ Set up linear mixed effects models, one per balance.

    Linear mixed effects (LME) models estimate the parameters of a linear
    regression that contains both fixed and random effects.  They are
    commonly used for repeated measures, where several samples are
    collected from the same source.

    Every column of `table` (a balance) is used in turn as the response of
    its own model.  The covariates come from `metadata`, and the column
    named by `groups` identifies the source each sample was taken from.

    Parameters
    ----------
    formula : str
        Right-hand side of the statistical equation, written in the
        R-like syntax understood by `patsy` [1]_.  The dependent variable
        must not be included, because each balance is inserted as the
        response automatically.
    table : pd.DataFrame
        Table of balances where samples correspond to rows and balances
        correspond to columns.
    metadata : pd.DataFrame
        Sample information where samples correspond to rows and
        covariates correspond to columns.  It is joined to `table` on the
        index, so only samples present in both are kept.
    groups : str
        Column name in `metadata` that specifies the groups.  These are
        often individuals that were sampled repeatedly, typically
        longitudinally.
    **kwargs : dict
        Forwarded unchanged to `statsmodels.formula.api.mixedlm`.

    Returns
    -------
    LMEModel
        Container created with ``Y=table``.  Its `submodels` attribute
        holds one statsmodels mixed model per column of `table` (they are
        constructed here but not fitted), and its `balances` attribute
        holds `table`.

    Raises
    ------
    ValueError
        If `table` and `metadata` have no sample names in common.

    References
    ----------
    .. [1] https://patsy.readthedocs.io/en/latest/

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from gneiss.regression import mixedlm

    Here, we will define a table of balances with features `Y1`, `Y2`
    across 12 samples.

    >>> table = pd.DataFrame({
    ...   'u1': [ 1.00000053,  6.09924644],
    ...   'u2': [ 0.99999843,  7.0000045 ],
    ...   'u3': [ 1.09999884,  8.08474053],
    ...   'x1': [ 1.09999758,  1.10000349],
    ...   'x2': [ 0.99999902,  2.00000027],
    ...   'x3': [ 1.09999862,  2.99998318],
    ...   'y1': [ 1.00000084,  2.10001257],
    ...   'y2': [ 0.9999991 ,  3.09998418],
    ...   'y3': [ 0.99999899,  3.9999742 ],
    ...   'z1': [ 1.10000124,  5.0001796 ],
    ...   'z2': [ 1.00000053,  6.09924644],
    ...   'z3': [ 1.10000173,  6.99693644]},
    ...   index=['Y1', 'Y2']).T

    Next come the external variables to test for in the model: a
    hypothetical longitudinal study across 3 time points with 4 patients
    `x`, `y`, `z` and `u`, where `x` and `y` were given treatment `1` and
    `z` and `u` were given treatment `2`.

    >>> metadata = pd.DataFrame({
    ...         'patient': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
    ...         'treatment': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
    ...         'time': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
    ...     }, index=['x1', 'x2', 'x3', 'y1', 'y2', 'y3',
    ...               'z1', 'z2', 'z3', 'u1', 'u2', 'u3'])

    Since each patient was sampled repeatedly, the patients are passed as
    the groups.  `time` and `treatment` are tested simultaneously with
    respect to each balance.

    >>> res = mixedlm('time + treatment', table, metadata,
    ...               groups='patient')

    See Also
    --------
    statsmodels.regression.mixed_linear_model.MixedLM
    ols

    """
    # The caller's metadata is left untouched: the helper receives a copy.
    covariates = _type_cast_to_float(metadata.copy())

    # Index-on-index join; samples missing from either side drop out.
    merged = pd.merge(table, covariates, left_index=True, right_index=True)
    if not len(merged):
        raise ValueError(("No more samples left. Check to make sure that "
                          "the sample names between `metadata` and `table` "
                          "are consistent"))

    # mixed effects code is obtained here:
    # http://stackoverflow.com/a/22439820/1167475
    # One model per balance, with the balance name as the formula's response.
    submodels = [
        smf.mixedlm('%s ~ %s' % (balance, formula), data=merged,
                    groups=merged[groups], **kwargs)
        for balance in table.columns
    ]

    # ugly hack to get around the statsmodels object: the container is
    # created first and the per-balance models are attached afterwards.
    model = LMEModel(Y=table, Xs=None)
    model.submodels = submodels
    model.balances = table
    return model
def mix_strain(data, feature, print_opt=True,
               nstrain=3, search_range=(3, 12), degree=1):
    """
    Fit linear mixed models to the aggregate data and compare them.

    Two models are fitted with `mouse` as the grouping (random effect)
    variable: a full model that contains hour-by-strain interaction terms
    and a reduced model without them.  The strains enter through the dummy
    variables `strain0` and `strain1`.  The two fits are compared with a
    likelihood ratio test.

    Parameters
    ----------
    data: data frame
        Output from the aggregate_data function.  The columns `strain`,
        `hour`, `mouse` and the column named by `feature` are read.  The
        frame passed in is not modified.
    feature: {"AS", "F", "IS", "M_AS", "M_IS", "W", "Distance"}
        Name of the response column; must be a member of ALL_FEATURES.
    print_opt: True or False
        Whether to print the summary of both fitted models.
    nstrain: positive integer
        Number of strains for which a cycle is looked up.  The dummy
        coding below always reads three strain columns.
    search_range: array contains two elements
        Passed to `find_cycle` as `search_range_find`.
    degree: positive integer
        Power of `hour` stored in the `hour2` column.  `hour2` only enters
        the formulas when `degree` is not 1.

    Returns
    -------
    float
        Likelihood ratio test p value.  If it is below the significance
        level, we can conclude that the different strains have
        significantly different time patterns.  The regression summaries
        (coefficients, t statistics and p values) are printed, not
        returned, when `print_opt` is true.

    Raises
    ------
    ValueError
        If `data` is not a pandas data frame or `feature` is not in
        ALL_FEATURES.

    Examples
    --------
    >>> result = mix_strain(data = aggregate_data("F",30), feature = "F",
    ...                     print_opt = False, degree = 2)  # doctest: +SKIP
    """
    if not isinstance(data, pd.DataFrame):
        raise ValueError(
            'Data must be a pandas data frame')
    if feature not in ALL_FEATURES:
        raise ValueError(
            'Input value must in {"AS", "F", "M_AS", "M_IS", "W", "Distance"}')

    # Work on a copy so the caller's frame does not silently gain the helper
    # columns (cycle, strain0..2, hour2) created below.
    data = data.copy()

    # Per-strain cycle length; the lookup always uses feature "W".
    data["cycle"] = 0
    for i in range(nstrain):
        result = find_cycle(feature="W", strain=i, plot=False,
                            search_range_find=search_range)
        cycle = result[0][0]
        data.loc[data["strain"] == i, "cycle"] = cycle

    # Dummy-code the strain.  Positional `.iloc` replaces the removed `.ix`
    # indexer, and the cast keeps the columns numeric on pandas versions
    # whose get_dummies returns booleans.
    b = pd.get_dummies(data["strain"])
    data["strain0"] = b.iloc[:, 0].astype(int)
    data["strain1"] = b.iloc[:, 1].astype(int)
    data["strain2"] = b.iloc[:, 2].astype(int)
    data["hour2"] = np.array(data["hour"].values)**degree
    data = data.drop('strain', axis=1)

    # Rename the response column so the formulas can refer to it as `feature`.
    names = data.columns.tolist()
    names[names.index(feature)] = 'feature'
    data.columns = names

    # NOTE(review): for degree != 1 the full model has no `cycle` term while
    # the reduced model does, so the two are not nested, and the full model
    # adds four interaction terms although the test below uses df=2 --
    # confirm the intended model pair.
    if degree == 1:
        full_formula = ("feature ~ hour + strain0 + strain1 + cycle "
                        "+ strain0*hour + strain1*hour")
        reduced_formula = "feature ~ hour + cycle + strain0 + strain1"
    else:
        full_formula = ("feature ~ hour + hour2 + strain0 + strain1 + "
                        "strain0*hour+ strain1*hour + strain0*hour2+ "
                        "strain1*hour2")
        reduced_formula = ("feature ~ hour + hour2 + cycle + strain0 + "
                           "strain1")

    # NOTE(review): both fits use the statsmodels default (REML); likelihood
    # ratio tests on fixed effects are normally based on ML fits
    # (`fit(reml=False)`) -- confirm before relying on the p value.
    md1 = smf.mixedlm(full_formula, data, groups=data["mouse"])
    mdf1 = md1.fit()
    like1 = mdf1.llf
    if print_opt:
        print(mdf1.summary())

    md2 = smf.mixedlm(reduced_formula, data, groups=data["mouse"])
    mdf2 = md2.fit()
    like2 = mdf2.llf
    if print_opt:
        print(mdf2.summary())

    # Likelihood ratio statistic.  The p value is the upper-tail probability
    # of the chi-square distribution (survival function), not its density.
    fstat = 2 * abs(like1 - like2)
    p_v = chi2.sf(fstat, df=2)
    return p_v