def fixed_effect_3level_model(dataframe):
    """
    Fit the three-level model with fixed-effect predictors for log science.

    The model contains an intercept, five predictors entered as fixed
    effects and a random intercept written as ``(1 | SchoolID/CountryID)``.
    It prints the summary, draws the summary plot and two diagnostic
    scatter plots.

    :param dataframe: a data frame with student ID, school ID, country ID,
        science, math, reading, and other five selected variables as
        predictors.
    :return: the fitted model
    """
    # NOTE(review): in lme4 syntax ``a/b`` nests b within a, so this nests
    # CountryID within SchoolID -- confirm that is the intended hierarchy.
    fixed_terms = 'IBTEACH + WEALTH + ESCS + female + Sch_science_resource'
    formula = 'log_science ~ ' + fixed_terms + ' + (1 | SchoolID/CountryID)'
    fitted = Lmer(formula, data=dataframe)

    # Estimates only exist once the model has been fitted.
    fitted.fit(REML=False)

    # Print the summary explicitly, then draw the summary plot.
    print(fitted.summary())
    fitted.plot_summary()

    fitted_data = fitted.data
    # Residuals against the school science resource predictor.
    sns.regplot(x='Sch_science_resource',
                y='residuals',
                data=fitted_data,
                fit_reg=False)
    # Overall fit: fitted values against the observed outcome.
    sns.regplot(x='fits',
                y='log_science',
                units='CountryID',
                data=fitted_data,
                fit_reg=True)
    return fitted
def random_effect_2level_model(dataframe):
    """
    Fit the two-level random intercept and slope model on country level.

    ``female`` is a fixed effect and ``female*ESCS`` varies by CountryID.
    The summary is printed and the summary plot, the random-effect plot for
    ``female`` and two diagnostic scatter plots are drawn.

    :param dataframe: a data frame with student ID, school ID, country ID,
        science, math, reading, and other five selected variables as
        predictors.
    :return: the fitted model
    """
    # NOTE(review): the outcome here is 'Science' while the plots below use
    # 'log_science' -- confirm which outcome is intended.
    formula = 'Science ~ female + (female*ESCS | CountryID)'
    fitted = Lmer(formula, data=dataframe)

    # Estimates only exist once the model has been fitted.
    fitted.fit(REML=False)

    # Print the summary explicitly, then draw the summary plot.
    print(fitted.summary())
    fitted.plot_summary()

    # Random effect of the predictor, with confidence intervals.
    fitted.plot('female', plot_ci=True, ylabel='Predicted log_science')

    fitted_data = fitted.data
    sns.regplot(x='female', y='residuals', data=fitted_data, fit_reg=False)
    # Overall fit: fitted values against log_science.
    sns.regplot(x='fits',
                y='log_science',
                units='CountryID',
                data=fitted_data,
                fit_reg=True)
    return fitted
def run_linear_mixed_model_for_initialization(Y, G, cov, z):
    """
    Fit one linear mixed model per column of ``Y`` and collect residuals.

    For test ``t`` the model is ``y ~ g + x0 + ... + xK + (1|z)`` where
    ``y = Y[:, t]``, ``g = G[:, t]`` and ``xk = cov[:, k]``.

    :param Y: 2-D array indexed as ``Y[:, t]``; one column per test.
    :param G: 2-D array indexed as ``G[:, t]``; one column per test.
    :param cov: 2-D array indexed as ``cov[:, k]``; one column per covariate.
    :param z: grouping values used for the random intercept
        (assumed to have one entry per row of ``Y`` -- TODO confirm).
    :return: array holding the residuals of test ``t`` in column ``t``.
    """
    num_tests = Y.shape[1]
    # The covariate names and the formula do not depend on the test, so
    # they are built once.
    cov_names = ['x' + str(cov_num) for cov_num in range(cov.shape[1])]
    model_eq = 'y ~ g' + ''.join(' + ' + name for name in cov_names)
    model_eq = model_eq + ' + (1|z)'

    residuals = []
    for test_number in range(num_tests):
        print(test_number)
        y_vec = Y[:, test_number]
        g_vec = G[:, test_number]
        dd = {'y': y_vec, 'z': z, 'g': g_vec}
        for cov_num, name in enumerate(cov_names):
            dd[name] = cov[:, cov_num]
        df = pd.DataFrame(dd)

        model = Lmer(model_eq, data=df)
        model.fit()
        # The leftover pdb.set_trace() was removed: it stopped the run at
        # every test.
        residuals.append(model.residuals)

        # Diagnostic: mean over std of the genotype-scaled residuals.
        scaled = model.residuals / g_vec
        print(np.mean(scaled) / np.std(scaled))
        print('\n')

    return np.transpose(np.asarray(residuals))
def mixeff_multinteraction2level_model(dataframe):
    """
    Fit the two-level model with several gender interactions.

    The model has an intercept, five fixed-effect predictors, the
    interactions of ``female`` with ESCS, WEALTH and IBTEACH, and ESCS as a
    random effect on country level. The summary is printed and the summary
    plot, the random-effect plot for ESCS and two diagnostic scatter plots
    are drawn.

    :param dataframe: a data frame with student ID, school ID, country ID,
        science, math, reading, and other five selected variables as
        predictors.
    :return: the fitted model
    """
    main_effects = 'IBTEACH + WEALTH + ESCS + female + Sch_science_resource'
    interactions = 'female*ESCS + female*WEALTH + female*IBTEACH'
    formula = ('log_science ~ ' + main_effects + ' + ' + interactions +
               ' + (ESCS | CountryID)')
    fitted = Lmer(formula, data=dataframe)

    # Estimates only exist once the model has been fitted.
    fitted.fit(REML=False)

    # Print the summary explicitly, then draw the summary plot.
    print(fitted.summary())
    fitted.plot_summary()

    # Random effect of the predictor, with confidence intervals.
    fitted.plot('ESCS', plot_ci=True, ylabel='Predicted log_science')

    fitted_data = fitted.data
    sns.regplot(x='ESCS', y='residuals', data=fitted_data, fit_reg=False)
    # Overall fit: fitted values against the observed outcome.
    sns.regplot(x='fits',
                y='log_science',
                units='CountryID',
                data=fitted_data,
                fit_reg=True)
    return fitted
def random_intercept_3level_model(dataframe):
    """
    Fit the intercept-only three-level model for log science scores.

    The formula is ``log_science ~ 1 | SchoolID/CountryID``. The summary is
    printed, the summary plot is drawn and the fitted values are plotted
    against the outcome.

    :param dataframe: a data frame with student ID, school ID, country ID,
        science, math, reading, and other five selected variables as
        predictors.
    :return: the fitted model
    """
    formula = 'log_science ~ 1 | SchoolID/CountryID'
    fitted = Lmer(formula, data=dataframe)

    # Estimates only exist once the model has been fitted.
    fitted.fit(REML=False)

    # Print the summary explicitly, then draw the summary plot.
    print(fitted.summary())
    fitted.plot_summary()

    # Overall fit: fitted values against the observed outcome.
    sns.regplot(x='fits',
                y='log_science',
                units='CountryID',
                data=fitted.data,
                fit_reg=True)
    return fitted
def test_logistic_lmm():
    """Fit a binomial mixed model on the sample data and check its outputs."""
    sample_path = os.path.join(get_resource_path(), 'sample_data.csv')
    sample = pd.read_csv(sample_path)
    glmm = Lmer('DV_l ~ IV1+ (IV1|Group)', data=sample, family='binomial')
    glmm.fit(summarize=False)

    # Coefficient table shape and estimates
    assert glmm.coefs.shape == (2, 13)
    expected = np.array([-0.16098421, 0.00296261])
    assert np.allclose(glmm.coefs['Estimate'], expected, atol=.001)

    # Group-level estimates and random effects
    group_estimates = glmm.fixef
    assert isinstance(group_estimates, pd.core.frame.DataFrame)
    assert group_estimates.shape == (47, 2)

    group_effects = glmm.ranef
    assert isinstance(group_effects, pd.core.frame.DataFrame)
    assert group_effects.shape == (47, 2)

    assert np.allclose(glmm.coefs.loc[:, 'Estimate'],
                       glmm.fixef.mean(),
                       atol=.01)

    # Predictions with random effects reproduce the stored fits
    predicted = glmm.predict(glmm.data, use_rfx=True)
    assert np.allclose(predicted, glmm.data.fits)

    predicted_link = glmm.predict(glmm.data, use_rfx=True, pred_type='link')
    assert np.allclose(predicted_link, logit(glmm.data.fits))
def run_bootstrapped_eqtl_lmm_stability_one_test(expression, genotype,
                                                 covariates, individuals,
                                                 individual_to_cells,
                                                 num_bootstraps,
                                                 sampling_fraction):
    """
    Refit the eQTL mixed model on bootstrapped samples and collect the betas.

    The model is ``y ~ g + cov0 + ... + (1 | group)``. For each bootstrap,
    rows are drawn with ``get_bootstrapped_indices`` and the estimate in
    position 1 of the coefficient table (the term after the intercept) is
    stored.

    :param expression: values used as the ``y`` column.
    :param genotype: values used as the ``g`` column.
    :param covariates: 2-D array, one column per covariate.
    :param individuals: values used as the ``group`` column.
    :param individual_to_cells: passed through to
        ``get_bootstrapped_indices``.
    :param num_bootstraps: number of bootstrap refits.
    :param sampling_fraction: passed through to
        ``get_bootstrapped_indices``.
    :return: 1-D numpy array with one estimate per bootstrap.
    """
    num_cov = covariates.shape[1]
    # Stack y, group, g and the covariates as columns of one matrix.
    X = np.vstack((expression, individuals, genotype, covariates.T)).T
    cov_names = ['cov' + str(i) for i in range(num_cov)]
    col_names = ['y', 'group', 'g'] + cov_names
    df = pd.DataFrame(X, columns=col_names)

    # Same formula with or without covariates.
    formula = 'y ~ ' + ' + '.join(['g'] + cov_names) + ' + (1 | group)'

    bootstrapped_betas = []
    for bootstrap_num in range(num_bootstraps):
        print(bootstrap_num)
        indices = get_bootstrapped_indices(individuals, individual_to_cells,
                                           sampling_fraction)
        model = Lmer(formula, data=df.iloc[indices, :])
        model.fit()
        # Explicit positional access: an integer key on a label-indexed
        # Series relies on a deprecated pandas fallback.
        bootstrapped_betas.append(model.coefs['Estimate'].iloc[1])
    return np.asarray(bootstrapped_betas)
def run_dynamic_eqtl_one_test_lmm(expression, genotype, covariates, groups,
                                  environmental_variable):
    """
    Fit a genotype-by-environment mixed model and return one p-value.

    The model is ``y ~ g + e + gXe + cov0 + ... + (1 | group)`` where
    ``gXe`` is ``environmental_variable * genotype``. The returned value is
    the ``P-val`` entry in position 3 of the coefficient table, which is the
    ``gXe`` term given the formula order (intercept, g, e, gXe).

    :param expression: values used as the ``y`` column.
    :param genotype: values used as the ``g`` column.
    :param covariates: 2-D array, one column per covariate.
    :param groups: values used as the ``group`` column.
    :param environmental_variable: values used as the ``e`` column.
    :return: p-value of the interaction term.
    """
    num_cov = covariates.shape[1]
    # Stack y, group, g, e, gXe and the covariates as columns of one matrix.
    X = np.vstack((expression, groups, genotype, environmental_variable,
                   environmental_variable * genotype, covariates.T)).T
    cov_names = ['cov' + str(i) for i in range(num_cov)]
    col_names = ['y', 'group', 'g', 'e', 'gXe'] + cov_names
    df = pd.DataFrame(X, columns=col_names)

    # Same formula with or without covariates.
    fixed_terms = ['g', 'e', 'gXe'] + cov_names
    formula = 'y ~ ' + ' + '.join(fixed_terms) + ' + (1 | group)'

    model = Lmer(formula, data=df)
    model.fit()
    # Explicit positional access: an integer key on a label-indexed Series
    # relies on a deprecated pandas fallback.
    return model.coefs['P-val'].iloc[3]
def test_gaussian_lmm():
    """Fit a gaussian model with two random-effect terms and check outputs."""
    sample_path = os.path.join(get_resource_path(), 'sample_data.csv')
    sample = pd.read_csv(sample_path)
    lmm = Lmer('DV ~ IV3 + IV2 + (IV2|Group) + (1|IV3)', data=sample)
    lmm.fit(summarize=False)

    # Coefficient table shape and estimates
    assert lmm.coefs.shape == (3, 8)
    expected = np.array([12.04334602, -1.52947016, 0.67768509])
    assert np.allclose(lmm.coefs['Estimate'], expected, atol=.001)

    # One frame per grouping term
    assert isinstance(lmm.fixef, list)
    assert lmm.fixef[0].shape == (47, 3)
    assert lmm.fixef[1].shape == (3, 3)

    assert isinstance(lmm.ranef, list)
    assert lmm.ranef[0].shape == (47, 2)
    assert lmm.ranef[1].shape == (3, 1)

    assert lmm.ranef_corr.shape == (1, 3)
    assert lmm.ranef_var.shape == (4, 3)

    group_means = lmm.fixef[0].mean()
    assert np.allclose(lmm.coefs.loc[:, 'Estimate'], group_means, atol=.01)

    # Predictions with random effects reproduce the stored fits
    predicted = lmm.predict(lmm.data, use_rfx=True)
    assert np.allclose(predicted, lmm.data.fits)
def test_gaussian_lmm():
    """Fit a gaussian model with optimizer options and check its outputs."""
    sample_path = os.path.join(get_resource_path(), "sample_data.csv")
    sample = pd.read_csv(sample_path)
    lmm = Lmer("DV ~ IV3 + IV2 + (IV2|Group) + (1|IV3)", data=sample)
    control = "optimizer='Nelder_Mead', optCtrl = list(FtolAbs=1e-8, XtolRel=1e-8)"
    lmm.fit(summarize=False, control=control)

    # Coefficient table shape and estimates
    assert lmm.coefs.shape == (3, 8)
    expected = np.array([12.04334602, -1.52947016, 0.67768509])
    assert np.allclose(lmm.coefs["Estimate"], expected, atol=0.001)

    # One frame per grouping term
    assert isinstance(lmm.fixef, list)
    assert lmm.fixef[0].shape == (47, 3)
    assert lmm.fixef[1].shape == (3, 3)

    assert isinstance(lmm.ranef, list)
    assert lmm.ranef[0].shape == (47, 2)
    assert lmm.ranef[1].shape == (3, 1)

    assert lmm.ranef_corr.shape == (1, 3)
    assert lmm.ranef_var.shape == (4, 3)

    group_means = lmm.fixef[0].mean()
    assert np.allclose(lmm.coefs.loc[:, "Estimate"], group_means, atol=0.01)

    # Predictions with random effects reproduce the stored fits
    predicted = lmm.predict(lmm.data, use_rfx=True)
    assert np.allclose(predicted, lmm.data.fits)

    # Smoke tests: simulate must run with and without random effects
    lmm.simulate(2)
    lmm.simulate(2, use_rfx=True)

    # Smoke test: refitting with the old optimizer must run
    lmm.fit(summarize=False, old_optimizer=True)
def test_glmer_opt_passing():
    """Loose optimizer settings passed to a poisson fit must yield warnings."""
    np.random.seed(1)
    sample_path = os.path.join(get_resource_path(), "sample_data.csv")
    sample = pd.read_csv(sample_path)
    n_rows = sample.shape[0]
    sample["DV_int"] = np.random.randint(1, 10, n_rows)

    glmm = Lmer("DV_int ~ IV3 + (1|Group)", data=sample, family="poisson")
    control = "optCtrl = list(FtolAbs=1e-1, FtolRel=1e-1, maxfun=10)"
    glmm.fit(summarize=False, control=control)

    assert len(glmm.warnings) >= 1
def test_gamma_lmm():
    """Fit a gamma-family model on a seeded uniform outcome."""
    np.random.seed(1)
    sample_path = os.path.join(get_resource_path(), 'sample_data.csv')
    sample = pd.read_csv(sample_path)
    n_rows = sample.shape[0]
    sample['DV_g'] = np.random.uniform(1, 2, size=n_rows)

    glmm = Lmer('DV_g ~ IV3 + (1|Group)', data=sample, family='gamma')
    glmm.fit(summarize=False)

    assert glmm.family == 'gamma'
    assert glmm.coefs.shape == (2, 7)
def test_anova():
    """The ANOVA table of the interaction model has the expected shape."""
    np.random.seed(1)
    sample_path = os.path.join(get_resource_path(), "sample_data.csv")
    sample = pd.read_csv(sample_path)
    n_rows = sample.shape[0]
    sample["DV_l2"] = np.random.randint(0, 4, n_rows)

    lmm = Lmer("DV ~ IV3*DV_l2 + (IV3|Group)", data=sample)
    lmm.fit(summarize=False)

    anova_table = lmm.anova()
    assert anova_table.shape == (3, 7)
def test_inverse_gaussian_lmm():
    """Fit an inverse-gaussian-family model on a seeded uniform outcome."""
    np.random.seed(1)
    sample_path = os.path.join(get_resource_path(), "sample_data.csv")
    sample = pd.read_csv(sample_path)
    n_rows = sample.shape[0]
    sample["DV_g"] = np.random.uniform(1, 2, size=n_rows)

    glmm = Lmer("DV_g ~ IV3 + (1|Group)",
                data=sample,
                family="inverse_gaussian")
    glmm.fit(summarize=False)

    assert glmm.family == "inverse_gaussian"
    assert glmm.coefs.shape == (2, 7)
def test_poisson_lmm():
    """Fit a poisson-family model on a seeded integer outcome."""
    np.random.seed(1)
    sample_path = os.path.join(get_resource_path(), 'sample_data.csv')
    sample = pd.read_csv(sample_path)
    n_rows = sample.shape[0]
    sample['DV_int'] = np.random.randint(1, 10, n_rows)

    glmm = Lmer('DV_int ~ IV3 + (1|Group)', data=sample, family='poisson')
    glmm.fit(summarize=False)

    assert glmm.family == 'poisson'
    assert glmm.coefs.shape == (2, 7)
    # The coefficient table must contain a Z-stat column
    assert 'Z-stat' in glmm.coefs.columns
def run_bootstrapped_eqtl_stability_with_residuals_one_test_v2(
        expression, genotype, covariates, individuals, individual_to_cells,
        num_bootstraps, sampling_fraction, seed):
    """
    Fit the eQTL mixed model once and test its residuals against genotype.

    The model is ``y ~ g + cov0 + ... + (1 | group)``. The function returns
    the ``P-val`` entry in position 1 of the coefficient table (the term
    after the intercept) together with element 3 of the
    ``het_breuschpagan`` result computed from the model residuals and the
    genotype column.

    ``individual_to_cells``, ``num_bootstraps`` and ``sampling_fraction``
    are accepted for interface compatibility but are not used: the
    bootstrap loop that consumed them is disabled.

    :param expression: values used as the ``y`` column
        (assumed 1-D numeric -- TODO confirm).
    :param genotype: values used as the ``g`` column
        (assumed 1-D numeric -- TODO confirm).
    :param covariates: 2-D array, one column per covariate.
    :param individuals: array of group labels; converted to strings.
    :param individual_to_cells: unused.
    :param num_bootstraps: unused.
    :param sampling_fraction: unused.
    :param seed: seed passed to ``np.random.seed``.
    :return: tuple ``(eqtl_pvalue, bp_test[3])``.
    """
    np.random.seed(seed)

    num_cov = covariates.shape[1]
    cov_names = ['cov' + str(i) for i in range(num_cov)]

    # Build the frame column by column. Stacking the string group labels
    # with the numeric data in one array would turn every column, including
    # y and g, into strings.
    columns = {
        'y': np.asarray(expression),
        'group': individuals.astype(str),
        'g': np.asarray(genotype),
    }
    for i, name in enumerate(cov_names):
        columns[name] = covariates[:, i]
    df = pd.DataFrame(columns)

    # Same formula with or without covariates.
    formula = 'y ~ ' + ' + '.join(['g'] + cov_names) + ' + (1 | group)'

    model = Lmer(formula, data=df)
    model.fit()
    # Explicit positional access: an integer key on a label-indexed Series
    # relies on a deprecated pandas fallback.
    eqtl_pvalue = model.coefs['P-val'].iloc[1]

    # np.vstack turns the 1-D genotype into a single-column matrix.
    bp_test = het_breuschpagan(model.residuals, np.vstack(genotype))
    # The leftover pdb.set_trace() was removed: it stopped every call just
    # before the return.
    return eqtl_pvalue, bp_test[3]
def test_lmer_opt_passing():
    """Tight tolerances fit cleanly; loose tolerances produce warnings."""
    formula = "DV ~ IV2 + (IV2|Group)"

    # Tight tolerances: expected estimates and no warnings
    sample = pd.read_csv(os.path.join(get_resource_path(), "sample_data.csv"))
    tight_fit = Lmer(formula, data=sample)
    tight_control = "optCtrl = list(ftol_abs=1e-8, xtol_abs=1e-8)"
    tight_fit.fit(summarize=False, control=tight_control)
    expected = np.array([10.301072, 0.682124])
    assert np.allclose(tight_fit.coefs["Estimate"], expected, atol=0.001)
    assert len(tight_fit.warnings) == 0

    # Loose tolerances on a fresh copy of the data: at least one warning
    sample = pd.read_csv(os.path.join(get_resource_path(), "sample_data.csv"))
    loose_fit = Lmer(formula, data=sample)
    loose_control = "optCtrl = list(ftol_abs=1e-4, xtol_abs=1e-4)"
    loose_fit.fit(summarize=False, control=loose_control)
    assert len(loose_fit.warnings) >= 1
def test_post_hoc():
    """Post-hoc tables have the expected row counts for one and two factors."""
    np.random.seed(1)
    sample_path = os.path.join(get_resource_path(), "sample_data.csv")
    sample = pd.read_csv(sample_path)

    lmm = Lmer("DV ~ IV1*IV3*DV_l + (IV1|Group)",
               data=sample,
               family="gaussian")
    factor_levels = {"IV3": ["0.5", "1.0", "1.5"], "DV_l": ["0", "1"]}
    lmm.fit(factors=factor_levels, summarize=False)

    # Single factor with a p-value adjustment
    means, pairwise = lmm.post_hoc(marginal_vars="IV3", p_adjust="dunnet")
    assert means.shape[0] == 3
    assert pairwise.shape[0] == 3

    # Two factors
    means, pairwise = lmm.post_hoc(marginal_vars=["IV3", "DV_l"])
    assert means.shape[0] == 6
    assert pairwise.shape[0] == 15
def get_tvals(measure, features, reverse=False):
    """
    Build a table of t-statistics relating each measure to each network stat.

    For every pair of a column name in ``measure`` and a column name in the
    module-level ``stats``, a mixed model ``measure_stat ~ net_stat`` with a
    random intercept per topic (and per method when ``features`` holds more
    than one method) is fitted and the t-statistic of ``net_stat`` stored.
    A NaN t-statistic is replaced by 0 and a warning is printed.

    :param measure: sequence of column names in ``features`` used as
        outcomes.
    :param features: data frame with 'userID', 'topic', 'method' and the
        measure and network-stat columns.
    :param reverse: unused; kept for interface compatibility.
    :return: data frame of t-statistics with one row per entry of ``stats``
        and one column per entry of ``measure``.
    """
    t_matrix = np.zeros((len(measure), len(stats)))

    # The model structure depends only on whether methods can be compared,
    # so the formula is chosen once.
    method_count = len(set(features['method']))
    if method_count > 1:
        formula = 'measure_stat ~ net_stat + (1 | topic ) + (1 | method)'
    else:
        formula = 'measure_stat ~ net_stat + (1 | topic )'

    for measure_index, measure_stat in enumerate(measure):
        for net_index, net_stat in enumerate(stats):
            # Smaller frame with fixed column names for the formula.
            df = features[['userID', 'topic', 'method', measure_stat,
                           net_stat]]
            df = df.rename(columns={
                measure_stat: 'measure_stat',
                net_stat: 'net_stat'
            })

            model = Lmer(formula, data=df)
            model.fit(no_warnings=True, summarize=False)

            t_val = model.coefs['T-stat']['net_stat']
            if np.isnan(t_val):
                t_val = 0
                # The placeholders were previously printed unfilled.
                print('Warning: no t_val found for measure %s, feature %s. '
                      'Correlation estimated at 0.' %
                      (measure_stat, net_stat))
            t_matrix[measure_index][net_index] = t_val

    return pd.DataFrame(t_matrix.T, index=stats, columns=measure)
def test_install():
    """
    Check the installation by importing Lmer and fitting a small model.

    Prints a success message when the model fits, otherwise prints the
    error that was raised. Nothing is returned and no exception escapes.
    """
    try:
        import os
        import warnings

        import pandas as pd
        from pymer4.models import Lmer
        from pymer4.utils import get_resource_path

        # Silence warnings for the rest of the process.
        warnings.filterwarnings("ignore")

        sample_path = os.path.join(get_resource_path(), 'sample_data.csv')
        sample = pd.read_csv(sample_path)
        quick_model = Lmer('DV ~ IV3 + (1|Group)', data=sample)
        quick_model.fit(summarize=False)
        print("Pymer4 installation working successfully!")
    except Exception as err:
        print("Error! {}".format(err))
def test_post_hoc():
    """Post-hoc tables have the expected row counts for one and two factors."""
    np.random.seed(1)
    sample_path = os.path.join(get_resource_path(), 'sample_data.csv')
    sample = pd.read_csv(sample_path)

    lmm = Lmer('DV ~ IV1*IV3*DV_l + (IV1|Group)',
               data=sample,
               family='gaussian')
    factor_levels = {'IV3': ['0.5', '1.0', '1.5'], 'DV_l': ['0', '1']}
    lmm.fit(factors=factor_levels, summarize=False)

    # Single factor with a p-value adjustment
    means, pairwise = lmm.post_hoc(marginal_vars='IV3', p_adjust='dunnet')
    assert means.shape[0] == 3
    assert pairwise.shape[0] == 3

    # Two factors
    means, pairwise = lmm.post_hoc(marginal_vars=['IV3', 'DV_l'])
    assert means.shape[0] == 6
    assert pairwise.shape[0] == 15
def test_logistic_lmm():
    """Fit binomial mixed models, including random-effects-only formulas."""
    sample_path = os.path.join(get_resource_path(), "sample_data.csv")
    sample = pd.read_csv(sample_path)
    glmm = Lmer("DV_l ~ IV1+ (IV1|Group)", data=sample, family="binomial")
    glmm.fit(summarize=False)

    # Coefficient table shape and estimates
    assert glmm.coefs.shape == (2, 13)
    expected = np.array([-0.16098421, 0.00296261])
    assert np.allclose(glmm.coefs["Estimate"], expected, atol=0.001)

    # Group-level estimates and random effects
    assert isinstance(glmm.fixef, pd.core.frame.DataFrame)
    assert glmm.fixef.shape == (47, 2)
    assert isinstance(glmm.ranef, pd.core.frame.DataFrame)
    assert glmm.ranef.shape == (47, 2)

    group_means = glmm.fixef.mean()
    assert np.allclose(glmm.coefs.loc[:, "Estimate"], group_means, atol=0.01)

    # Predictions with random effects reproduce the stored fits
    predicted = glmm.predict(glmm.data, use_rfx=True)
    assert np.allclose(predicted, glmm.data.fits)

    predicted_link = glmm.predict(glmm.data, use_rfx=True, pred_type="link")
    assert np.allclose(predicted_link, logit(glmm.data.fits))

    # Random effects only, one grouping term
    rfx_only = Lmer("DV_l ~ 0 + (IV1|Group)", data=sample, family="binomial")
    rfx_only.fit(summarize=False)
    assert rfx_only.fixef.shape == (47, 2)

    # Random effects only, two grouping terms
    rfx_only = Lmer("DV_l ~ 0 + (IV1|Group) + (1|IV3)",
                    data=sample,
                    family="binomial")
    rfx_only.fit(summarize=False)
    assert isinstance(rfx_only.fixef, list)
    assert rfx_only.fixef[0].shape == (47, 2)
    assert rfx_only.fixef[1].shape == (3, 2)
def test_poisson_lmm():
    """Fit poisson mixed models, including random-effects-only formulas."""
    np.random.seed(1)
    sample_path = os.path.join(get_resource_path(), "sample_data.csv")
    sample = pd.read_csv(sample_path)
    n_rows = sample.shape[0]
    sample["DV_int"] = np.random.randint(1, 10, n_rows)

    glmm = Lmer("DV_int ~ IV3 + (1|Group)", data=sample, family="poisson")
    glmm.fit(summarize=False)
    assert glmm.family == "poisson"
    assert glmm.coefs.shape == (2, 7)
    assert "Z-stat" in glmm.coefs.columns

    # Random effects only, one grouping term
    rfx_only = Lmer("DV_int ~ 0 + (IV1|Group)", data=sample, family="poisson")
    rfx_only.fit(summarize=False)
    assert rfx_only.fixef.shape == (47, 2)

    # Random effects only, two grouping terms
    rfx_only = Lmer("DV_int ~ 0 + (IV1|Group) + (1|IV3)",
                    data=sample,
                    family="poisson")
    rfx_only.fit(summarize=False)
    assert isinstance(rfx_only.fixef, list)
    assert rfx_only.fixef[0].shape == (47, 2)
    assert rfx_only.fixef[1].shape == (3, 2)
def test_contrasts():
    """Coefficients match group-mean contrasts under three factor codings."""
    gammas = sns.load_dataset("gammas").rename(
        columns={"BOLD signal": "bold"})
    roi_means = gammas.groupby("ROI")["bold"].mean()
    lmm = Lmer("bold ~ ROI + (1|subject)", data=gammas)

    # Reference values computed directly from the per-ROI means
    mean_ag = roi_means["AG"]
    mean_ips = roi_means["IPS"]
    mean_v1 = roi_means["V1"]
    custom_contrast = mean_ag - np.mean([mean_ips, mean_v1])
    grand_mean = roi_means.mean()
    v1_vs_ips = mean_v1 - mean_ips
    ag_vs_ips = mean_ag - mean_ips
    intercept = mean_ips

    # Treatment contrasts with non-alphabetic level order
    lmm.fit(factors={"ROI": ["IPS", "V1", "AG"]}, summarize=False)
    assert np.allclose(lmm.coefs.loc["(Intercept)", "Estimate"], intercept)
    assert np.allclose(lmm.coefs.iloc[1, 0], v1_vs_ips)
    assert np.allclose(lmm.coefs.iloc[2, 0], ag_vs_ips)

    # Polynomial contrasts; reference values come from R
    lmm.fit(factors={"ROI": ["IPS", "V1", "AG"]},
            ordered=True,
            summarize=False)
    assert np.allclose(lmm.coefs.loc["(Intercept)", "Estimate"], grand_mean)
    assert np.allclose(lmm.coefs.iloc[1, 0], 0.870744)
    assert np.allclose(lmm.coefs.iloc[2, 0], 0.609262)

    # Custom contrast weights
    custom_weights = {"AG": 1, "IPS": -0.5, "V1": -0.5}
    lmm.fit(factors={"ROI": custom_weights}, summarize=False)
    assert np.allclose(lmm.coefs.loc["(Intercept)", "Estimate"], grand_mean)
    assert np.allclose(lmm.coefs.iloc[1, 0], custom_contrast)
# Three binomial mixed models of base_atom_order with random intercepts for
# language family and subfamily: UID only, RIG only, and both predictors.
model_uid = Lmer(
    "base_atom_order ~ 1.0 + uid_b_a_logit + (1.0|language_family) + (1.0|Subfamily)",
    data=df_uid,
    family="binomial")
model_rig = Lmer(
    "base_atom_order ~ 1.0 + rig_b_a_logit + (1.0|language_family) + (1.0|Subfamily)",
    data=df_uid,
    family="binomial")
model_total = Lmer(
    "base_atom_order ~ 1.0 + uid_b_a_logit + rig_b_a_logit + (1.0|language_family) + (1.0|Subfamily)",
    data=df_uid,
    family="binomial")
#model = Lmer("base_atom_order ~ rig_b_a_prob + (rig_b_a_prob|language_family) + (rig_b_a_prob|Subfamily)", data=df)

model_uid_fit = model_uid.fit()
model_rig_fit = model_rig.fit()
model_total_fit = model_total.fit()
print(model_total_fit)
# plot_summary is a method of the model; fit() is expected to return the
# summary table rather than the model (confirm against the pymer4 docs), so
# the plot is drawn from model_total itself.
model_total.plot_summary()
# NOTE(review): deliberate stop left in place -- everything below is
# unreachable until this assert is removed.
assert False
#table = anova_lm(model_uid.model_obj, model_rig.model_obj)
#print(table)
#assert False

# NOTE(review): model_rig was built on df_uid but predicts on df_rig here --
# confirm this is intended.
model_preds_uid = model_uid.predict(df_uid)
model_preds_rig = model_rig.predict(df_rig)
error_rig = model_preds_rig - df_rig["base_atom_order"]
error_uid = model_preds_uid - df_uid["base_atom_order"]
# import basic libraries and sample data import os import pandas as pd from pymer4.utils import get_resource_path from pymer4.models import Lmer # IV3 is a categorical predictors with 3 levels in the sample data df = pd.read_csv(os.path.join(get_resource_path(), "sample_data.csv")) # # We're going to fit a multi-level regression using the # categorical predictor (IV3) which has 3 levels model = Lmer("DV ~ IV3 + (1|Group)", data=df) # Using dummy-coding; suppress summary output model.fit(factors={"IV3": ["1.0", "0.5", "1.5"]}, summarize=False) # Get ANOVA table print(model.anova()) ################################################################################ # Type III SS inferences will only be valid if data are fully balanced across levels or if contrasts between levels are orthogonally coded and sum to 0. Below we tell :code:`pymer4` to respecify our contrasts to ensure this before estimating the ANOVA. :code:`pymer4` also saves the last set of contrasts used priory to forcing orthogonality. # # Because the sample data is balanced across factor levels and there are not interaction terms, in this case orthogonal contrast coding doesn't change the results. # Get ANOVA table, but this time force orthogonality # for valid SS III inferences # In this case the data are balanced so nothing changes print(model.anova(force_orthogonal=True)) ################################################################################
def test_simulate_lmm():
    """
    Simulate multi-level data and check it against the requested parameters.

    Checks the shape of the simulated data, the per-group structure, the
    returned coefficients and BLUPs, and that a model fitted to the data
    recovers the parameters within the stated tolerances.
    """
    # Simulate some data
    num_obs = 50
    num_coef = 3
    num_grps = 100
    mus = [10.0, 30.0, 2.0]
    coef_vals = [4.0, 1.8, -2, 10]
    corrs = 0.15
    data, blups, b = simulate_lmm(
        num_obs,
        num_coef,
        num_grps,
        coef_vals=coef_vals,
        mus=mus,
        corrs=corrs,
        noise_params=(0, 0.25),
        seed=4,
    )

    # Check data shape (add 2 for DV and group columns)
    assert data.shape == (num_obs * num_grps, num_coef + 2)

    # Check group shapes
    group_data = data.groupby("Group")
    assert group_data.ngroups == num_grps
    assert (group_data.apply(
        lambda grp: grp.shape == (num_obs, num_coef + 2))).all()

    # Check coefficients are as specified
    assert np.allclose(b, coef_vals)

    # Check blups are close to population values
    # True - Generated < .25
    # This comparison previously had no ``assert`` and so checked nothing.
    assert np.allclose(coef_vals, blups.mean(axis=0), atol=0.25)

    # Check column means within groups, i.e. random intercepts
    # True - Generated < 1.1
    assert (group_data.apply(lambda grp: np.allclose(
        grp.iloc[:, 1:-1].mean(axis=0), mus, atol=1.1))).all()

    # Check correlations within group
    # True - Generated < .5
    def grp_corr(grp):
        """Return the upper-triangle predictor correlations of one group."""
        corr = grp.iloc[:, 1:-1].corr().values
        return corr[np.triu_indices(corr.shape[0], k=1)]

    assert (group_data.apply(
        lambda grp: (np.abs(grp_corr(grp) - corrs) < 0.5).all())).all()

    # Model simulated data
    m = Lmer("DV ~ IV1+IV2+IV3 + (IV1+IV2+IV3|Group)", data=data)
    m.fit(summarize=False)

    # Check random effects variance
    # True - Generated < .25
    assert np.allclose(m.ranef_var.iloc[1:-1, -1], corrs, atol=0.25)

    # Check parameter recovery
    # True - Recovered < .15 for params and < 1 for intercept
    assert (np.abs(m.coefs.iloc[1:, 0] - b[1:]) < 0.15).all()
    assert (np.abs(m.coefs.iloc[0, 0] - b[0]) < 1).all()

    # Check BLUP recovery
    # mean(True - Generated) < .5 (sigma)
    assert np.abs((m.fixef.values - blups.values).ravel()).mean() < 0.5
def run_models(
        model_data=r'C:\Users\K1774755\Downloads\phd\mmse_rebecca\mmse_synthetic_data_20190919.xlsx',
        to_predict='score_combined',
        key='brcid',
        covariates=None,
        timestamps=('score_date_centered', ),
        complete_case=False,
        models=('linear_rdn_int', 'linear_rdn_all_no_intercept',
                'linear_rdn_all', 'quadratic_rdn_int'),
        output_file_path=None):
    """
    Fit a set of mixed models per patient group and timestamp variable.

    For every value of ``patient_diagnosis_super_class``, every timestamp
    and every model type, a formula is built with ``lmer_formula`` and
    fitted with REML. If the fit reports warnings it is refitted with
    ``REML=False``. Results are written to an Excel file when
    ``output_file_path`` is given, otherwise collected and returned.

    :param model_data: data frame, or a path containing 'xlsx' that is read
        with ``pd.read_excel``.
    :param to_predict: name of the outcome column.
    :param key: name of the patient identifier column.
    :param covariates: iterable of covariate column names, or None.
    :param timestamps: iterable of timestamp column names.
    :param complete_case: when True, placeholder strings for unknown values
        are replaced by NaN and rows with a missing covariate are dropped.
    :param models: iterable of model type names understood by
        ``lmer_formula``.
    :param output_file_path: path of the Excel output file; a timestamp is
        inserted before '.xlsx'. None means results are returned instead.
    :return: a list with one result frame per fitted model when no output
        file is given (an empty list when results were written to Excel),
        or a one-row failure frame when a covariate column is missing.
    """
    if isinstance(model_data, str) and 'xlsx' in model_data:
        # load regression data
        model_data = pd.read_excel(model_data, index_col=None)

    covariate_cols = list(covariates) if covariates is not None else []
    # check covariates actually exist in the model data
    if not all(elem in model_data.columns for elem in covariate_cols):
        print('covariates entered do not exist in input data')
        return pd.DataFrame(
            {'output': 'failure - covariates not in input data'}, index=[0])

    if complete_case:
        print('all cases:', len(model_data), 'observations, ',
              len(model_data[key].unique()), 'patients')
        model_data = model_data.replace({
            'not known': np.nan,
            'Not Known': np.nan,
            'unknown': np.nan,
            'Unknown': np.nan,
            '[nan-nan]': np.nan
        })
        # Without covariates there is no column to require, and
        # list(None) used to raise here.
        if covariate_cols:
            model_data = model_data.dropna(subset=covariate_cols, how='any')
        print('only complete cases:', len(model_data), 'observations, ',
              len(model_data[key].unique()), 'patients')

    writer = None
    if output_file_path is not None:
        st = datetime.datetime.fromtimestamp(
            time.time()).strftime('%Y%m%d-%Hh%M')
        writer = pd.ExcelWriter(output_file_path.replace(
            '.xlsx', st + '.xlsx'),
                                engine='xlsxwriter')

    res = []
    col_num = 0
    for patient_group in list(
            model_data.patient_diagnosis_super_class.unique()):
        df_tmp = model_data[model_data.patient_diagnosis_super_class == patient_group] \
            if patient_group != 'all' else model_data
        row_num = 0
        last_width = None
        for ts in timestamps:
            for m in models:
                print('running model:', m, '(patient group:', patient_group,
                      ', timestamp:', ts, ')')
                formula = lmer_formula(model_type=m,
                                       regressor=to_predict,
                                       timestamp=ts,
                                       covariates=covariates,
                                       group=key)
                print('using formula', formula)
                model = Lmer(formula, data=df_tmp)
                try:
                    model.fit(REML=True)
                    # An empty warnings list is "not None" but means the
                    # fit was clean, so test truthiness instead.
                    if model.warnings:
                        # try unrestricted MLE if convergence failed
                        model.fit(REML=False)
                    to_print = print_r_model_output(model)
                except Exception as exc:
                    # Catch Exception rather than everything so Ctrl-C still
                    # stops the run.
                    print('something went wrong with model fitting', exc)
                    to_print = pd.DataFrame({'output': 'failure'}, index=[0])
                to_print = pd.concat([to_print],
                                     keys=[patient_group],
                                     names=[m])
                if writer is not None:
                    # Results of one group are stacked vertically.
                    to_print.to_excel(writer,
                                      startrow=row_num,
                                      startcol=col_num)
                    row_num += 2 + len(to_print)
                else:
                    # list.append returns None; rebinding ``res`` to it
                    # crashed on the second result.
                    res.append(to_print)
                last_width = to_print.shape[1]
        # The next patient group starts to the right of this one.
        if writer is not None and last_width is not None:
            col_num += last_width + 3

    if writer is not None:
        # ExcelWriter.save() no longer exists in pandas 2; close() writes
        # the file.
        writer.close()
    return res
x0 = linregress(np.linspace(0, 1, 30), curve).intercept #x0 = curve[:15].mean() curve = curve/x0 - 1 y_df = y_df.append(pd.DataFrame({'metric_type':metric_type, 'fb_type': fb_type, 'subj_id': 's'+str(subj_id), 'channel': ch, 'k': np.linspace(0, 1, 30), 'env': curve+0.0001, 'band': band}), ignore_index=True) from pymer4.models import Lm, Lmer from pymer4.utils import get_resource_path for b, band in enumerate(['alpha', 'beta', 'theta']): for c, ch in enumerate(CHANNELS): for m, metric_type in enumerate(['magnitude', 'n_spindles', 'duration', 'amplitude']): data = y_df.query('metric_type=="{}" & channel=="{}" & band=="{}"'.format(metric_type, ch, band)) model = Lmer('env ~ k:fb_type + (1 |subj_id)', data=data, ) model.fit(factors={'fb_type': ['FB0', 'FB250', 'FB500', 'FBMock']}) a = model.post_hoc('k', 'fb_type')[1] a['channel'] = ch a['metric_type'] = metric_type a['band'] = band a['P-val-full'] = stats.t.sf(a['T-stat'], 9) if c==0 and m==0 and b==0: all_stats_df = a.copy() else: all_stats_df = all_stats_df.append(a, ignore_index=True) print(ch, metric_type) from mne.stats import fdr_correction data = np.zeros((3, 6, 4, 32))
def test_gaussian_lmm():
    """Fit gaussian models with different random-effect structures."""
    sample_path = os.path.join(get_resource_path(), "sample_data.csv")
    sample = pd.read_csv(sample_path)
    control = "optimizer='Nelder_Mead', optCtrl = list(FtolAbs=1e-8, XtolRel=1e-8)"
    group_ids = sample.Group.unique()
    iv3_levels = sample.IV3.unique()

    lmm = Lmer("DV ~ IV3 + IV2 + (IV2|Group) + (1|IV3)", data=sample)
    lmm.fit(summarize=False, control=control)

    # Coefficient table shape and estimates
    assert lmm.coefs.shape == (3, 8)
    expected = np.array([12.04334602, -1.52947016, 0.67768509])
    assert np.allclose(lmm.coefs["Estimate"], expected, atol=0.001)

    # One frame per grouping term, indexed by that term's levels
    assert isinstance(lmm.fixef, list)
    assert (lmm.fixef[0].index.astype(int) == group_ids).all()
    assert (lmm.fixef[1].index.astype(float) == iv3_levels).all()
    assert lmm.fixef[0].shape == (47, 3)
    assert lmm.fixef[1].shape == (3, 3)

    assert isinstance(lmm.ranef, list)
    assert lmm.ranef[0].shape == (47, 2)
    assert lmm.ranef[1].shape == (3, 1)
    assert (lmm.ranef[1].index == ["0.5", "1", "1.5"]).all()

    assert lmm.ranef_corr.shape == (1, 3)
    assert lmm.ranef_var.shape == (4, 3)

    group_means = lmm.fixef[0].mean()
    assert np.allclose(lmm.coefs.loc[:, "Estimate"], group_means, atol=0.01)

    # Predictions with random effects reproduce the stored fits
    predicted = lmm.predict(lmm.data, use_rfx=True)
    assert np.allclose(predicted, lmm.data.fits)

    # Simulate returns one column per requested draw
    n_rows = lmm.data.shape[0]
    simulated = lmm.simulate(2)
    assert isinstance(simulated, pd.DataFrame)
    assert simulated.shape == (n_rows, 2)

    simulated = lmm.simulate(2, use_rfx=True)
    assert isinstance(simulated, pd.DataFrame)
    assert simulated.shape == (n_rows, 2)

    # Smoke test: refitting with the old optimizer must run
    lmm.fit(summarize=False, old_optimizer=True)

    # Single random-effects term: fixef is one frame
    lmm = Lmer("DV ~ IV3 + IV2 + (IV2|Group)", data=sample)
    lmm.fit(summarize=False, control=control)
    assert (lmm.fixef.index.astype(int) == group_ids).all()
    assert lmm.fixef.shape == (47, 3)
    assert np.allclose(lmm.coefs.loc[:, "Estimate"],
                       lmm.fixef.mean(),
                       atol=0.01)

    # No fixed-effect predictors, two random-effects terms
    lmm = Lmer("DV ~ (IV2|Group) + (1|IV3)", data=sample)
    lmm.fit(summarize=False, control=control)
    assert isinstance(lmm.fixef, list)
    assert (lmm.fixef[0].index.astype(int) == group_ids).all()
    assert (lmm.fixef[1].index.astype(float) == iv3_levels).all()
    assert lmm.fixef[0].shape == (47, 2)
    assert lmm.fixef[1].shape == (3, 2)