def test_formulas(self): np.random.seed(2410) exog = np.random.normal(size=(300, 4)) exog_re = np.random.normal(size=300) groups = np.kron(np.arange(100), [1, 1, 1]) g_errors = exog_re * np.kron(np.random.normal(size=100), [1, 1, 1]) endog = exog.sum(1) + g_errors + np.random.normal(size=300) mod1 = MixedLM(endog, exog, groups, exog_re) # test the names assert_(mod1.data.xnames == ["x1", "x2", "x3", "x4"]) assert_(mod1.data.exog_re_names == ["x_re1"]) assert_(mod1.data.exog_re_names_full == ["x_re1 RE"]) rslt1 = mod1.fit() # Fit with a formula, passing groups as the actual values. df = pd.DataFrame({"endog": endog}) for k in range(exog.shape[1]): df["exog%d" % k] = exog[:, k] df["exog_re"] = exog_re fml = "endog ~ 0 + exog0 + exog1 + exog2 + exog3" re_fml = "0 + exog_re" mod2 = MixedLM.from_formula(fml, df, re_formula=re_fml, groups=groups) assert_(mod2.data.xnames == ["exog0", "exog1", "exog2", "exog3"]) assert_(mod2.data.exog_re_names == ["exog_re"]) assert_(mod2.data.exog_re_names_full == ["exog_re RE"]) rslt2 = mod2.fit() assert_almost_equal(rslt1.params, rslt2.params) # Fit with a formula, passing groups as the variable name. df["groups"] = groups mod3 = MixedLM.from_formula(fml, df, re_formula=re_fml, groups="groups") assert_(mod3.data.xnames == ["exog0", "exog1", "exog2", "exog3"]) assert_(mod3.data.exog_re_names == ["exog_re"]) assert_(mod3.data.exog_re_names_full == ["exog_re RE"]) rslt3 = mod3.fit(start_params=rslt2.params) assert_allclose(rslt1.params, rslt3.params, rtol=1e-4) # Check default variance structure with non-formula model # creation, also use different exog_re that produces a zero # estimated variance parameter. exog_re = np.ones(len(endog), dtype=np.float64) mod4 = MixedLM(endog, exog, groups, exog_re) with warnings.catch_warnings(): warnings.simplefilter("ignore") rslt4 = mod4.fit() from statsmodels.formula.api import mixedlm mod5 = mixedlm(fml, df, groups="groups") assert_(mod5.data.exog_re_names == ["groups"]) assert_(mod5.data.exog_re_names_full == ["groups RE"]) with warnings.catch_warnings(): warnings.simplefilter("ignore") rslt5 = mod5.fit() assert_almost_equal(rslt4.params, rslt5.params)
def test_summary_col(): from statsmodels.iolib.summary2 import summary_col ids = [1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3] x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] # hard coded simulated y # ids = np.asarray(ids) # np.random.seed(123987) # y = x + np.array([-1, 0, 1])[ids - 1] + 2 * np.random.randn(len(y)) y = np.array([ 1.727, -1.037, 2.904, 3.569, 4.629, 5.736, 6.747, 7.020, 5.624, 10.155, 10.400, 17.164, 17.276, 14.988, 14.453 ]) d = {'Y': y, 'X': x, 'IDS': ids} d = pd.DataFrame(d) # provide start_params to speed up convergence sp1 = np.array([-1.26722599, 1.1617587, 0.19547518]) mod1 = MixedLM.from_formula('Y ~ X', d, groups=d['IDS']) results1 = mod1.fit(start_params=sp1) sp2 = np.array([3.48416861, 0.55287862, 1.38537901]) mod2 = MixedLM.from_formula('X ~ Y', d, groups=d['IDS']) results2 = mod2.fit(start_params=sp2) out = summary_col([results1, results2], stars=True) s = ('\n=============================\n Y X \n' '-----------------------------\nGroup Var 0.1955 1.3854 \n' ' (0.6032) (2.7377) \nIntercept -1.2672 3.4842* \n' ' (1.6546) (1.8882) \nX 1.1618*** \n' ' (0.1959) \nY 0.5529***\n' ' (0.2080) \n=============================\n' 'Standard errors in\nparentheses.\n* p<.1, ** p<.05, ***p<.01') assert_equal(str(out), s)
def test_sparse(self): cur_dir = os.path.dirname(os.path.abspath(__file__)) rdir = os.path.join(cur_dir, 'results') fname = os.path.join(rdir, 'pastes.csv') # Dense data = pd.read_csv(fname) vcf = {"cask": "0 + cask"} model = MixedLM.from_formula( "strength ~ 1", groups="batch", re_formula="1", vc_formula=vcf, data=data) result = model.fit() # Sparse model2 = MixedLM.from_formula( "strength ~ 1", groups="batch", re_formula="1", vc_formula=vcf, use_sparse=True, data=data) result2 = model2.fit() assert_allclose(result.params, result2.params) assert_allclose(result.bse, result2.bse)
def test_sparse(self): import scipy v = scipy.__version__.split(".")[1] v = int(v) if v < 16: return cur_dir = os.path.dirname(os.path.abspath(__file__)) rdir = os.path.join(cur_dir, "results") fname = os.path.join(rdir, "pastes.csv") # Dense data = pd.read_csv(fname) vcf = {"cask": "0 + cask"} model = MixedLM.from_formula("strength ~ 1", groups="batch", re_formula="1", vc_formula=vcf, data=data) result = model.fit() # Sparse model2 = MixedLM.from_formula( "strength ~ 1", groups="batch", re_formula="1", vc_formula=vcf, use_sparse=True, data=data ) result2 = model2.fit() assert_allclose(result.params, result2.params) assert_allclose(result.bse, result2.bse)
def calcBetaLme(data_full, gain_full, loss_full, linear_full, quad_full,
                run_group, thrshd=None):
    """
    Calculate beta parameters with a mixed-effects model.

    Input: data from the BOLD file, lists of gain and loss regressor values,
    linear and quadratic drift terms, a dummy variable indicating the run
    groups, and an optional threshold to identify the voxels inside the brain.

    Output: beta coefficients, the corresponding p-values, and the
    convergence information, one row per voxel.
    """
    T = data_full.shape[-1]
    time_by_vox = np.reshape(data_full, (-1, T)).T
    beta = np.empty([time_by_vox.shape[1], 5])
    fml = "bold ~ gain + loss"
    for k in np.arange(0, time_by_vox.shape[1]):
        # Voxels whose mean signal falls below the threshold are treated as
        # outside the brain and get zero coefficients.
        if thrshd is not None and np.mean(time_by_vox[:, k]) <= thrshd:
            beta[k, :] = [0, 0, 0, 0, 0]
        else:
            dt = pd.DataFrame({'gain': gain_full, 'loss': loss_full,
                               'run_group': run_group, 'ldrift': linear_full,
                               'qdrift': quad_full, 'bold': time_by_vox[:, k]})
            mod_lme = MixedLM.from_formula(fml, dt, groups=dt["run_group"])
            lme_result = mod_lme.fit()
            beta[k, :] = [lme_result.fe_params["gain"],
                          lme_result.pvalues["gain"],
                          lme_result.fe_params["loss"],
                          lme_result.pvalues["loss"],
                          lme_result.converged]
    return beta
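# A hypothetical usage sketch for calcBetaLme above (not from the original
# analysis): the 4-D "BOLD" array, the regressors, the run structure and the
# threshold of 400 are all synthetic, illustrative values.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
T = 60                                                   # volumes per voxel
data_full = rng.normal(500., 10., size=(2, 2, 2, T))     # tiny 2x2x2 "brain"
gain_full = rng.normal(size=T)
loss_full = rng.normal(size=T)
linear_full = np.linspace(-1, 1, T)
quad_full = linear_full ** 2
run_group = np.repeat(np.arange(3), T // 3)              # three equal runs

betas = calcBetaLme(data_full, gain_full, loss_full, linear_full, quad_full,
                    run_group, thrshd=400)
print(betas.shape)   # (n_voxels, 5): gain beta/p, loss beta/p, converged flag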
def test_handle_missing(): np.random.seed(23423) df = np.random.normal(size=(100, 6)) df = pd.DataFrame(df) df.columns = ["y", "g", "x1", "z1", "c1", "c2"] df["g"] = np.kron(np.arange(50), np.ones(2)) re = np.random.normal(size=(50, 4)) re = np.kron(re, np.ones((2, 1))) df["y"] = re[:, 0] + re[:, 1] * df.z1 + re[:, 2] * df.c1 df["y"] += re[:, 3] * df.c2 + np.random.normal(size=100) df.loc[1, "y"] = np.NaN df.loc[2, "g"] = np.NaN df.loc[3, "x1"] = np.NaN df.loc[4, "z1"] = np.NaN df.loc[5, "c1"] = np.NaN df.loc[6, "c2"] = np.NaN fml = "y ~ x1" re_formula = "1 + z1" vc_formula = {"a": "0 + c1", "b": "0 + c2"} for include_re in False, True: for include_vc in False, True: kwargs = {} dx = df.copy() va = ["y", "g", "x1"] if include_re: kwargs["re_formula"] = re_formula va.append("z1") if include_vc: kwargs["vc_formula"] = vc_formula va.extend(["c1", "c2"]) dx = dx[va].dropna() # Some of these models are severely misspecified with # small n, so produce convergence warnings. Not relevant # to what we are checking here. with warnings.catch_warnings(): warnings.simplefilter("ignore") # Drop missing externally model1 = MixedLM.from_formula(fml, groups="g", data=dx, **kwargs) result1 = model1.fit() # MixeLM handles missing model2 = MixedLM.from_formula(fml, groups="g", data=df, missing='drop', **kwargs) result2 = model2.fit() assert_allclose(result1.params, result2.params) assert_allclose(result1.bse, result2.bse) assert_equal(len(result1.fittedvalues), result1.nobs)
def test_dietox_slopes(self): # dietox data from geepack using random intercepts # # Fit in R using # # library(geepack) # r = lmer(Weight ~ Time + (1 + Time | Pig), data=dietox) # r = lmer(Weight ~ Time + (1 + Time | Pig), REML=FALSE, data=dietox) cur_dir = os.path.dirname(os.path.abspath(__file__)) rdir = os.path.join(cur_dir, 'results') fname = os.path.join(rdir, 'dietox.csv') # REML data = pd.read_csv(fname) model = MixedLM.from_formula("Weight ~ Time", groups="Pig", re_formula="1 + Time", data=data) result = model.fit(method='powell') # fixef(r) assert_allclose(result.fe_params, np.r_[15.738650, 6.939014], rtol=1e-5) # sqrt(diag(vcov(r))) assert_allclose(result.bse[0:2], np.r_[0.5501253, 0.0798254], rtol=1e-3) # attr(VarCorr(r), "sc")^2 assert_allclose(result.scale, 6.03745, rtol=1e-3) # as.numeric(VarCorr(r)[[1]]) assert_allclose(result.cov_re.values.ravel(), np.r_[19.4934552, 0.2938323, 0.2938323, 0.4160620], rtol=1e-1) # logLik(r) assert_allclose(model.loglike(result.params_object), -2217.047, rtol=1e-5) # ML data = pd.read_csv(fname) model = MixedLM.from_formula("Weight ~ Time", groups="Pig", re_formula="1 + Time", data=data) result = model.fit(method='powell', reml=False) # fixef(r) assert_allclose(result.fe_params, np.r_[15.73863, 6.93902], rtol=1e-5) # sqrt(diag(vcov(r))) assert_allclose(result.bse[0:2], np.r_[0.54629282, 0.07926954], rtol=1e-3) # attr(VarCorr(r), "sc")^2 assert_allclose(result.scale, 6.037441, rtol=1e-3) # as.numeric(VarCorr(r)[[1]]) assert_allclose(result.cov_re.values.ravel(), np.r_[19.190922, 0.293568, 0.293568, 0.409695], rtol=1e-2) # logLik(r) assert_allclose(model.loglike(result.params_object), -2215.753, rtol=1e-5)
def test_dietox(self): # dietox data from geepack using random intercepts # # Fit in R using # # library(geepack) # rm = lmer(Weight ~ Time + (1 | Pig), data=dietox) # rm = lmer(Weight ~ Time + (1 | Pig), REML=FALSE, data=dietox) cur_dir = os.path.dirname(os.path.abspath(__file__)) rdir = os.path.join(cur_dir, 'results') fname = os.path.join(rdir, 'dietox.csv') # REML data = pd.read_csv(fname) model = MixedLM.from_formula("Weight ~ Time", groups="Pig", data=data) result = model.fit() # fixef(rm) assert_allclose(result.fe_params, np.r_[15.723523, 6.942505], rtol=1e-5) # sqrt(diag(vcov(rm))) assert_allclose(result.bse[0:2], np.r_[0.78805374, 0.03338727], rtol=1e-5) # attr(VarCorr(rm), "sc")^2 assert_allclose(result.scale, 11.36692, rtol=1e-5) # VarCorr(rm)[[1]][[1]] assert_allclose(result.cov_re, 40.39395, rtol=1e-5) # logLik(rm) assert_allclose(model.loglike(result.params_object), -2404.775, rtol=1e-5) # ML data = pd.read_csv(fname) model = MixedLM.from_formula("Weight ~ Time", groups="Pig", data=data) result = model.fit(reml=False) # fixef(rm) assert_allclose(result.fe_params, np.r_[15.723517, 6.942506], rtol=1e-5) # sqrt(diag(vcov(rm))) assert_allclose(result.bse[0:2], np.r_[0.7829397, 0.0333661], rtol=1e-5) # attr(VarCorr(rm), "sc")^2 assert_allclose(result.scale, 11.35251, rtol=1e-5) # VarCorr(rm)[[1]][[1]] assert_allclose(result.cov_re, 39.82097, rtol=1e-5) # logLik(rm) assert_allclose(model.loglike(result.params_object), -2402.932, rtol=1e-5)
def test_pastes_vcomp(self): """ pastes data from lme4 Fit in R using formula: strength ~ (1|batch) + (1|batch:cask) """ cur_dir = os.path.dirname(os.path.abspath(__file__)) rdir = os.path.join(cur_dir, 'results') fname = os.path.join(rdir, 'pastes.csv') # REML data = pd.read_csv(fname) vcf = {"cask": "0 + cask"} model = MixedLM.from_formula("strength ~ 1", groups="batch", re_formula="1", vc_formula=vcf, data=data) result = model.fit() assert_allclose(result.fe_params.iloc[0], 60.0533, rtol=1e-3) assert_allclose(result.bse.iloc[0], 0.6769, rtol=1e-3) assert_allclose(result.cov_re.iloc[0, 0], 1.657, rtol=1e-3) assert_allclose(result.scale, 0.678, rtol=1e-3) assert_allclose(result.llf, -123.49, rtol=1e-1) assert_equal(result.aic, np.nan) # don't provide aic/bic with REML assert_equal(result.bic, np.nan) resid = np.r_[0.17133538, -0.02866462, -1.08662875, 1.11337125, -0.12093607] assert_allclose(result.resid[0:5], resid, rtol=1e-3) fit = np.r_[62.62866, 62.62866, 61.18663, 61.18663, 62.82094] assert_allclose(result.fittedvalues[0:5], fit, rtol=1e-4) # ML data = pd.read_csv(fname) vcf = {"cask": "0 + cask"} model = MixedLM.from_formula("strength ~ 1", groups="batch", re_formula="1", vc_formula=vcf, data=data) result = model.fit(reml=False) assert_allclose(result.fe_params.iloc[0], 60.0533, rtol=1e-3) assert_allclose(result.bse.iloc[0], 0.642, rtol=1e-3) assert_allclose(result.cov_re.iloc[0, 0], 1.199, rtol=1e-3) assert_allclose(result.scale, 0.67799, rtol=1e-3) assert_allclose(result.llf, -123.997, rtol=1e-1) assert_allclose(result.aic, 255.9944, rtol=1e-3) assert_allclose(result.bic, 264.3718, rtol=1e-3)
def test_vcomp_3(self): # Test a model with vcomp but no other random effects, using formulas. import scipy v = scipy.__version__.split(".")[1] v = int(v) if v < 16: return np.random.seed(4279) x1 = np.random.normal(size=400) groups = np.kron(np.arange(100), np.ones(4)) slopes = np.random.normal(size=100) slopes = np.kron(slopes, np.ones(4)) * x1 y = slopes + np.random.normal(size=400) vc_fml = {"a": "0 + x1"} df = pd.DataFrame({"y": y, "x1": x1, "groups": groups}) model = MixedLM.from_formula("y ~ 1", groups="groups", vc_formula=vc_fml, data=df) result = model.fit() result.summary() assert_allclose(result.resid.iloc[0:4], np.r_[-1.180753, 0.279966, 0.578576, -0.667916], rtol=1e-3) assert_allclose(result.fittedvalues.iloc[0:4], np.r_[-0.101549, 0.028613, -0.224621, -0.126295], rtol=1e-3)
def calcBetaLme(data_full, gain_full, loss_full, linear_full, quad_full, run_group, thrshd): """ function to calculate beta parameters. Input: data from bold file, two list of gain, loss regressor values dummy variable indicating the groups, a threshold to idenfity the voxels inside the brain Output: beta coefficient, the corresponding p-values, the convergence information """ T = data_full.shape[-1] time_by_vox = np.reshape(data_full, (-1, T)).T beta = np.empty([time_by_vox.shape[1], 5]) fml = "bold ~ gain + loss" for k in np.arange(0, time_by_vox.shape[1]): ## set a threshold to idenfity the voxels inside the brain if (np.mean(time_by_vox[:, k]) <= 400): beta[k, :] = [0, 0, 0, 0, 0] else: dt = pd.DataFrame({ 'gain': gain_full, 'loss': loss_full, 'run_group': run_group, 'ldrift': linear_full, 'qdrift': quad_full, 'bold': time_by_vox[:, k] }) mod_lme = MixedLM.from_formula(fml, dt, groups=dt["run_group"]) lme_result = mod_lme.fit() beta[k, :] = [ lme_result.fe_params["gain"], lme_result.pvalues["gain"], lme_result.fe_params["loss"], lme_result.pvalues["loss"], lme_result.converged ] return beta
def test_formulas(self): np.random.seed(2410) exog = np.random.normal(size=(300, 4)) exog_re = np.random.normal(size=300) groups = np.kron(np.arange(100), [1, 1, 1]) g_errors = exog_re * np.kron(np.random.normal(size=100), [1, 1, 1]) endog = exog.sum(1) + g_errors + np.random.normal(size=300) mod1 = MixedLM(endog, exog, groups, exog_re) rslt1 = mod1.fit() df = pd.DataFrame({"endog": endog}) for k in range(exog.shape[1]): df["exog%d" % k] = exog[:, k] df["exog_re"] = exog_re fml = "endog ~ 0 + exog0 + exog1 + exog2 + exog3" re_fml = "0 + exog_re" mod2 = MixedLM.from_formula(fml, df, re_formula=re_fml, groups=groups) rslt2 = mod2.fit() assert_almost_equal(rslt1.params, rslt2.params) # Check default variance structure, with formula.api exog_re = np.ones(len(endog), dtype=np.float64) mod3 = MixedLM(endog, exog, groups, exog_re) rslt3 = mod3.fit() from statsmodels.formula.api import mixedlm mod4 = mixedlm(fml, df, groups=groups) rslt4 = mod4.fit() assert_almost_equal(rslt3.params, rslt4.params)
def test_vcomp_2(self): # Simulated data comparison to R np.random.seed(6241) n = 1600 exog = np.random.normal(size=(n, 2)) groups = np.kron(np.arange(n / 16), np.ones(16)) # Build up the random error vector errors = 0 # The random effects exog_re = np.random.normal(size=(n, 2)) slopes = np.random.normal(size=(n // 16, 2)) slopes = np.kron(slopes, np.ones((16, 1))) * exog_re errors += slopes.sum(1) # First variance component subgroups1 = np.kron(np.arange(n / 4), np.ones(4)) errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4)) # Second variance component subgroups2 = np.kron(np.arange(n / 2), np.ones(2)) errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2)) # iid errors errors += np.random.normal(size=n) endog = exog.sum(1) + errors df = pd.DataFrame(index=range(n)) df["y"] = endog df["groups"] = groups df["x1"] = exog[:, 0] df["x2"] = exog[:, 1] df["z1"] = exog_re[:, 0] df["z2"] = exog_re[:, 1] df["v1"] = subgroups1 df["v2"] = subgroups2 # Equivalent model in R: # df.to_csv("tst.csv") # model = lmer(y ~ x1 + x2 + (0 + z1 + z2 | groups) + (1 | v1) + (1 | # v2), df) vcf = {"a": "0 + C(v1)", "b": "0 + C(v2)"} model1 = MixedLM.from_formula("y ~ x1 + x2", groups=groups, re_formula="0+z1+z2", vc_formula=vcf, data=df) result1 = model1.fit() # Compare to R assert_allclose(result1.fe_params, [ 0.16527, 0.99911, 0.96217], rtol=1e-4) assert_allclose(result1.cov_re, [ [1.244, 0.146], [0.146, 1.371]], rtol=1e-3) assert_allclose(result1.vcomp, [4.024, 3.997], rtol=1e-3) assert_allclose(result1.bse.iloc[0:3], [ 0.12610, 0.03938, 0.03848], rtol=1e-3)
def test_vcomp_formula(self): np.random.seed(6241) n = 800 exog = np.random.normal(size=(n, 2)) exog[:, 0] = 1 ex_vc = [] groups = np.kron(np.arange(n / 4), np.ones(4)) errors = 0 exog_re = np.random.normal(size=(n, 2)) slopes = np.random.normal(size=(n // 4, 2)) slopes = np.kron(slopes, np.ones((4, 1))) * exog_re errors += slopes.sum(1) ex_vc = np.random.normal(size=(n, 4)) slopes = np.random.normal(size=(n // 4, 4)) slopes[:, 2:] *= 2 slopes = np.kron(slopes, np.ones((4, 1))) * ex_vc errors += slopes.sum(1) errors += np.random.normal(size=n) endog = exog.sum(1) + errors exog_vc = {"a": {}, "b": {}} for k, group in enumerate(range(int(n / 4))): ix = np.flatnonzero(groups == group) exog_vc["a"][group] = ex_vc[ix, 0:2] exog_vc["b"][group] = ex_vc[ix, 2:] with pytest.warns(UserWarning, match="Using deprecated variance"): model1 = MixedLM(endog, exog, groups, exog_re=exog_re, exog_vc=exog_vc) result1 = model1.fit() df = pd.DataFrame(exog[:, 1:], columns=["x1"]) df["y"] = endog df["re1"] = exog_re[:, 0] df["re2"] = exog_re[:, 1] df["vc1"] = ex_vc[:, 0] df["vc2"] = ex_vc[:, 1] df["vc3"] = ex_vc[:, 2] df["vc4"] = ex_vc[:, 3] vc_formula = {"a": "0 + vc1 + vc2", "b": "0 + vc3 + vc4"} model2 = MixedLM.from_formula("y ~ x1", groups=groups, re_formula="0 + re1 + re2", vc_formula=vc_formula, data=df) result2 = model2.fit() assert_allclose(result1.fe_params, result2.fe_params, rtol=1e-8) assert_allclose(result1.cov_re, result2.cov_re, rtol=1e-8) assert_allclose(result1.vcomp, result2.vcomp, rtol=1e-8) assert_allclose(result1.params, result2.params, rtol=1e-8) assert_allclose(result1.bse, result2.bse, rtol=1e-8)
def test_formulas(self): np.random.seed(2410) exog = np.random.normal(size=(300,4)) exog_re = np.random.normal(size=300) groups = np.kron(np.arange(100), [1,1,1]) g_errors = exog_re * np.kron(np.random.normal(size=100), [1,1,1]) endog = exog.sum(1) + g_errors + np.random.normal(size=300) mod1 = MixedLM(endog, exog, groups, exog_re) rslt1 = mod1.fit() # Fit with a formula, passing groups as the actual values. df = pd.DataFrame({"endog": endog}) for k in range(exog.shape[1]): df["exog%d" % k] = exog[:,k] df["exog_re"] = exog_re fml = "endog ~ 0 + exog0 + exog1 + exog2 + exog3" re_fml = "0 + exog_re" mod2 = MixedLM.from_formula(fml, df, re_formula=re_fml, groups=groups) rslt2 = mod2.fit() assert_almost_equal(rslt1.params, rslt2.params) # Fit with a formula, passing groups as the variable name. df["groups"] = groups mod3 = MixedLM.from_formula(fml, df, re_formula=re_fml, groups="groups") rslt3 = mod3.fit(start_params=rslt2.params) assert_allclose(rslt1.params, rslt3.params, rtol=1e-4) # Check default variance structure with non-formula model # creation. exog_re = np.ones(len(endog), dtype=np.float64) mod4 = MixedLM(endog, exog, groups, exog_re) rslt4 = mod4.fit(start_params=rslt2.params) from statsmodels.formula.api import mixedlm mod5 = mixedlm(fml, df, groups="groups") rslt5 = mod5.fit(start_params=rslt2.params) assert_almost_equal(rslt4.params, rslt5.params)
def test_formulas(self): np.random.seed(2410) exog = np.random.normal(size=(300, 4)) exog_re = np.random.normal(size=300) groups = np.kron(np.arange(100), [1, 1, 1]) g_errors = exog_re * np.kron(np.random.normal(size=100), [1, 1, 1]) endog = exog.sum(1) + g_errors + np.random.normal(size=300) mod1 = MixedLM(endog, exog, groups, exog_re) rslt1 = mod1.fit() # Fit with a formula, passing groups as the actual values. df = pd.DataFrame({"endog": endog}) for k in range(exog.shape[1]): df["exog%d" % k] = exog[:, k] df["exog_re"] = exog_re fml = "endog ~ 0 + exog0 + exog1 + exog2 + exog3" re_fml = "0 + exog_re" mod2 = MixedLM.from_formula(fml, df, re_formula=re_fml, groups=groups) rslt2 = mod2.fit() assert_almost_equal(rslt1.params, rslt2.params) # Fit with a formula, passing groups as the variable name. df["groups"] = groups mod3 = MixedLM.from_formula(fml, df, re_formula=re_fml, groups="groups") rslt3 = mod3.fit(start_params=rslt2.params) assert_almost_equal(rslt1.params, rslt3.params, decimal=5) # Check default variance structure with formula.api exog_re = np.ones(len(endog), dtype=np.float64) mod4 = MixedLM(endog, exog, groups, exog_re) rslt4 = mod4.fit(start_params=rslt2.params) from statsmodels.formula.api import mixedlm mod5 = mixedlm(fml, df, groups="groups") rslt5 = mod5.fit(start_params=rslt2.params) assert_almost_equal(rslt4.params, rslt5.params)
def test_mixed_lm_wrapper(): # a bit more complicated model to test np.random.seed(2410) exog = np.random.normal(size=(300, 4)) exog_re = np.random.normal(size=300) groups = np.kron(np.arange(100), [1, 1, 1]) g_errors = exog_re * np.kron(np.random.normal(size=100), [1, 1, 1]) endog = exog.sum(1) + g_errors + np.random.normal(size=300) # Fit with a formula, passing groups as the actual values. df = pd.DataFrame({"endog": endog}) for k in range(exog.shape[1]): df["exog%d" % k] = exog[:, k] df["exog_re"] = exog_re fml = "endog ~ 0 + exog0 + exog1 + exog2 + exog3" re_fml = "~ exog_re" mod2 = MixedLM.from_formula(fml, df, re_formula=re_fml, groups=groups) result = mod2.fit() result.summary() xnames = ["exog0", "exog1", "exog2", "exog3"] re_names = ["Intercept", "exog_re"] re_names_full = ["Intercept RE", "Intercept RE x exog_re RE", "exog_re RE"] assert_(mod2.data.xnames == xnames) assert_(mod2.data.exog_re_names == re_names) assert_(mod2.data.exog_re_names_full == re_names_full) params = result.params assert_(params.index.tolist() == xnames + re_names_full) bse = result.bse assert_(bse.index.tolist() == xnames + re_names_full) tvalues = result.tvalues assert_(tvalues.index.tolist() == xnames + re_names_full) cov_params = result.cov_params() assert_(cov_params.index.tolist() == xnames + re_names_full) assert_(cov_params.columns.tolist() == xnames + re_names_full) fe = result.fe_params assert_(fe.index.tolist() == xnames) bse_fe = result.bse_fe assert_(bse_fe.index.tolist() == xnames) cov_re = result.cov_re assert_(cov_re.index.tolist() == re_names) assert_(cov_re.columns.tolist() == re_names) cov_re_u = result.cov_re_unscaled assert_(cov_re_u.index.tolist() == re_names) assert_(cov_re_u.columns.tolist() == re_names) bse_re = result.bse_re assert_(bse_re.index.tolist() == re_names_full)
def test_vcomp_formula(self): np.random.seed(6241) n = 800 exog = np.random.normal(size=(n, 2)) exog[:, 0] = 1 ex_vc = [] groups = np.kron(np.arange(n / 4), np.ones(4)) errors = 0 exog_re = np.random.normal(size=(n, 2)) slopes = np.random.normal(size=(n // 4, 2)) slopes = np.kron(slopes, np.ones((4, 1))) * exog_re errors += slopes.sum(1) ex_vc = np.random.normal(size=(n, 4)) slopes = np.random.normal(size=(n // 4, 4)) slopes[:, 2:] *= 2 slopes = np.kron(slopes, np.ones((4, 1))) * ex_vc errors += slopes.sum(1) errors += np.random.normal(size=n) endog = exog.sum(1) + errors exog_vc = {"a": {}, "b": {}} for k, group in enumerate(range(int(n / 4))): ix = np.flatnonzero(groups == group) exog_vc["a"][group] = ex_vc[ix, 0:2] exog_vc["b"][group] = ex_vc[ix, 2:] model1 = MixedLM(endog, exog, groups, exog_re=exog_re, exog_vc=exog_vc) result1 = model1.fit() df = pd.DataFrame(exog[:, 1:], columns=["x1"]) df["y"] = endog df["re1"] = exog_re[:, 0] df["re2"] = exog_re[:, 1] df["vc1"] = ex_vc[:, 0] df["vc2"] = ex_vc[:, 1] df["vc3"] = ex_vc[:, 2] df["vc4"] = ex_vc[:, 3] vc_formula = {"a": "0 + vc1 + vc2", "b": "0 + vc3 + vc4"} model2 = MixedLM.from_formula( "y ~ x1", groups=groups, re_formula="0 + re1 + re2", vc_formula=vc_formula, data=df) result2 = model2.fit() assert_allclose(result1.fe_params, result2.fe_params, rtol=1e-8) assert_allclose(result1.cov_re, result2.cov_re, rtol=1e-8) assert_allclose(result1.vcomp, result2.vcomp, rtol=1e-8) assert_allclose(result1.params, result2.params, rtol=1e-8) assert_allclose(result1.bse, result2.bse, rtol=1e-8)
def mass_uv_mixedlmm(formula, data, uv_data, group_id, re_formula=None):
    mods = []
    for d_idx in range(uv_data.shape[1]):
        print("{} of {}".format(d_idx, uv_data.shape[1]), end="\r")
        data_temp = data.copy()
        data_temp["Brain"] = uv_data[:, d_idx]
        model = MixedLM.from_formula(formula, data_temp, groups=group_id)
        try:
            mod_fit = model.fit()
        except Exception:
            # keep a placeholder so the output lines up with the input columns
            mods.append(None)
            continue
        mods.append(mod_fit)
    return mods
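# A hypothetical usage sketch for the helper above; the column name "Cond",
# the simulated uv_data array and the 20-subject structure are invented for
# illustration. Each column of uv_data is fit as the "Brain" outcome.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
n_obs, n_channels = 100, 3
data = pd.DataFrame({"Cond": rng.binomial(1, 0.5, size=n_obs)})
uv_data = rng.normal(size=(n_obs, n_channels))
group_id = np.repeat(np.arange(20), n_obs // 20)   # 20 subjects, 5 trials each

fits = mass_uv_mixedlmm("Brain ~ Cond", data, uv_data, group_id)
print([None if m is None else round(m.params["Cond"], 3) for m in fits])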
def mass_uv_mixedlmm(formula, data, uv_data, group_id, re_formula=None): mods = [[] for source_idx in range(uv_data.shape[1])] for source_idx in range(uv_data.shape[1]): for dest_idx in range(uv_data.shape[2]): if all(uv_data[:, source_idx, dest_idx] == 0): mods[source_idx].append(None) continue #print("Source {}, Destination {}".format(source_idx, dest_idx), end="\r") print("Source {}, Destination {}".format(source_idx, dest_idx)) data_temp = data.copy() data_temp["Brain"] = uv_data[:, source_idx, dest_idx] model = MixedLM.from_formula(formula, data_temp, groups=group_id) mod_fit = model.fit() mods[source_idx].append(mod_fit) return mods
def mass_uv_mixedlmm(formula, data, uv_data, group_id, re_formula=None, exclude=[]): tvals = [] coeffs = [] for d_idx in range(uv_data.shape[1]): if d_idx in exclude: tvals.append(0) coeffs.append(0) continue data_temp = data.copy() data_temp["Brain"] = uv_data[:,d_idx] model = MixedLM.from_formula(formula, data_temp, groups=group_id) mod_fit = model.fit() tvals.append(mod_fit.tvalues.get(indep_var)) coeffs.append(mod_fit.params.get(indep_var)) tvals, coeffs = np.array(tvals), np.array(coeffs) return tvals, coeffs
def test_singular(): # Issue #7051 np.random.seed(3423) n = 100 data = np.random.randn(n, 2) df = pd.DataFrame(data, columns=['Y', 'X']) df['class'] = pd.Series([i % 3 for i in df.index], index=df.index) with pytest.warns(Warning) as wrn: md = MixedLM.from_formula("Y ~ X", df, groups=df['class']) mdf = md.fit() mdf.summary() if not wrn: pytest.fail("warning expected")
def fit_func(rdf): md = MixedLM.from_formula("supply_hours ~ 1 + delta_weeks", groups='block_dow', re_formula='1 + delta_weeks', data=rdf.fillna({'supply_hours': 0.})) mdf = md.fit() index = mdf.random_effects.keys() data = { 'supply_hours': (mdf.params['Intercept'] + [mdf.random_effects[i]['Intercept'] for i in index]), 'block_dow': index } result = pd.DataFrame(data).set_index('block_dow') return result
def lmemodel(data, metadata, fixedEffects=['Tissue of Origin'],
             randomEffects=['High Confidence Donor ID (HCDID)']):
    """Performs a mixed effects linear model for each row (gene) of `data`."""
    df = metadata[fixedEffects].copy()
    df = pd.concat([df, metadata[randomEffects]], axis=1)
    # Rename the variables so they are compatible with patsy formulas
    # (spaces and parentheses are replaced by underscores).
    fixedEffects = [
        c.translate(str.maketrans(' ()', '___')) for c in fixedEffects
    ]
    randomEffects = [
        c.translate(str.maketrans(' ()', '___')) for c in randomEffects
    ]
    df.columns = [
        c.translate(str.maketrans(' ()', '___')) for c in df.columns
    ]
    model_string = 'gene ~ ' + ' + '.join(fixedEffects)
    results = []
    for i in range(data.shape[0]):
        # Add the dependent variable (the i-th gene) to the dataframe
        df['gene'] = data.iloc[i]
        df = df.dropna()
        # Compute a new model for this gene; the first random-effects column
        # is used as the grouping variable.
        mod = MixedLM.from_formula(model_string, df,
                                   groups=df[randomEffects[0]])
        results.append(mod.fit())
        # df.boxplot(by=fixedEffects)
    return results
""" try: from statsmodels.regression.mixed_linear_model import MixedLM raw_df = se_df.copy() 9b50c5aedf52 · D1691991 raw_df['delta_weeks'] = (pd.to_datetime(raw_df['week_of']) - pd.to_datetime(recommendation_week)).dt.days / 7 17a83f0a52b1 · D1438409 def fit_func(group): ea0c134be68e · D1540393 try: md = MixedLM.from_formula("{} ~ 1 + delta_weeks".format(metric), groups='block_dow', re_formula='1 + delta_weeks', data=group.fillna({metric: 0.}) ) mdf = md.fit() index = mdf.random_effects.keys() data = { metric: (mdf.params['Intercept'] + [mdf.random_effects[i]['Intercept'] for i in index]), 'block_dow': index, } return pd.DataFrame(data).set_index('block_dow') except np.linalg.linalg.LinAlgError as err: logging.warning(err) dca42ed79c75 · D2335295
cnx_col_inds = list(np.where(cnx_masks[ROI_idx,])[0]) for col_idx in cnx_col_inds: this_point = this_epo[ROI_idx,col_idx].copy() outname = label_names[col_idx] outhemi = "lh" if "lh" in outname else "rh" data_dict["Brain"].append(this_point) data_dict["Subj"].append(sub) data_dict["Block"].append(cond) data_dict["ROI"].append(ROI) data_dict["OutRegion"].append(outname) data_dict["Hemi"].append(outhemi) data_dict["RT"].append(epo.metadata["RT"].iloc[epo_idx]) group_id.append(sub_idx) dm = pd.DataFrame.from_dict(data_dict) group_id = np.array(group_id) formula = "RT ~ Brain*Block + Brain*Block*C(ROI, Treatment('L3969-lh'))" formula = "Brain ~ RT*Block" mfs = [] for ROI in ROIs: this_dm = dm.copy() this_dm = this_dm[this_dm["ROI"]==ROI] this_group_id = group_id[(dm["ROI"]==ROI)] mod = MixedLM.from_formula(formula, this_dm, groups=this_group_id) mfs.append(mod.fit(reml=False)) formula = "RT ~ Block" mod_rt = MixedLM.from_formula(formula, dm, groups=group_id) mf_rt = mod_rt.fit()
def test_get_distribution(): np.random.seed(234) n = 100 n_groups = 10 fe_params = np.r_[1, -2] cov_re = np.asarray([[1, 0.5], [0.5, 2]]) vcomp = np.r_[0.5**2, 1.5**2] scale = 1.5 exog_fe = np.random.normal(size=(n, 2)) exog_re = np.random.normal(size=(n, 2)) exog_vca = np.random.normal(size=(n, 2)) exog_vcb = np.random.normal(size=(n, 2)) groups = np.repeat(np.arange(n_groups, dtype=np.int), n / n_groups) ey = np.dot(exog_fe, fe_params) u = np.random.normal(size=(n_groups, 2)) u = np.dot(u, np.linalg.cholesky(cov_re).T) u1 = np.sqrt(vcomp[0]) * np.random.normal(size=(n_groups, 2)) u2 = np.sqrt(vcomp[1]) * np.random.normal(size=(n_groups, 2)) y = ey + (u[groups, :] * exog_re).sum(1) y += (u1[groups, :] * exog_vca).sum(1) y += (u2[groups, :] * exog_vcb).sum(1) y += np.sqrt(scale) * np.random.normal(size=n) df = pd.DataFrame({ "y": y, "x1": exog_fe[:, 0], "x2": exog_fe[:, 1], "z0": exog_re[:, 0], "z1": exog_re[:, 1], "grp": groups }) df["z2"] = exog_vca[:, 0] df["z3"] = exog_vca[:, 1] df["z4"] = exog_vcb[:, 0] df["z5"] = exog_vcb[:, 1] vcf = {"a": "0 + z2 + z3", "b": "0 + z4 + z5"} m = MixedLM.from_formula("y ~ 0 + x1 + x2", groups="grp", re_formula="0 + z0 + z1", vc_formula=vcf, data=df) # Build a params vector that is comparable to # MixedLMResults.params import statsmodels mp = statsmodels.regression.mixed_linear_model.MixedLMParams po = mp.from_components(fe_params=fe_params, cov_re=cov_re, vcomp=vcomp) pa = po.get_packed(has_fe=True, use_sqrt=False) pa[len(fe_params):] /= scale # Get a realization dist = m.get_distribution(pa, scale, None) yr = dist.rvs(0) # Check the overall variance v = (np.dot(exog_re, cov_re) * exog_re).sum(1).mean() v += vcomp[0] * (exog_vca**2).sum(1).mean() v += vcomp[1] * (exog_vcb**2).sum(1).mean() v += scale assert_allclose(np.var(yr - ey), v, rtol=1e-2, atol=1e-4)
def test_pastes_vcomp(self): # pastes data from lme4 # # Fit in R using: # # r = lmer(strength ~ (1|batch) + (1|batch:cask), data=data) # r = lmer(strength ~ (1|batch) + (1|batch:cask), data=data, # reml=FALSE) cur_dir = os.path.dirname(os.path.abspath(__file__)) rdir = os.path.join(cur_dir, 'results') fname = os.path.join(rdir, 'pastes.csv') data = pd.read_csv(fname) vcf = {"cask": "0 + cask"} # REML model = MixedLM.from_formula( "strength ~ 1", groups="batch", re_formula="1", vc_formula=vcf, data=data) result = model.fit() # fixef(r) assert_allclose(result.fe_params.iloc[0], 60.0533, rtol=1e-3) # sqrt(diag(vcov(r))) assert_allclose(result.bse.iloc[0], 0.6769, rtol=1e-3) # VarCorr(r)$batch[[1]] assert_allclose(result.cov_re.iloc[0, 0], 1.657, rtol=1e-3) # attr(VarCorr(r), "sc")^2 assert_allclose(result.scale, 0.678, rtol=1e-3) # logLik(r) assert_allclose(result.llf, -123.49, rtol=1e-1) # don't provide aic/bic with REML assert_equal(result.aic, np.nan) assert_equal(result.bic, np.nan) # resid(r)[1:5] resid = np.r_[0.17133538, -0.02866462, -1.08662875, 1.11337125, -0.12093607] assert_allclose(result.resid[0:5], resid, rtol=1e-3) # predict(r)[1:5] fit = np.r_[62.62866, 62.62866, 61.18663, 61.18663, 62.82094] assert_allclose(result.fittedvalues[0:5], fit, rtol=1e-4) # ML model = MixedLM.from_formula( "strength ~ 1", groups="batch", re_formula="1", vc_formula=vcf, data=data) result = model.fit(reml=False) # fixef(r) assert_allclose(result.fe_params.iloc[0], 60.0533, rtol=1e-3) # sqrt(diag(vcov(r))) assert_allclose(result.bse.iloc[0], 0.642, rtol=1e-3) # VarCorr(r)$batch[[1]] assert_allclose(result.cov_re.iloc[0, 0], 1.199, rtol=1e-3) # attr(VarCorr(r), "sc")^2 assert_allclose(result.scale, 0.67799, rtol=1e-3) # logLik(r) assert_allclose(result.llf, -123.997, rtol=1e-1) # AIC(r) assert_allclose(result.aic, 255.9944, rtol=1e-3) # BIC(r) assert_allclose(result.bic, 264.3718, rtol=1e-3)
# How to estimate multilevel GLM in statsmodels package # We only show example for gaussian model, because gamma model is not implemented in the package # see: https://www.statsmodels.org/devel/mixed_glm.html from pandas import read_csv from statsmodels.regression.mixed_linear_model import MixedLM if __name__ == '__main__': # Requires to set the working directory to the project directory gaussian_data = read_csv("./data/Gaussian_identity_data.csv") model = MixedLM.from_formula("y ~ x1 + x2 + x3", data=gaussian_data, groups=gaussian_data["group_index"]) model_result = model.fit() # Fixed effects print(model_result.summary()) # Random effects print(model_result.random_effects) # Dispersion parameter print(model_result.scale)
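# A minimal follow-on sketch (an assumption, not part of the original example):
# the same Gaussian data can also be fit with a random slope for x1 by passing
# re_formula; "x1" and "group_index" are the column names used above.
model_slopes = MixedLM.from_formula("y ~ x1 + x2 + x3",
                                    data=gaussian_data,
                                    re_formula="~x1",
                                    groups=gaussian_data["group_index"])
slopes_result = model_slopes.fit()
print(slopes_result.cov_re)   # 2x2 covariance of random intercept and slope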
def test_random_effects_getters(): # Simulation-based test to make sure that the BLUPs and actual # random effects line up. np.random.seed(34234) ng = 500 # number of groups m = 10 # group size y, x, z, v0, v1, g, b, c0, c1 = [], [], [], [], [], [], [], [], [] for i in range(ng): # Fixed effects xx = np.random.normal(size=(m, 2)) yy = xx[:, 0] + 0.5 * np.random.normal(size=m) # Random effects (re_formula) zz = np.random.normal(size=(m, 2)) bb = np.random.normal(size=2) bb[0] *= 3 bb[1] *= 1 yy += np.dot(zz, bb).flat b.append(bb) # First variance component vv0 = np.kron(np.r_[0, 1], np.ones(m // 2)).astype(np.int) cc0 = np.random.normal(size=2) yy += cc0[vv0] v0.append(vv0) c0.append(cc0) # Second variance component vv1 = np.kron(np.ones(m // 2), np.r_[0, 1]).astype(np.int) cc1 = np.random.normal(size=2) yy += cc1[vv1] v1.append(vv1) c1.append(cc1) y.append(yy) x.append(xx) z.append(zz) g.append(["g%d" % i] * m) y = np.concatenate(y) x = np.concatenate(x) z = np.concatenate(z) v0 = np.concatenate(v0) v1 = np.concatenate(v1) g = np.concatenate(g) df = pd.DataFrame({ "y": y, "x0": x[:, 0], "x1": x[:, 1], "z0": z[:, 0], "z1": z[:, 1], "v0": v0, "v1": v1, "g": g }) b = np.asarray(b) c0 = np.asarray(c0) c1 = np.asarray(c1) cc = np.concatenate((c0, c1), axis=1) model = MixedLM.from_formula("y ~ x0 + x1", re_formula="~0 + z0 + z1", vc_formula={ "v0": "~0+C(v0)", "v1": "0+C(v1)" }, groups="g", data=df) result = model.fit() ref = result.random_effects b0 = [ref["g%d" % k][0:2] for k in range(ng)] b0 = np.asarray(b0) assert (np.corrcoef(b0[:, 0], b[:, 0])[0, 1] > 0.8) assert (np.corrcoef(b0[:, 1], b[:, 1])[0, 1] > 0.8) cf0 = [ref["g%d" % k][2:6] for k in range(ng)] cf0 = np.asarray(cf0) for k in range(4): assert (np.corrcoef(cf0[:, k], cc[:, k])[0, 1] > 0.8) # Smoke test for predictive covariances refc = result.random_effects_cov for g in refc.keys(): p = ref[g].size assert (refc[g].shape == (p, p))
def big_ass_matrix(df, y, x, group=None, short=True):
    independent = combinatorial(x, short)
    models = {}
    p = {}
    aic = {}
    r2 = {}
    best = {}
    dfs = {}
    bestdf = {}
    for dependent in y:
        print("Regressing for %s" % dependent)
        for covariate in independent:
            if group is None:
                subset = delayer([covariate, dependent])
                df2 = df[subset].dropna()
                df2["Intercept"] = np.ones(len(df2))
                dfs.setdefault(dependent, []).append(df2)
                ols = sm.GLS(endog=df2[dependent],
                             exog=df2[delayer([covariate, "Intercept"])]).fit()
                models.setdefault(dependent, []).append(ols)
                p.setdefault(dependent, []).append(ols.pvalues[:-1].values)
                aic.setdefault(dependent, []).append(ols.aic)
                r2.setdefault(dependent, []).append(ols.rsquared)
            else:
                subset = delayer([covariate, dependent, group])
                df2 = df[subset].dropna()
                dfs.setdefault(dependent, []).append(df2)
                ols = MixedLM.from_formula(rstr(y=dependent, x=covariate),
                                           data=df2, groups=df2[group]).fit()
                models.setdefault(dependent, []).append(ols)
                aic.setdefault(dependent, []).append(2 * (ols.k_fe + 1) - 2 * ols.llf)
                p.setdefault(dependent, []).append(ols.pvalues[1:-1].values)
                r2.setdefault(dependent, []).append(mmR2(df2, ols))
        bestAIC = np.min(aic[dependent])
        for i, val in enumerate(models[dependent]):
            if aic[dependent][i] < 2 + bestAIC:
                if np.sum(p[dependent][i] > 0.05) == 0:
                    if group is None:
                        best.setdefault(dependent, []).append(val)
                        bestdf.setdefault(dependent, []).append(dfs[dependent][i])
                    else:
                        if val.random_effects.abs().mean()[0] > 0.01:
                            best.setdefault(dependent, []).append(val)
                            bestdf.setdefault(dependent, []).append(dfs[dependent][i])
        if dependent in best:
            for i, model in enumerate(best[dependent]):
                if not os.path.exists("regressions/%s" % dependent):
                    os.mkdir("regressions/%s" % dependent)
                if not os.path.exists("../talk/figures/regressions/%s" % dependent):
                    os.mkdir("../talk/figures/regressions/%s" % dependent)
                if group is None:
                    dfx = bestdf[dependent][i]
                    plt.scatter(model.fittedvalues.values,
                                dfx[model.model.endog_names].values,
                                c=seaborn.color_palette("deep", 8)[0])
                    plt.plot(dfx[model.model.endog_names].values,
                             dfx[model.model.endog_names].values,
                             c=seaborn.color_palette("deep", 8)[2])
                    plt.ylabel(model.model.endog_names)
                    yl = model.model.exog_names[:]
                    yl.remove("Intercept")
                    plt.xlabel("Estimate using " + ", ".join(yl))
                    plt.title(rstr(dependent, model.model.exog_names).replace(" + Intercept", ""))
                    # plt.title(r"$R^2$ = %.02f" % model.rsquared)
                    st = ("$R^2$ = %.03f\n\n" % model.rsquared)
                    for coefnum, coef in enumerate(yl):
                        st += ("%s" % coef)
                        st += (" : %.03f\n" % model.params[coef])
                        st += ("$p$ = %.01e\n\n" % model.pvalues[coefnum])
                    # plt.suptitle(st)
                    plt.text(0.01, .99, st, va="top", ha="left")
                    plt.xlim([-0.05, 1.05])
                    plt.ylim([-0.05, 1.05])
                    plt.savefig("regressions/%s/lm-%d.pdf" % (dependent, i))
                    plt.savefig("../talk/figures/regressions/%s/lm-%d.png" % (dependent, i),
                                dpi=300, jpeg_quality=90)
                    plt.close()
                else:
                    dfx = bestdf[dependent][i]
                    y, yhat = mmPredict(model.model.data.frame, model)
                    plt.scatter(yhat, y, c=seaborn.color_palette("deep", 8)[0])
                    plt.plot(y, y, c=seaborn.color_palette("deep", 8)[2])
                    plt.ylabel(model.model.endog_names)
                    yl = model.model.exog_names[:]
                    yl.remove("Intercept")
                    plt.xlabel("Estimate using " + ", ".join(yl))
                    plt.title(rstr(dependent, model.model.exog_names).replace("Intercept + ", ""))
                    # plt.title(r"$R^2$ = %.02f" % mmR2(dfx, model))
                    st = ("$R^2$ = %.03f\n\n" % mmR2(dfx, model))
                    for coefnum, coef in enumerate(yl):
                        st += coef
                        st += " : %.03f\n" % model.fe_params[1 + coefnum]
                        st += "$p$ = %.01e\n\n" % model.pvalues[coef]
                    st += ("Avg. abs. RE coef. : %.03f" % model.random_effects.abs().mean())
                    plt.text(0.01, .99, st, va="top", ha="left")
                    plt.xlim([-0.05, 1.05])
                    plt.ylim([-0.05, 1.05])
                    plt.savefig("regressions/%s/mm_%d.pdf" % (dependent, i))
                    plt.savefig("../talk/figures/regressions/%s/mm_%d.png" % (dependent, i),
                                dpi=300, jpeg_quality=90)
                    plt.close()
    return best, (models, p, r2, aic)
continue subj, block, trial = match.group(1), match.group(2), match.group(3) print(filename) stc = mne.read_source_estimate("{}/stcs/{}".format(proc_dir,filename)) stc = morphs["ATT_"+subj].apply(stc) ev_str = "Subj=='ATT_{}' and Block=='{}' and TrialIdx=={}".format(subj, block, int(trial)) row_idx = np.where(np.array(df.eval(ev_str)))[0] temp_data = mne.extract_label_time_course(stc,labels,fs_src,mode="mean") temp_data = temp_data.mean(axis=1) for lab_idx, ln in enumerate(stats_label_names): df.at[row_idx,ln] = temp_data[lab_idx] df = df.astype({ln:np.float64 for ln in stats_label_names}) for ln_idx, ln in enumerate(stats_label_names): formula = "{} ~ 1".format(ln) model = MixedLM.from_formula(formula, df, groups=df["Subj"]) mod_fit = model.fit(reml=False) mod_fit.save("{}{}/null_reg70_lmm_byresp_{}.pickle".format(lmm_dir,band,ln_idx)) formula = "{} ~ RT + Block".format(ln) re_formula = "1 + RT" model = MixedLM.from_formula(formula, df, groups=df["Subj"], re_formula=re_formula) mod_fit = model.fit(reml=False) mod_fit.save("{}{}/simple_reg70_lmm_byresp_{}.pickle".format(lmm_dir,band,ln_idx)) formula = "{} ~ RT*C(Block, Treatment('audio'))".format(ln) re_formula = "1 + RT" model = MixedLM.from_formula(formula, df, groups=df["Subj"], re_formula=re_formula) mod_fit = model.fit(reml=False)
#sel_inds = (df["Block"]==cond) & (df["Subj"]==sub) dPTE_slices = {} for k, v in pairs_info.items(): temp_inds = list(zip(*v["inds"])) dPTE_slices[k] = dPTE[:, temp_inds[0], temp_inds[1]].mean(axis=1) if avg_trials: dm = dm.append(df[sel_inds]) #dm = dm.append({"Subj":sub,"Block":cond},ignore_index=True) for k, v in dPTE_slices.items(): data[k].append(v.mean()) group_id.append(sub_idx) else: for epo_idx in range(len(dPTE)): dm = dm.append(df[sel_inds]) #dm = dm.append({"Subj":sub,"Block":cond},ignore_index=True) for k, v in dPTE_slices.items(): data[k].append(v[epo_idx, ]) group_id.append(sub_idx) formula = "Brain ~ Laut + Angenehm + C(Block, Treatment('audio')) + Wav" #formula = "Brain ~ C(Block, Treatment('rest'))" mod_fits = {} for k, v in data.items(): dm_temp = dm.copy() dm_temp["Brain"] = v model = MixedLM.from_formula(formula, dm_temp, groups=group_id) mod_fits[k] = model.fit()
axes = [ax for sublist in axes for ax in sublist] for block_idx,block in enumerate(blocks): angs = [] for wav in wavs: angs.append(df_ang.loc[df_laut["Wav"]==wav]["Angenehm"][df_laut["Block"]==block].values) angs = np.array(angs) angs_mean = np.mean(angs,axis=1) print(angs_mean) sem = stats.sem(angs,axis=1) plt.sca(axes[block_idx]) plt.bar(np.arange(len(wavs)),angs_mean,yerr=sem,tick_label=wavs) plt.title(block) groups = df_laut["Subj"] formula = "Laut ~ Block*Wav" laut_model = MixedLM.from_formula(formula, df_laut, groups=groups) laut_mf = laut_model.fit() print(laut_mf.summary()) groups = df_ang["Subj"] formula = "Angenehm ~ Block*Wav" ang_model = MixedLM.from_formula(formula, df_ang, groups=groups) ang_mf = ang_model.fit() print(ang_mf.summary()) font = {'weight' : 'bold', 'size' : 38} matplotlib.rc('font', **font) fig, axes = plt.subplots(1, 2, figsize=(38.4, 21.6)) angs_block = []
if v["from"][0] == "all": from_inds = np.arange(mat_n) else: from_inds = np.array([label_names.index(x) for x in v["from"]]) from_mat = these_data[:, from_inds, ] from_mat = np.nanmean(from_mat, axis=1) if v["to"][0] == "all": to_inds = np.arange(mat_n) else: to_inds = np.array([label_names.index(x) for x in v["to"]]) to_mat = from_mat[:, to_inds] quant = np.nanmean(to_mat, axis=1) df["Brain"] = quant model = MixedLM.from_formula(formula, df, groups=group_id) mod_fit = model.fit(reml=False) print(mod_fit.summary()) stat_cond = "C(Block, Treatment('rest'))[T.task]" CIs = mod_fit.conf_int() mod_ests[k] = { "Rest": mod_fit.params["Intercept"], "Task": mod_fit.params[stat_cond], "Rest_CIs": np.array([CIs[0]["Intercept"], CIs[1]["Intercept"]]), "Task_CIs": np.array([CIs[0][stat_cond], CIs[1][stat_cond]]) } fig, ax = dpte_bar(mod_ests) else: these_data = data.copy() triu_inds, tril_inds = np.triu_indices(mat_n, k=1), np.tril_indices(mat_n, k=-1)
print " -------- 2" best_lm_hist, stuff_lm_hist = big_ass_matrix(df=sheep, y=histcols, x=imagecols, group=None, short=5) print " -------- 3" best_mm_phys, stuff_mm_phys = big_ass_matrix(df=sheep, y=pcols, x=imagecols, group="AgeAtDeath", short=5) print " -------- 4" best_mm_hist, stuff_mm_hist = big_ass_matrix(df=sheep, y=histcols, x=imagecols, group="AgeAtDeath", short=5) # <codecell> y = "BDHyperplasia" x = ["Inflammation", "Scale", "Directionality"] dfx = sheep[delayer([x, y, "AgeAtDeath"])].dropna() model = MixedLM.from_formula(rstr(y, x), data=dfx, groups="AgeAtDeath").fit() #model = sm.GLS(endog=dfx.Portal_inflammation, exog=dfx[["FociSize", "AgeAtDeath"]]).fit() dfx = sheep[["BDHyperplasia", "Inflammation", "AgeAtDeath"]].dropna() model2 = MixedLM.from_formula(rstr(y, ["Inflammation"]), data=dfx, groups="AgeAtDeath").fit() dfx = sheep[["BDHyperplasia", "FociSize", "AgeAtDeath"]].dropna() model3 = MixedLM.from_formula(rstr(y, ["FociSize"]), data=dfx, groups="AgeAtDeath").fit() # <codecell> ss = "E" s = np.array([sheep[sheep.AgeAtDeath == model.random_effects.index.values[i]][ss].iloc[0] for i in range(len(model.random_effects.index.values))]) s -= s.min() s /= s.max()