def test_formulas(self):
    """Formula and array interfaces should give identical fits."""
    np.random.seed(2410)
    exog = np.random.normal(size=(300, 4))
    exog_re = np.random.normal(size=300)
    groups = np.kron(np.arange(100), [1, 1, 1])
    g_errors = exog_re * np.kron(np.random.normal(size=100), [1, 1, 1])
    endog = exog.sum(1) + g_errors + np.random.normal(size=300)

    # Fit directly from arrays.
    model_arr = MixedLM(endog, exog, groups, exog_re)
    fit_arr = model_arr.fit()

    # Same data as a DataFrame for the formula interface.
    df = pd.DataFrame({"endog": endog})
    for j in range(exog.shape[1]):
        df["exog%d" % j] = exog[:, j]
    df["exog_re"] = exog_re

    fml = "endog ~ 0 + exog0 + exog1 + exog2 + exog3"
    re_fml = "0 + exog_re"
    model_fml = MixedLM.from_formula(fml, df, re_formula=re_fml,
                                     groups=groups)
    fit_fml = model_fml.fit()
    assert_almost_equal(fit_arr.params, fit_fml.params)

    # Check default variance structure, with formula.api
    exog_re = np.ones(len(endog), dtype=np.float64)
    model_int = MixedLM(endog, exog, groups, exog_re)
    fit_int = model_int.fit()

    from statsmodels.formula.api import mixedlm
    model_api = mixedlm(fml, df, groups=groups)
    fit_api = model_api.fit()
    assert_almost_equal(fit_int.params, fit_api.params)
def txest_vcomp_1(self):
    """
    Fit the same model using constrained random effects and
    variance components.

    NOTE(review): the ``txest_`` prefix keeps pytest from collecting
    this test; presumably it was disabled deliberately -- confirm
    before renaming it back to ``test_vcomp_1``.
    """
    np.random.seed(4279)
    exog = np.random.normal(size=(400, 1))
    exog_re = np.random.normal(size=(400, 2))
    groups = np.kron(np.arange(100), np.ones(4))
    slopes = np.random.normal(size=(100, 2))
    slopes[:, 1] *= 2
    slopes = np.kron(slopes, np.ones((4, 1))) * exog_re
    errors = slopes.sum(1) + np.random.normal(size=400)
    endog = exog.sum(1) + errors

    # Constrain the random effects covariance to be diagonal.
    free = MixedLMParams(1, 2, 0)
    free.fe_params = np.ones(1)
    free.cov_re = np.eye(2)
    free.vcomp = np.zeros(0)
    model1 = MixedLM(endog, exog, groups, exog_re=exog_re)
    result1 = model1.fit(free=free)

    # Express the same structure using two variance components, one
    # per random-effects column.  (The enumerate index was unused.)
    exog_vc = {"a": {}, "b": {}}
    for group in model1.group_labels:
        ix = model1.row_indices[group]
        exog_vc["a"][group] = exog_re[ix, 0:1]
        exog_vc["b"][group] = exog_re[ix, 1:2]
    model2 = MixedLM(endog, exog, groups, exog_vc=exog_vc)
    result2 = model2.fit()
    result2.summary()

    assert_allclose(result1.fe_params, result2.fe_params, atol=1e-4)
    assert_allclose(np.diag(result1.cov_re), result2.vcomp, atol=1e-2,
                    rtol=1e-4)
    assert_allclose(result1.bse[[0, 1, 3]], result2.bse, atol=1e-2,
                    rtol=1e-2)
def test_formulas(self):
    """
    Check that the formula interface (with groups passed either as
    values or as a column name) reproduces the array-interface fit,
    and that variable names are assigned correctly in each case.
    """
    np.random.seed(2410)
    exog = np.random.normal(size=(300, 4))
    exog_re = np.random.normal(size=300)
    groups = np.kron(np.arange(100), [1, 1, 1])
    g_errors = exog_re * np.kron(np.random.normal(size=100), [1, 1, 1])
    endog = exog.sum(1) + g_errors + np.random.normal(size=300)

    mod1 = MixedLM(endog, exog, groups, exog_re)

    # test the names
    assert_(mod1.data.xnames == ["x1", "x2", "x3", "x4"])
    assert_(mod1.data.exog_re_names == ["x_re1"])
    assert_(mod1.data.exog_re_names_full == ["x_re1 RE"])

    rslt1 = mod1.fit()

    # Fit with a formula, passing groups as the actual values.
    df = pd.DataFrame({"endog": endog})
    for k in range(exog.shape[1]):
        df["exog%d" % k] = exog[:, k]
    df["exog_re"] = exog_re
    fml = "endog ~ 0 + exog0 + exog1 + exog2 + exog3"
    re_fml = "0 + exog_re"
    mod2 = MixedLM.from_formula(fml, df, re_formula=re_fml,
                                groups=groups)
    assert_(mod2.data.xnames == ["exog0", "exog1", "exog2", "exog3"])
    assert_(mod2.data.exog_re_names == ["exog_re"])
    assert_(mod2.data.exog_re_names_full == ["exog_re RE"])
    rslt2 = mod2.fit()
    assert_almost_equal(rslt1.params, rslt2.params)

    # Fit with a formula, passing groups as the variable name.
    df["groups"] = groups
    mod3 = MixedLM.from_formula(fml, df, re_formula=re_fml,
                                groups="groups")
    assert_(mod3.data.xnames == ["exog0", "exog1", "exog2", "exog3"])
    assert_(mod3.data.exog_re_names == ["exog_re"])
    assert_(mod3.data.exog_re_names_full == ["exog_re RE"])
    rslt3 = mod3.fit(start_params=rslt2.params)
    assert_allclose(rslt1.params, rslt3.params, rtol=1e-4)

    # Check default variance structure with non-formula model
    # creation, also use different exog_re that produces a zero
    # estimated variance parameter.
    exog_re = np.ones(len(endog), dtype=np.float64)
    mod4 = MixedLM(endog, exog, groups, exog_re)
    with warnings.catch_warnings():
        # The degenerate variance parameter triggers warnings.
        warnings.simplefilter("ignore")
        rslt4 = mod4.fit()

    from statsmodels.formula.api import mixedlm
    mod5 = mixedlm(fml, df, groups="groups")
    assert_(mod5.data.exog_re_names == ["groups"])
    assert_(mod5.data.exog_re_names_full == ["groups RE"])
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        rslt5 = mod5.fit()
    assert_almost_equal(rslt4.params, rslt5.params)
def do1(self, reml, irf, ds_ix):
    """
    Fit reference data set ``ds_ix`` and compare against stored R
    results.

    Parameters
    ----------
    reml : bool
        Use REML estimation when True, otherwise ML.
    irf : bool
        If True, constrain the random effects to be independent.
    ds_ix : int
        Index of the R reference data set.
    """
    # No need to check independent random effects when there is
    # only one of them.
    if irf and ds_ix < 6:
        return

    irfs = "irf" if irf else "drf"
    meth = "reml" if reml else "ml"
    rslt = R_Results(meth, irfs, ds_ix)

    # Fit the model
    md = MixedLM(rslt.endog, rslt.exog_fe, rslt.groups, rslt.exog_re)
    if not irf:  # Free random effects covariance
        mdf = md.fit(gtol=1e-7, reml=reml)
    else:  # Independent random effects
        k_fe = rslt.exog_fe.shape[1]
        k_re = rslt.exog_re.shape[1]
        mdf = md.fit(reml=reml, gtol=1e-7,
                     free=(np.ones(k_fe), np.eye(k_re)))

    assert_almost_equal(mdf.fe_params, rslt.coef, decimal=4)
    assert_almost_equal(mdf.cov_re, rslt.cov_re_r, decimal=4)
    assert_almost_equal(mdf.scale, rslt.scale_r, decimal=4)

    # Only the fixed-effects block of the covariance matrix is
    # available from R.
    pf = rslt.exog_fe.shape[1]
    assert_almost_equal(rslt.vcov_r, mdf.cov_params()[0:pf, 0:pf],
                        decimal=3)
    assert_almost_equal(mdf.likeval, rslt.loglike[0], decimal=2)

    # Not supported in R
    if not irf:
        assert_almost_equal(mdf.ranef()[0], rslt.ranef_postmean,
                            decimal=3)
        assert_almost_equal(mdf.ranef_cov()[0], rslt.ranef_condvar,
                            decimal=3)
def do1(reml, irf, ds_ix):
    """
    Fit reference data set ``ds_ix`` and compare against stored R
    results.

    Parameters
    ----------
    reml : bool
        Use REML estimation when True, otherwise ML.
    irf : bool
        If True, constrain the random effects to be independent.
    ds_ix : int
        Index of the R reference data set.
    """
    # No need to check independent random effects when there is
    # only one of them.
    if irf and ds_ix < 6:
        return

    irfs = "irf" if irf else "drf"
    meth = "reml" if reml else "ml"
    rslt = R_Results(meth, irfs, ds_ix)

    # Fit the model
    md = MixedLM(rslt.endog, rslt.exog_fe, rslt.groups, rslt.exog_re)
    if not irf:
        # Free random effects covariance (free=None leaves the fit
        # unconstrained).
        free = None
    else:
        # Independent random effects
        k_fe = rslt.exog_fe.shape[1]
        k_re = rslt.exog_re.shape[1]
        free = MixedLMParams(k_fe, k_re, 0)
        free.fe_params = np.ones(k_fe)
        free.cov_re = np.eye(k_re)
        free.vcomp = np.array([])

    # A nearly-singular random effects covariance produces expected
    # convergence warnings; suppress them only in that case.  This
    # replaces two duplicated fit calls per branch.
    with warnings.catch_warnings():
        if np.any(np.diag(rslt.cov_re_r) < 1e-5):
            warnings.simplefilter("ignore")
        mdf = md.fit(gtol=1e-7, reml=reml, free=free)

    assert_almost_equal(mdf.fe_params, rslt.coef, decimal=4)
    assert_almost_equal(mdf.cov_re, rslt.cov_re_r, decimal=4)
    assert_almost_equal(mdf.scale, rslt.scale_r, decimal=4)

    # Only the fixed-effects block of the covariance matrix is
    # available from R.
    k_fe = md.k_fe
    assert_almost_equal(rslt.vcov_r, mdf.cov_params()[0:k_fe, 0:k_fe],
                        decimal=3)
    assert_almost_equal(mdf.llf, rslt.loglike[0], decimal=2)

    # Not supported in R except for independent random effects
    if not irf:
        assert_almost_equal(mdf.random_effects[0], rslt.ranef_postmean,
                            decimal=3)
        assert_almost_equal(mdf.random_effects_cov[0],
                            rslt.ranef_condvar, decimal=3)
def test_vcomp_1(self):
    """
    Fit the same model using constrained random effects and
    variance components.
    """
    import scipy

    # This test needs scipy >= 0.16.  Compare the (major, minor)
    # pair: the old check looked only at the minor component, so on
    # scipy >= 1.0 (where minor resets, e.g. "1.2.0" -> 2 < 16) the
    # test was silently skipped forever.
    ver = tuple(int(x) for x in scipy.__version__.split(".")[:2])
    if ver < (0, 16):
        return

    np.random.seed(4279)
    exog = np.random.normal(size=(400, 1))
    exog_re = np.random.normal(size=(400, 2))
    groups = np.kron(np.arange(100), np.ones(4))
    slopes = np.random.normal(size=(100, 2))
    slopes[:, 1] *= 2
    slopes = np.kron(slopes, np.ones((4, 1))) * exog_re
    errors = slopes.sum(1) + np.random.normal(size=400)
    endog = exog.sum(1) + errors

    # Constrain the random effects covariance to be diagonal.
    free = MixedLMParams(1, 2, 0)
    free.fe_params = np.ones(1)
    free.cov_re = np.eye(2)
    free.vcomp = np.zeros(0)
    model1 = MixedLM(endog, exog, groups, exog_re=exog_re)
    result1 = model1.fit(free=free)

    # Express the same structure using two variance components.
    # (The enumerate index was unused.)
    exog_vc = {"a": {}, "b": {}}
    for group in model1.group_labels:
        ix = model1.row_indices[group]
        exog_vc["a"][group] = exog_re[ix, 0:1]
        exog_vc["b"][group] = exog_re[ix, 1:2]
    model2 = MixedLM(endog, exog, groups, exog_vc=exog_vc)
    result2 = model2.fit()
    result2.summary()

    assert_allclose(result1.fe_params, result2.fe_params, atol=1e-4)
    assert_allclose(np.diag(result1.cov_re), result2.vcomp, atol=1e-2,
                    rtol=1e-4)
    assert_allclose(result1.bse[[0, 1, 3]], result2.bse, atol=1e-2,
                    rtol=1e-2)
def __fit__(correctors, correctors_re, groups, predictors, observations,
            sample_weight=None, n_jobs=-1, *args, **kwargs):
    """
    Fit a mixed linear model on the horizontally stacked
    correctors/predictors design matrix, one fit per observation
    column.

    Parameters
    ----------
    correctors : 2d array; left block of the fixed-effects design.
    correctors_re : 2d array of random-effects covariates.
    groups : group labels, one per observation row.
    predictors : 2d array; right block of the fixed-effects design.
    observations : 2d array (n_obs, M) of response variables.
    sample_weight, n_jobs, *args, **kwargs : accepted but unused here.

    Returns
    -------
    tuple of (corrector parameters, predictor parameters), split by
    row index at ``ncols``.
    """
    ncols = correctors.shape[1]
    dims = (correctors.shape[0], ncols + predictors.shape[1])
    xdata = np.zeros(dims)
    xdata[:, :ncols] = correctors.view()
    xdata[:, ncols:] = predictors.view()

    M = observations.shape[1]
    K = correctors.shape[1]
    params = np.empty((K, M), dtype=object)
    for it_m in range(M):
        # Constrain all fixed effects and a full identity random
        # effects covariance to be estimated.
        free = MixedLMParams.from_components(
            fe_params=np.ones(xdata.shape[1]),
            cov_re=np.eye(correctors_re.shape[1]))
        # NOTE(review): the loop body never uses ``it_m`` -- every
        # iteration fits the identical model on the full
        # ``observations`` matrix; presumably
        # ``endog=observations[:, it_m]`` was intended.  Confirm
        # against callers before changing.
        model = MixedLM(endog=observations, exog=xdata, groups=groups,
                        exog_re=correctors_re)
        results = model.fit(free=free)
        # NOTE(review): this stores the ``free`` constraint object
        # rather than the fitted parameters (``results``); looks like
        # a bug -- verify the expected contents of ``params``.
        params[..., it_m] = free

    return (params[:ncols], params[ncols:])
def test_history(self):
    """fit(full_output=True) should attach an optimization history."""
    np.random.seed(3235)
    exog = np.random.normal(size=(300, 4))
    groups = np.kron(np.arange(100), [1, 1, 1])
    g_errors = np.kron(np.random.normal(size=100), [1, 1, 1])
    endog = exog.sum(1) + g_errors + np.random.normal(size=300)

    model = MixedLM(endog, exog, groups)
    result = model.fit(full_output=True)
    assert_equal(hasattr(result, "hist"), True)
def test_random_effects():
    """random_effects should map each group label to a Series of the
    estimated effects, with one entry per random-effects column."""
    np.random.seed(23429)

    # Default model (random effects only)
    ngrp = 100
    gsize = 10
    rsd = 2
    gsd = 3
    mn = gsd * np.random.normal(size=ngrp)
    gmn = np.kron(mn, np.ones(gsize))
    y = gmn + rsd * np.random.normal(size=ngrp * gsize)
    gr = np.kron(np.arange(ngrp), np.ones(gsize))
    x = np.ones(ngrp * gsize)

    def check_re(result, k_re):
        # Shared structural checks on the random_effects dict.
        re = result.random_effects
        assert_(isinstance(re, dict))
        assert_(len(re) == ngrp)
        assert_(isinstance(re[0], pd.Series))
        assert_(len(re[0]) == k_re)

    model = MixedLM(y, x, groups=gr)
    check_re(model.fit(), 1)

    # Random intercept only, set explicitly
    model = MixedLM(y, x, exog_re=x, groups=gr)
    check_re(model.fit(), 1)

    # Random intercept and slope
    xr = np.random.normal(size=(ngrp * gsize, 2))
    xr[:, 0] = 1
    qp = np.linspace(-1, 1, gsize)
    xr[:, 1] = np.kron(np.ones(ngrp), qp)
    model = MixedLM(y, x, exog_re=xr, groups=gr)
    check_re(model.fit(), 2)
def do1(self, reml, irf, ds_ix):
    """
    Fit reference data set ``ds_ix`` and compare against stored R
    results.

    Parameters
    ----------
    reml : bool
        Use REML estimation when True, otherwise ML.
    irf : bool
        If True, constrain the random effects to be independent.
    ds_ix : int
        Index of the R reference data set.
    """
    # No need to check independent random effects when there is
    # only one of them.
    if irf and ds_ix < 6:
        return

    irfs = "irf" if irf else "drf"
    meth = "reml" if reml else "ml"
    rslt = R_Results(meth, irfs, ds_ix)

    # Fit the model
    md = MixedLM(rslt.endog, rslt.exog_fe, rslt.groups, rslt.exog_re)
    if not irf:  # Free random effects covariance
        mdf = md.fit(gtol=1e-7, reml=reml)
    else:  # Independent random effects
        k_fe = rslt.exog_fe.shape[1]
        k_re = rslt.exog_re.shape[1]
        free = MixedLMParams(k_fe, k_re)
        free.set_fe_params(np.ones(k_fe))
        free.set_cov_re(np.eye(k_re))
        mdf = md.fit(reml=reml, gtol=1e-7, free=free)

    assert_almost_equal(mdf.fe_params, rslt.coef, decimal=4)
    assert_almost_equal(mdf.cov_re, rslt.cov_re_r, decimal=4)
    assert_almost_equal(mdf.scale, rslt.scale_r, decimal=4)

    # Only the fixed-effects block of the covariance matrix is
    # available from R.
    pf = rslt.exog_fe.shape[1]
    assert_almost_equal(rslt.vcov_r, mdf.cov_params()[0:pf, 0:pf],
                        decimal=3)
    assert_almost_equal(mdf.llf, rslt.loglike[0], decimal=2)

    # Not supported in R
    if not irf:
        # NOTE(review): ``.ix`` was removed in pandas 1.0; on modern
        # pandas this needs ``.loc``/``.iloc`` -- confirm whether
        # label or positional indexing was intended before changing.
        assert_almost_equal(mdf.random_effects.ix[0],
                            rslt.ranef_postmean, decimal=3)
        assert_almost_equal(mdf.random_effects_cov[0],
                            rslt.ranef_condvar, decimal=3)
def test_vcomp_formula(self):
    """
    Variance components specified via ``vc_formula`` should match the
    same model specified directly through ``exog_vc``.
    """
    np.random.seed(6241)
    n = 800
    exog = np.random.normal(size=(n, 2))
    exog[:, 0] = 1
    ex_vc = []
    groups = np.kron(np.arange(n / 4), np.ones(4))

    # Build the error term from random slopes plus two variance
    # components, each spanning two columns of ex_vc.
    errors = 0
    exog_re = np.random.normal(size=(n, 2))
    slopes = np.random.normal(size=(n // 4, 2))
    slopes = np.kron(slopes, np.ones((4, 1))) * exog_re
    errors += slopes.sum(1)
    ex_vc = np.random.normal(size=(n, 4))
    slopes = np.random.normal(size=(n // 4, 4))
    slopes[:, 2:] *= 2
    slopes = np.kron(slopes, np.ones((4, 1))) * ex_vc
    errors += slopes.sum(1)
    errors += np.random.normal(size=n)
    endog = exog.sum(1) + errors

    # Directly-specified variance components (deprecated dict form).
    exog_vc = {"a": {}, "b": {}}
    for k, group in enumerate(range(int(n / 4))):
        ix = np.flatnonzero(groups == group)
        exog_vc["a"][group] = ex_vc[ix, 0:2]
        exog_vc["b"][group] = ex_vc[ix, 2:]
    with pytest.warns(UserWarning, match="Using deprecated variance"):
        model1 = MixedLM(endog, exog, groups, exog_re=exog_re,
                         exog_vc=exog_vc)
    result1 = model1.fit()

    # The same model via formulas.
    df = pd.DataFrame(exog[:, 1:], columns=["x1"])
    df["y"] = endog
    df["re1"] = exog_re[:, 0]
    df["re2"] = exog_re[:, 1]
    df["vc1"] = ex_vc[:, 0]
    df["vc2"] = ex_vc[:, 1]
    df["vc3"] = ex_vc[:, 2]
    df["vc4"] = ex_vc[:, 3]
    vc_formula = {"a": "0 + vc1 + vc2", "b": "0 + vc3 + vc4"}
    model2 = MixedLM.from_formula("y ~ x1", groups=groups,
                                  re_formula="0 + re1 + re2",
                                  vc_formula=vc_formula, data=df)
    result2 = model2.fit()

    assert_allclose(result1.fe_params, result2.fe_params, rtol=1e-8)
    assert_allclose(result1.cov_re, result2.cov_re, rtol=1e-8)
    assert_allclose(result1.vcomp, result2.vcomp, rtol=1e-8)
    assert_allclose(result1.params, result2.params, rtol=1e-8)
    assert_allclose(result1.bse, result2.bse, rtol=1e-8)
def test_vcomp_1(self):
    # Fit the same model twice -- once with a diagonal-constrained
    # random effects covariance, once via variance components -- and
    # check that the two parameterizations agree.
    np.random.seed(4279)
    exog = np.random.normal(size=(400, 1))
    exog_re = np.random.normal(size=(400, 2))
    groups = np.kron(np.arange(100), np.ones(4))
    slopes = np.random.normal(size=(100, 2))
    slopes[:, 1] *= 2
    slopes = np.kron(slopes, np.ones((4, 1))) * exog_re
    errors = slopes.sum(1) + np.random.normal(size=400)
    endog = exog.sum(1) + errors

    free = MixedLMParams(1, 2, 0)
    free.fe_params = np.ones(1)
    free.cov_re = np.eye(2)
    free.vcomp = np.zeros(0)

    model_re = MixedLM(endog, exog, groups, exog_re=exog_re)
    result_re = model_re.fit(free=free)

    exog_vc = {"a": {}, "b": {}}
    for group in model_re.group_labels:
        rows = model_re.row_indices[group]
        exog_vc["a"][group] = exog_re[rows, 0:1]
        exog_vc["b"][group] = exog_re[rows, 1:2]

    with pytest.warns(UserWarning, match="Using deprecated variance"):
        model_vc = MixedLM(endog, exog, groups, exog_vc=exog_vc)
    result_vc = model_vc.fit()
    result_vc.summary()

    assert_allclose(result_re.fe_params, result_vc.fe_params,
                    atol=1e-4)
    assert_allclose(np.diag(result_re.cov_re), result_vc.vcomp,
                    atol=1e-2, rtol=1e-4)
    assert_allclose(result_re.bse[[0, 1, 3]], result_vc.bse,
                    atol=1e-2, rtol=1e-2)
def test_formulas(self):
    """Array and formula interfaces must agree on the fitted params."""
    np.random.seed(2410)
    exog = np.random.normal(size=(300, 4))
    exog_re = np.random.normal(size=300)
    groups = np.kron(np.arange(100), [1, 1, 1])
    g_errors = exog_re * np.kron(np.random.normal(size=100), [1, 1, 1])
    endog = exog.sum(1) + g_errors + np.random.normal(size=300)

    model_arrays = MixedLM(endog, exog, groups, exog_re)
    fit_arrays = model_arrays.fit()

    # Fit with a formula, passing groups as the actual values.
    df = pd.DataFrame({"endog": endog})
    for j in range(exog.shape[1]):
        df["exog%d" % j] = exog[:, j]
    df["exog_re"] = exog_re
    fml = "endog ~ 0 + exog0 + exog1 + exog2 + exog3"
    re_fml = "0 + exog_re"
    model_vals = MixedLM.from_formula(fml, df, re_formula=re_fml,
                                      groups=groups)
    fit_vals = model_vals.fit()
    assert_almost_equal(fit_arrays.params, fit_vals.params)

    # Fit with a formula, passing groups as the variable name.
    df["groups"] = groups
    model_name = MixedLM.from_formula(fml, df, re_formula=re_fml,
                                      groups="groups")
    fit_name = model_name.fit(start_params=fit_vals.params)
    assert_allclose(fit_arrays.params, fit_name.params, rtol=1e-4)

    # Check default variance structure with non-formula model
    # creation.
    exog_re = np.ones(len(endog), dtype=np.float64)
    model_default = MixedLM(endog, exog, groups, exog_re)
    fit_default = model_default.fit(start_params=fit_vals.params)

    from statsmodels.formula.api import mixedlm
    model_api = mixedlm(fml, df, groups="groups")
    fit_api = model_api.fit(start_params=fit_vals.params)
    assert_almost_equal(fit_default.params, fit_api.params)
def test_formulas(self):
    """
    The formula interface (groups passed as values or as a column
    name) should reproduce the array-interface fit.
    """
    np.random.seed(2410)
    exog = np.random.normal(size=(300, 4))
    exog_re = np.random.normal(size=300)
    groups = np.kron(np.arange(100), [1, 1, 1])
    g_errors = exog_re * np.kron(np.random.normal(size=100), [1, 1, 1])
    endog = exog.sum(1) + g_errors + np.random.normal(size=300)

    mod1 = MixedLM(endog, exog, groups, exog_re)
    rslt1 = mod1.fit()

    # Fit with a formula, passing groups as the actual values.
    df = pd.DataFrame({"endog": endog})
    for k in range(exog.shape[1]):
        df["exog%d" % k] = exog[:, k]
    df["exog_re"] = exog_re
    fml = "endog ~ 0 + exog0 + exog1 + exog2 + exog3"
    re_fml = "0 + exog_re"
    mod2 = MixedLM.from_formula(fml, df, re_formula=re_fml,
                                groups=groups)
    rslt2 = mod2.fit()
    assert_almost_equal(rslt1.params, rslt2.params)

    # Fit with a formula, passing groups as the variable name.
    df["groups"] = groups
    mod3 = MixedLM.from_formula(fml, df, re_formula=re_fml,
                                groups="groups")
    rslt3 = mod3.fit(start_params=rslt2.params)
    assert_almost_equal(rslt1.params, rslt3.params, decimal=5)

    # Check default variance structure with formula.api
    exog_re = np.ones(len(endog), dtype=np.float64)
    mod4 = MixedLM(endog, exog, groups, exog_re)
    rslt4 = mod4.fit(start_params=rslt2.params)
    from statsmodels.formula.api import mixedlm
    mod5 = mixedlm(fml, df, groups="groups")
    rslt5 = mod5.fit(start_params=rslt2.params)
    assert_almost_equal(rslt4.params, rslt5.params)
def test_vcomp_formula(self):
    """
    Variance components specified via ``vc_formula`` should match the
    same model specified directly through ``exog_vc``.
    """
    np.random.seed(6241)
    n = 800
    exog = np.random.normal(size=(n, 2))
    exog[:, 0] = 1
    ex_vc = []
    groups = np.kron(np.arange(n / 4), np.ones(4))

    # Build the error term from random slopes plus two variance
    # components, each spanning two columns of ex_vc.
    errors = 0
    exog_re = np.random.normal(size=(n, 2))
    slopes = np.random.normal(size=(n // 4, 2))
    slopes = np.kron(slopes, np.ones((4, 1))) * exog_re
    errors += slopes.sum(1)
    ex_vc = np.random.normal(size=(n, 4))
    slopes = np.random.normal(size=(n // 4, 4))
    slopes[:, 2:] *= 2
    slopes = np.kron(slopes, np.ones((4, 1))) * ex_vc
    errors += slopes.sum(1)
    errors += np.random.normal(size=n)
    endog = exog.sum(1) + errors

    # Directly-specified variance components.
    exog_vc = {"a": {}, "b": {}}
    for k, group in enumerate(range(int(n / 4))):
        ix = np.flatnonzero(groups == group)
        exog_vc["a"][group] = ex_vc[ix, 0:2]
        exog_vc["b"][group] = ex_vc[ix, 2:]
    model1 = MixedLM(endog, exog, groups, exog_re=exog_re,
                     exog_vc=exog_vc)
    result1 = model1.fit()

    # The same model via formulas.
    df = pd.DataFrame(exog[:, 1:], columns=["x1"])
    df["y"] = endog
    df["re1"] = exog_re[:, 0]
    df["re2"] = exog_re[:, 1]
    df["vc1"] = ex_vc[:, 0]
    df["vc2"] = ex_vc[:, 1]
    df["vc3"] = ex_vc[:, 2]
    df["vc4"] = ex_vc[:, 3]
    vc_formula = {"a": "0 + vc1 + vc2", "b": "0 + vc3 + vc4"}
    model2 = MixedLM.from_formula(
        "y ~ x1", groups=groups, re_formula="0 + re1 + re2",
        vc_formula=vc_formula, data=df)
    result2 = model2.fit()

    assert_allclose(result1.fe_params, result2.fe_params, rtol=1e-8)
    assert_allclose(result1.cov_re, result2.cov_re, rtol=1e-8)
    assert_allclose(result1.vcomp, result2.vcomp, rtol=1e-8)
    assert_allclose(result1.params, result2.params, rtol=1e-8)
    assert_allclose(result1.bse, result2.bse, rtol=1e-8)
def train(self):
    """
    Fit the ventricle and ADAS mixed linear models on the training
    data.

    Returns
    -------
    (results_vent, results_adas) : fitted MixedLM results for the
        ventricle and ADAS targets respectively.
    """
    vent_train_data, adas_train_data = self.get_dataset()

    def fit_lme(train_data):
        # Build the design matrix -- explicit intercept plus a squared
        # years-from-baseline term -- and fit the mixed model.  This
        # helper replaces two verbatim-duplicated code blocks.
        rid, X, y = train_data
        intercepts = np.ones((X.shape[0], 1))
        intercepts_df = pd.DataFrame({'INT': intercepts.reshape(-1)})
        squared_term = pd.DataFrame(
            {'YBL_SQ':
             X['YEARS_FROM_BL'].apply(np.square).values.reshape(-1)})
        X_int = (X.reset_index(drop=True)
                 .join(intercepts_df).join(squared_term))
        # X_int[self.fe_features + self.re_features] = \
        #     self.scaler.fit_transform(
        #         X_int[self.fe_features + self.re_features])
        model = MixedLM(
            endog=y.values,
            exog=X_int[['INT'] + self.fe_features + ['YBL_SQ']],
            groups=rid,
            exog_re=X_int[['INT'] + self.re_features])
        return model.fit()

    # ---- Ventricle model
    results_vent = fit_lme(vent_train_data)
    # ---- ADAS model
    results_adas = fit_lme(adas_train_data)
    return results_vent, results_adas
def predict_ventricles(data_forecast, most_recent_data, feature_list):
    """
    Forecast Ventricles_ICV for every subject in ``data_forecast``.

    A mixed LM is fit on all subjects' past measurements; each
    subject's future time points are then predicted, clipped to a
    plausible range, and written into ``data_forecast`` together with
    a 50% confidence interval.

    Returns
    -------
    data_forecast : the input frame with the ventricle columns filled.
    """
    # * Ventricles volume forecast: = most recent measurement,
    #   default confidence interval
    most_recent_Ventricles_ICV = most_recent_data[
        'Ventricles_ICV'].dropna().tail(1).iloc[0]
    # not missing: Ventricles and ICV
    vent_mask = (most_recent_data['Ventricles_ICV'].dropna() > 0) & (
        most_recent_data['AGE_AT_EXAM'].dropna() > 0)
    test_subject = most_recent_data['AGE_AT_EXAM'].dropna()[vent_mask]
    y = most_recent_data['Ventricles_ICV'].dropna()[vent_mask]

    # Regress the individual subjects
    # Normalize the targets: how much does it deviate from the mean at
    # this age?
    data_grouped = most_recent_data.dropna()[vent_mask].groupby("RID")

    # Go through all the subjects and build up huge feature matrix
    fixed_effects = list()
    random_effects = list()
    groups = list()
    ys = list()
    for ctr, (rid, subject) in enumerate(data_grouped):
        num_measurements = len(subject)
        fixed_effect, random_effect, y = get_mixed_effects(
            subject, feature_list)
        fixed_effects.append(fixed_effect)
        random_effects.append(random_effect)
        groups.extend(num_measurements * [rid])
        ys.append(y)
    fixed_effects = pd.concat(fixed_effects, axis=0)
    random_effects = pd.concat(random_effects, axis=0)
    ys = pd.concat(ys, axis=0)

    model = MixedLM(ys, fixed_effects, groups, exog_re=random_effects)
    result = model.fit()
    print(result.summary())

    # Plausible forecast range; loop-invariant, so computed once.
    min_v = 0.9 * most_recent_data["Ventricles_ICV"].min()
    max_v = 2 * most_recent_data["Ventricles_ICV"].max()

    for rid, test_subject in tqdm.tqdm(data_grouped):
        t_last = (test_subject["AGE_AT_EXAM"].max()
                  - test_subject["AGE_AT_EXAM"].min())
        fixed_effect, _, _ = get_mixed_effects(test_subject,
                                               feature_list)

        # Get future time points (after last visit)
        dates_forecast = t_last + data_forecast[
            data_forecast["RID"] == rid]["Forecast Month"] / 12

        # TODO reshape should be generic
        std = 0.02
        vent_forecasts = predict_lme(result, rid, dates_forecast,
                                     fixed_effect.iloc[0])
        vent_std = std  # TODO

        # Postprocessing: clip the forecasts to the plausible range.
        vent_forecasts = np.array(vent_forecasts)
        vent_forecasts = np.maximum(vent_forecasts, min_v)
        vent_forecasts = np.minimum(vent_forecasts, max_v)
        data_forecast.loc[data_forecast["RID"] == rid,
                          'Ventricles_ICV'] = vent_forecasts

        # 50% CI. Phi(50%) = 0.75 -> 50% of the data lie within
        # 0.75 * sigma around the mean.
        # BUG FIX: the original used np.max(arr, 0), which is a
        # reduction over axis 0 (a scalar), not the intended
        # elementwise clamp at zero -- np.maximum is elementwise.
        data_forecast.loc[
            data_forecast["RID"] == rid,
            'Ventricles_ICV 50% CI lower'] = np.maximum(
                np.array(vent_forecasts) - 0.75 * std, 0)
        data_forecast.loc[
            data_forecast["RID"] == rid,
            'Ventricles_ICV 50% CI upper'] = np.maximum(
                np.array(vent_forecasts) + 0.75 * std, 0)

    return data_forecast
def test_compare_numdiff(self):
    """
    Check the analytic score and Hessian of MixedLM against numerical
    differentiation, over all combinations of the square-root
    transform, REML/ML, and profiled fixed effects.
    """
    n_grp = 200
    grpsize = 5
    k_fe = 3
    k_re = 2

    for use_sqrt in False, True:
        for reml in False, True:
            for profile_fe in False, True:

                # Simulate a data set with random slopes and two
                # variance components.
                np.random.seed(3558)
                exog_fe = np.random.normal(
                    size=(n_grp * grpsize, k_fe))
                exog_re = np.random.normal(
                    size=(n_grp * grpsize, k_re))
                exog_re[:, 0] = 1
                exog_vc = np.random.normal(size=(n_grp * grpsize, 3))
                slopes = np.random.normal(size=(n_grp, k_re))
                slopes[:, -1] *= 2
                slopes = np.kron(slopes, np.ones((grpsize, 1)))
                slopes_vc = np.random.normal(size=(n_grp, 3))
                slopes_vc = np.kron(slopes_vc, np.ones((grpsize, 1)))
                slopes_vc[:, -1] *= 2
                re_values = (slopes * exog_re).sum(1)
                vc_values = (slopes_vc * exog_vc).sum(1)
                err = np.random.normal(size=n_grp * grpsize)
                endog = exog_fe.sum(1) + re_values + vc_values + err
                groups = np.kron(range(n_grp), np.ones(grpsize))

                vc = {"a": {}, "b": {}}
                for i in range(n_grp):
                    ix = np.flatnonzero(groups == i)
                    vc["a"][i] = exog_vc[ix, 0:2]
                    vc["b"][i] = exog_vc[ix, 2:3]

                model = MixedLM(endog, exog_fe, groups, exog_re,
                                exog_vc=vc, use_sqrt=use_sqrt)
                rslt = model.fit(reml=reml)

                loglike = loglike_function(
                    model, profile_fe=profile_fe,
                    has_fe=not profile_fe)

                # Test the score at several points.
                for kr in range(5):
                    fe_params = np.random.normal(size=k_fe)
                    cov_re = np.random.normal(size=(k_re, k_re))
                    cov_re = np.dot(cov_re.T, cov_re)
                    vcomp = np.random.normal(size=2)**2
                    params = MixedLMParams.from_components(
                        fe_params, cov_re=cov_re, vcomp=vcomp)
                    params_vec = params.get_packed(
                        has_fe=not profile_fe, use_sqrt=use_sqrt)

                    # Check scores
                    gr = -model.score(params, profile_fe=profile_fe)
                    ngr = nd.approx_fprime(params_vec, loglike)
                    assert_allclose(gr, ngr, rtol=1e-3)

                # Check Hessian matrices at the MLE (we don't have
                # the profile Hessian matrix and we don't care
                # about the Hessian for the square root
                # transformed parameter).
                if (profile_fe is False) and (use_sqrt is False):
                    hess = -model.hessian(rslt.params_object)
                    params_vec = rslt.params_object.get_packed(
                        use_sqrt=False, has_fe=True)
                    loglike_h = loglike_function(
                        model, profile_fe=False, has_fe=True)
                    nhess = nd.approx_hess(params_vec, loglike_h)
                    assert_allclose(hess, nhess, rtol=1e-3)
def test_compare_numdiff(self):
    """
    Compare the analytic score and Hessian of MixedLM (in both the
    square-root and direct parameterizations, with and without REML
    and a covariance penalty) to numerical differentiation.
    """
    import statsmodels.tools.numdiff as nd

    n_grp = 200
    grpsize = 5
    k_fe = 3
    k_re = 2

    for jl in 0, 1:
        for reml in False, True:
            for cov_pen_wt in 0, 10:

                cov_pen = penalties.PSD(cov_pen_wt)

                np.random.seed(3558)
                exog_fe = np.random.normal(
                    size=(n_grp * grpsize, k_fe))
                exog_re = np.random.normal(
                    size=(n_grp * grpsize, k_re))
                exog_re[:, 0] = 1
                slopes = np.random.normal(size=(n_grp, k_re))
                slopes = np.kron(slopes, np.ones((grpsize, 1)))
                re_values = (slopes * exog_re).sum(1)
                err = np.random.normal(size=n_grp * grpsize)
                endog = exog_fe.sum(1) + re_values + err
                groups = np.kron(range(n_grp), np.ones(grpsize))

                # Use nested 'def's instead of assigning lambdas to
                # names (PEP 8 / E731).
                if jl == 0:
                    md = MixedLM(endog, exog_fe, groups, exog_re)

                    def score(x):
                        return -md.score_sqrt(x)

                    def hessian(x):
                        return -md.hessian_sqrt(x)
                else:
                    md = MixedLM(endog, exog_fe, groups, exog_re,
                                 use_sqrt=False)

                    def score(x):
                        return -md.score_full(x)

                    def hessian(x):
                        return -md.hessian_full(x)

                md.reml = reml
                md.cov_pen = cov_pen

                def loglike(x):
                    return -md.loglike(x)

                rslt = md.fit()

                # Test the score at several points.
                for kr in range(5):
                    fe_params = np.random.normal(size=k_fe)
                    cov_re = np.random.normal(size=(k_re, k_re))
                    cov_re = np.dot(cov_re.T, cov_re)
                    params = MixedLMParams.from_components(fe_params,
                                                           cov_re)
                    if jl == 0:
                        params_vec = params.get_packed()
                    else:
                        params_vec = params.get_packed(use_sqrt=False)

                    # Check scores
                    gr = score(params)
                    ngr = nd.approx_fprime(params_vec, loglike)
                    assert_allclose(gr, ngr, rtol=1e-2)

                # Hessian matrices don't agree well away from the
                # MLE, so check them only at the MLE, and only when
                # there is no covariance penalty.
                if cov_pen_wt == 0:
                    hess = hessian(rslt.params_object)
                    params_vec = rslt.params_object.get_packed()
                    nhess = nd.approx_hess(params_vec, loglike)
                    assert_allclose(hess, nhess, rtol=1e-2)
def test_compare_numdiff(self, use_sqrt, reml, profile_fe):
    """
    Check the analytic score and Hessian of MixedLM against numerical
    differentiation (parametrized over the square-root transform,
    REML/ML, and profiled fixed effects).
    """
    n_grp = 200
    grpsize = 5
    k_fe = 3
    k_re = 2

    # Simulate a data set with random slopes and two variance
    # components.
    np.random.seed(3558)
    exog_fe = np.random.normal(size=(n_grp * grpsize, k_fe))
    exog_re = np.random.normal(size=(n_grp * grpsize, k_re))
    exog_re[:, 0] = 1
    exog_vc = np.random.normal(size=(n_grp * grpsize, 3))
    slopes = np.random.normal(size=(n_grp, k_re))
    slopes[:, -1] *= 2
    slopes = np.kron(slopes, np.ones((grpsize, 1)))
    slopes_vc = np.random.normal(size=(n_grp, 3))
    slopes_vc = np.kron(slopes_vc, np.ones((grpsize, 1)))
    slopes_vc[:, -1] *= 2
    re_values = (slopes * exog_re).sum(1)
    vc_values = (slopes_vc * exog_vc).sum(1)
    err = np.random.normal(size=n_grp * grpsize)
    endog = exog_fe.sum(1) + re_values + vc_values + err
    groups = np.kron(range(n_grp), np.ones(grpsize))

    vc = {"a": {}, "b": {}}
    for i in range(n_grp):
        ix = np.flatnonzero(groups == i)
        vc["a"][i] = exog_vc[ix, 0:2]
        vc["b"][i] = exog_vc[ix, 2:3]

    model = MixedLM(endog, exog_fe, groups, exog_re, exog_vc=vc,
                    use_sqrt=use_sqrt)
    rslt = model.fit(reml=reml)

    loglike = loglike_function(model, profile_fe=profile_fe,
                               has_fe=not profile_fe)

    try:
        # Test the score at several points.
        for kr in range(5):
            fe_params = np.random.normal(size=k_fe)
            cov_re = np.random.normal(size=(k_re, k_re))
            cov_re = np.dot(cov_re.T, cov_re)
            vcomp = np.random.normal(size=2)**2
            params = MixedLMParams.from_components(
                fe_params, cov_re=cov_re, vcomp=vcomp)
            params_vec = params.get_packed(has_fe=not profile_fe,
                                           use_sqrt=use_sqrt)

            # Check scores
            gr = -model.score(params, profile_fe=profile_fe)
            ngr = nd.approx_fprime(params_vec, loglike)
            assert_allclose(gr, ngr, rtol=1e-3)

        # Check Hessian matrices at the MLE (we do not have
        # the profile Hessian matrix and we do not care
        # about the Hessian for the square root
        # transformed parameter).
        if (profile_fe is False) and (use_sqrt is False):
            hess = -model.hessian(rslt.params_object)
            params_vec = rslt.params_object.get_packed(use_sqrt=False,
                                                       has_fe=True)
            loglike_h = loglike_function(model, profile_fe=False,
                                         has_fe=True)
            nhess = nd.approx_hess(params_vec, loglike_h)
            assert_allclose(hess, nhess, rtol=1e-3)
    except AssertionError:
        # See GH#5628; because this test fails unpredictably but only
        # on OSX, we only xfail it there.
        if PLATFORM_OSX:
            pytest.xfail("fails on OSX due to unresolved "
                         "numerical differences")
        else:
            raise
def test_compare_numdiff(self):
    """
    Check the analytic score and Hessian of MixedLM against numerical
    differentiation, over all combinations of the square-root
    transform, REML/ML, and profiled fixed effects.
    """
    n_grp = 200
    grpsize = 5
    k_fe = 3
    k_re = 2

    for use_sqrt in False, True:
        for reml in False, True:
            for profile_fe in False, True:

                # Simulate a data set with random slopes and two
                # variance components.
                np.random.seed(3558)
                exog_fe = np.random.normal(
                    size=(n_grp * grpsize, k_fe))
                exog_re = np.random.normal(
                    size=(n_grp * grpsize, k_re))
                exog_re[:, 0] = 1
                exog_vc = np.random.normal(size=(n_grp * grpsize, 3))
                slopes = np.random.normal(size=(n_grp, k_re))
                slopes[:, -1] *= 2
                slopes = np.kron(slopes, np.ones((grpsize, 1)))
                slopes_vc = np.random.normal(size=(n_grp, 3))
                slopes_vc = np.kron(slopes_vc, np.ones((grpsize, 1)))
                slopes_vc[:, -1] *= 2
                re_values = (slopes * exog_re).sum(1)
                vc_values = (slopes_vc * exog_vc).sum(1)
                err = np.random.normal(size=n_grp * grpsize)
                endog = exog_fe.sum(1) + re_values + vc_values + err
                groups = np.kron(range(n_grp), np.ones(grpsize))

                vc = {"a": {}, "b": {}}
                for i in range(n_grp):
                    ix = np.flatnonzero(groups == i)
                    vc["a"][i] = exog_vc[ix, 0:2]
                    vc["b"][i] = exog_vc[ix, 2:3]

                model = MixedLM(endog, exog_fe, groups, exog_re,
                                exog_vc=vc, use_sqrt=use_sqrt)
                rslt = model.fit(reml=reml)

                loglike = loglike_function(
                    model, profile_fe=profile_fe,
                    has_fe=not profile_fe)

                # Test the score at several points.
                for kr in range(5):
                    fe_params = np.random.normal(size=k_fe)
                    cov_re = np.random.normal(size=(k_re, k_re))
                    cov_re = np.dot(cov_re.T, cov_re)
                    vcomp = np.random.normal(size=2) ** 2
                    params = MixedLMParams.from_components(
                        fe_params, cov_re=cov_re, vcomp=vcomp)
                    params_vec = params.get_packed(
                        has_fe=not profile_fe, use_sqrt=use_sqrt)

                    # Check scores
                    gr = -model.score(params, profile_fe=profile_fe)
                    ngr = nd.approx_fprime(params_vec, loglike)
                    assert_allclose(gr, ngr, rtol=1e-3)

                # Check Hessian matrices at the MLE (we don't have
                # the profile Hessian matrix and we don't care
                # about the Hessian for the square root
                # transformed parameter).
                if (profile_fe is False) and (use_sqrt is False):
                    hess = -model.hessian(rslt.params_object)
                    params_vec = rslt.params_object.get_packed(
                        use_sqrt=False, has_fe=True)
                    loglike_h = loglike_function(
                        model, profile_fe=False, has_fe=True)
                    nhess = nd.approx_hess(params_vec, loglike_h)
                    assert_allclose(hess, nhess, rtol=1e-3)
def test_compare_numdiff(self, use_sqrt, reml, profile_fe):
    """
    Check the analytic score and Hessian of MixedLM against numerical
    differentiation (parametrized over the square-root transform,
    REML/ML, and profiled fixed effects).
    """
    n_grp = 200
    grpsize = 5
    k_fe = 3
    k_re = 2

    # Simulate a data set with random slopes and two variance
    # components.
    np.random.seed(3558)
    exog_fe = np.random.normal(size=(n_grp * grpsize, k_fe))
    exog_re = np.random.normal(size=(n_grp * grpsize, k_re))
    exog_re[:, 0] = 1
    exog_vc = np.random.normal(size=(n_grp * grpsize, 3))
    slopes = np.random.normal(size=(n_grp, k_re))
    slopes[:, -1] *= 2
    slopes = np.kron(slopes, np.ones((grpsize, 1)))
    slopes_vc = np.random.normal(size=(n_grp, 3))
    slopes_vc = np.kron(slopes_vc, np.ones((grpsize, 1)))
    slopes_vc[:, -1] *= 2
    re_values = (slopes * exog_re).sum(1)
    vc_values = (slopes_vc * exog_vc).sum(1)
    err = np.random.normal(size=n_grp * grpsize)
    endog = exog_fe.sum(1) + re_values + vc_values + err
    groups = np.kron(range(n_grp), np.ones(grpsize))

    vc = {"a": {}, "b": {}}
    for i in range(n_grp):
        ix = np.flatnonzero(groups == i)
        vc["a"][i] = exog_vc[ix, 0:2]
        vc["b"][i] = exog_vc[ix, 2:3]

    model = MixedLM(
        endog, exog_fe, groups, exog_re, exog_vc=vc,
        use_sqrt=use_sqrt)
    rslt = model.fit(reml=reml)

    loglike = loglike_function(
        model, profile_fe=profile_fe, has_fe=not profile_fe)

    try:
        # Test the score at several points.
        for kr in range(5):
            fe_params = np.random.normal(size=k_fe)
            cov_re = np.random.normal(size=(k_re, k_re))
            cov_re = np.dot(cov_re.T, cov_re)
            vcomp = np.random.normal(size=2)**2
            params = MixedLMParams.from_components(
                fe_params, cov_re=cov_re, vcomp=vcomp)
            params_vec = params.get_packed(
                has_fe=not profile_fe, use_sqrt=use_sqrt)

            # Check scores
            gr = -model.score(params, profile_fe=profile_fe)
            ngr = nd.approx_fprime(params_vec, loglike)
            assert_allclose(gr, ngr, rtol=1e-3)

        # Check Hessian matrices at the MLE (we don't have
        # the profile Hessian matrix and we don't care
        # about the Hessian for the square root
        # transformed parameter).
        if (profile_fe is False) and (use_sqrt is False):
            hess = -model.hessian(rslt.params_object)
            params_vec = rslt.params_object.get_packed(
                use_sqrt=False, has_fe=True)
            loglike_h = loglike_function(
                model, profile_fe=False, has_fe=True)
            nhess = nd.approx_hess(params_vec, loglike_h)
            assert_allclose(hess, nhess, rtol=1e-3)
    except AssertionError:
        # See GH#5628; because this test fails unpredictably but only
        # on OSX, we only xfail it there.
        if PLATFORM_OSX:
            pytest.xfail("fails on OSX due to unresolved "
                         "numerical differences")
        else:
            raise
def test_compare_numdiff(self):
    """
    Compare the analytic score and Hessian of MixedLM (in both the
    square-root and direct parameterizations, with and without REML
    and a covariance penalty) to numerical differentiation.
    """
    import statsmodels.tools.numdiff as nd

    n_grp = 200
    grpsize = 5
    k_fe = 3
    k_re = 2

    for jl in 0, 1:
        for reml in False, True:
            for cov_pen_wt in 0, 10:

                cov_pen = penalties.PSD(cov_pen_wt)

                np.random.seed(3558)
                exog_fe = np.random.normal(
                    size=(n_grp * grpsize, k_fe))
                exog_re = np.random.normal(
                    size=(n_grp * grpsize, k_re))
                exog_re[:, 0] = 1
                slopes = np.random.normal(size=(n_grp, k_re))
                slopes = np.kron(slopes, np.ones((grpsize, 1)))
                re_values = (slopes * exog_re).sum(1)
                err = np.random.normal(size=n_grp * grpsize)
                endog = exog_fe.sum(1) + re_values + err
                groups = np.kron(range(n_grp), np.ones(grpsize))

                # Use nested 'def's instead of assigning lambdas to
                # names (PEP 8 / E731).
                if jl == 0:
                    md = MixedLM(endog, exog_fe, groups, exog_re)

                    def score(x):
                        return -md.score_sqrt(x)

                    def hessian(x):
                        return -md.hessian_sqrt(x)
                else:
                    md = MixedLM(endog, exog_fe, groups, exog_re,
                                 use_sqrt=False)

                    def score(x):
                        return -md.score_full(x)

                    def hessian(x):
                        return -md.hessian_full(x)

                md.reml = reml
                md.cov_pen = cov_pen

                def loglike(x):
                    return -md.loglike(x)

                rslt = md.fit()

                # Test the score at several points.
                for kr in range(5):
                    fe_params = np.random.normal(size=k_fe)
                    cov_re = np.random.normal(size=(k_re, k_re))
                    cov_re = np.dot(cov_re.T, cov_re)
                    params = MixedLMParams.from_components(
                        fe_params, cov_re)
                    if jl == 0:
                        params_vec = params.get_packed()
                    else:
                        params_vec = params.get_packed(use_sqrt=False)

                    # Check scores
                    gr = score(params)
                    ngr = nd.approx_fprime(params_vec, loglike)
                    assert_allclose(gr, ngr, rtol=1e-2)

                # Hessian matrices don't agree well away from the
                # MLE, so check them only at the MLE, and only when
                # there is no covariance penalty.
                if cov_pen_wt == 0:
                    hess = hessian(rslt.params_object)
                    params_vec = rslt.params_object.get_packed()
                    nhess = nd.approx_hess(params_vec, loglike)
                    assert_allclose(hess, nhess, rtol=1e-2)