def setup_class(cls):
    np.random.seed(4321)
    n = 20
    p = 5
    exog = np.empty((n, p))
    exog[:, 0] = 1
    exog[:, 1] = np.random.randint(low=-5, high=5, size=n)
    x = np.repeat(np.array([1, 2, 3, 4]), n // 4)
    exog[:, 2:] = get_dummies(x)
    beta = np.array([-1, 0.1, -0.05, .2, 0.35])
    lin_pred = (exog * beta).sum(axis=1)
    family = sm.families.Binomial
    link = sm.families.links.log
    endog = gen_endog(lin_pred, family, link, binom_version=0)
    mod1 = sm.GLM(endog, exog, family=family(link=link()))
    cls.res1 = mod1.fit(rtol=1e-10, atol=0, tol_criterion='params',
                        scaletype='x2')

    agg = pd.DataFrame(exog)
    agg['endog'] = endog
    agg_endog = agg.groupby([0, 1, 2, 3, 4]).sum()[['endog']]
    agg_wt = agg.groupby([0, 1, 2, 3, 4]).count()[['endog']]
    agg_exog = np.array(agg_endog.index.tolist())
    agg_wt = agg_wt['endog']
    avg_endog = agg_endog['endog'] / agg_wt
    mod2 = sm.GLM(avg_endog, agg_exog,
                  family=family(link=link()),
                  var_weights=agg_wt)
    cls.res2 = mod2.fit(rtol=1e-10, atol=0, tol_criterion='params')

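# The setups in this file call two module-level helpers, ``get_dummies`` and
# ``gen_endog``, that are defined elsewhere in the test module.  Below is a
# minimal sketch of plausible implementations, assuming the simplest behavior
# consistent with how they are called here; the names are suffixed ``_sketch``
# because these are illustrative stand-ins, not the real helpers:
def _get_dummies_sketch(x):
    # One indicator column per level of x except the first, so the intercept
    # column in exog absorbs the baseline level (exog[:, 2:] gets p - 2
    # columns for the 4 levels of x).
    levels = np.sort(np.unique(x))
    return (x[:, None] == levels[1:]).astype(float)


def _gen_endog_sketch(lin_pred, family_class, link, binom_version=1):
    # Draw a response with mean link^{-1}(lin_pred) from the given family.
    mu = link().inverse(lin_pred)
    fam = sm.families
    if family_class == fam.Binomial:
        if binom_version == 0:
            # Bernoulli 0/1 outcomes
            return (np.random.uniform(size=len(mu)) < mu).astype(float)
        # (successes, failures) with 10 trials per row
        successes = np.random.binomial(10, mu)
        return np.column_stack([successes, 10 - successes])
    if family_class == fam.Poisson:
        return np.random.poisson(mu)
    if family_class == fam.Gamma:
        return np.random.gamma(2, mu / 2)            # shape 2, mean mu
    if family_class == fam.Gaussian:
        return mu + np.random.normal(size=len(mu))
    if family_class == fam.InverseGaussian:
        return np.random.wald(mu, 2)
    if family_class == fam.NegativeBinomial:
        return np.random.negative_binomial(1, 1 / (mu + 1))  # mean mu
    if family_class == fam.Tweedie:
        return np.random.poisson(mu)  # crude stand-in for compound Poisson
    raise ValueError("unsupported family in sketch")
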
def setup_class(cls):
    np.random.seed(4321)
    n = 10000
    p = 5
    exog = np.empty((n, p))
    exog[:, 0] = 1
    exog[:, 1] = np.random.randint(low=-5, high=5, size=n)
    x = np.repeat(np.array([1, 2, 3, 4]), n // 4)
    exog[:, 2:] = get_dummies(x)
    beta = np.array([7, 0.1, -0.05, .2, 0.35])
    lin_pred = (exog * beta).sum(axis=1)
    family = sm.families.Tweedie
    link = sm.families.links.log
    endog = gen_endog(lin_pred, family, link)
    mod1 = sm.GLM(endog, exog, family=family(link=link(), var_power=1.5))
    cls.res1 = mod1.fit(rtol=1e-20, atol=0, tol_criterion='params')

    agg = pd.DataFrame(exog)
    agg['endog'] = endog
    agg_endog = agg.groupby([0, 1, 2, 3, 4]).sum()[['endog']]
    agg_wt = agg.groupby([0, 1, 2, 3, 4]).count()[['endog']]
    agg_exog = np.array(agg_endog.index.tolist())
    agg_wt = agg_wt['endog']
    agg_endog = agg_endog['endog']
    mod2 = sm.GLM(agg_endog, agg_exog,
                  family=family(link=link(), var_power=1.5),
                  exposure=agg_wt, var_weights=agg_wt ** 0.5)
    cls.res2 = mod2.fit(rtol=1e-20, atol=0, tol_criterion='params')

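# Why ``var_weights=agg_wt ** 0.5`` in the aggregated Tweedie fit above?  A
# sketch of the variance bookkeeping, assuming the Tweedie variance function
# V(mu) = mu ** var_power with var_power = 1.5:
#
#   * summing the n_i identically distributed cells of a covariate class
#     gives mean n_i * mu_i, which ``exposure=agg_wt`` absorbs on the log
#     link;
#   * the variance of that sum is n_i * phi * mu_i ** 1.5, while the
#     weighted model assigns (phi / w_i) * (n_i * mu_i) ** 1.5;
#   * equating the two gives w_i = n_i ** (var_power - 1) = n_i ** 0.5,
#     i.e. ``agg_wt ** 0.5``.
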
def setup_class(cls):
    np.random.seed(4321)
    n = 10000
    p = 5
    exog = np.empty((n, p))
    exog[:, 0] = 1
    exog[:, 1] = np.random.randint(low=-5, high=5, size=n)
    x = np.repeat(np.array([1, 2, 3, 4]), n // 4)
    exog[:, 2:] = get_dummies(x)
    beta = np.array([-1, 0.1, -0.05, .2, 0.35])
    lin_pred = (exog * beta).sum(axis=1)
    family = sm.families.Poisson
    link = sm.families.links.log
    endog = gen_endog(lin_pred, family, link)
    mod1 = sm.GLM(endog, exog, family=family(link=link()))
    cls.res1 = mod1.fit()

    agg = pd.DataFrame(exog)
    agg['endog'] = endog
    agg_endog = agg.groupby([0, 1, 2, 3, 4]).sum()[['endog']]
    agg_wt = agg.groupby([0, 1, 2, 3, 4]).count()[['endog']]
    agg_exog = np.array(agg_endog.index.tolist())
    agg_wt = agg_wt['endog']
    avg_endog = agg_endog['endog'] / agg_wt
    mod2 = sm.GLM(avg_endog, agg_exog,
                  family=family(link=link()),
                  var_weights=agg_wt)
    cls.res2 = mod2.fit()

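# The same average-plus-``var_weights`` identity, in a self-contained form:
# averaging the counts within each distinct covariate row and passing the
# cell counts as ``var_weights`` reproduces the disaggregated Poisson fit.
# This is an illustrative sketch (the data and names here are made up, not
# part of the test suite), runnable on its own:
def _poisson_average_identity_sketch():
    rng = np.random.default_rng(1234)
    g = rng.integers(0, 3, size=300)
    exog_full = np.column_stack([np.ones(300), g == 1, g == 2]).astype(float)
    y = rng.poisson(np.exp(exog_full @ np.array([0.2, 0.5, -0.3])))
    res_full = sm.GLM(y, exog_full, family=sm.families.Poisson()).fit()

    # one row per group: average response, with the cell count as var_weight
    counts = np.bincount(g)
    avg = np.bincount(g, weights=y) / counts
    exog_agg = np.column_stack([np.ones(3), np.eye(3)[:, 1:]])
    res_agg = sm.GLM(avg, exog_agg, family=sm.families.Poisson(),
                     var_weights=counts).fit()
    assert_allclose(res_full.params, res_agg.params, rtol=1e-6)
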
def setup_class(cls):
    np.random.seed(4321)
    n = 10000
    p = 5
    exog = np.empty((n, p))
    exog[:, 0] = 1
    exog[:, 1] = np.random.randint(low=-5, high=5, size=n)
    x = np.repeat(np.array([1, 2, 3, 4]), n // 4)
    exog[:, 2:] = get_dummies(x)
    beta = np.array([-1, 0.1, -0.05, .2, 0.35])
    lin_pred = (exog * beta).sum(axis=1)
    family = sm.families.Binomial
    link = sm.families.links.log
    endog = gen_endog(lin_pred, family, link, binom_version=0)
    wt = np.random.randint(1, 5, n)
    mod1 = sm.GLM(endog, exog, family=family(link=link()), freq_weights=wt)
    cls.res1 = mod1.fit()

    exog_dup = np.repeat(exog, wt, axis=0)
    endog_dup = np.repeat(endog, wt)
    mod2 = sm.GLM(endog_dup, exog_dup, family=family(link=link()))
    cls.res2 = mod2.fit()

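# Why the duplicated fit above should match the freq-weighted one:
# ``freq_weights`` enter the log-likelihood as case multiplicities,
#
#     llf(beta) = sum_i w_i * loglike(y_i | x_i, beta),
#
# which is exactly the log-likelihood of the row-expanded data, so params,
# bse, and llf all agree.  (``var_weights``, by contrast, rescale the
# variance function and in general change the log-likelihood.)
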
def setup_class(cls):
    from .results.results_glm import CancerLog
    res2 = CancerLog
    endog = res2.endog
    exog = res2.exog[:, :-1]
    exog = sm.add_constant(exog, prepend=True)

    aweights = np.repeat(1, len(endog))
    aweights[::5] = 5
    aweights[::13] = 3
    model = sm.GLM(endog, exog,
                   family=sm.families.Gamma(link=sm.families.links.log()),
                   var_weights=aweights)
    cls.res1 = model.fit(rtol=1e-25, atol=0)

def setup(self):
    # Refit for each test, because the results object is mutated by some
    # of the tests.
    x = self.exog
    np.random.seed(987689)
    y = x.sum(1) + np.random.randn(x.shape[0])
    self.results = sm.GLM(y, self.exog).fit()

def test_gradient_irls():
    # Compare the results when using gradient optimization and IRLS.
    # TODO: Find working examples for inverse_squared link
    np.random.seed(87342)

    fams = [(sm.families.Binomial, [links.logit, links.probit, links.cloglog,
                                    links.log, links.cauchy]),
            (sm.families.Poisson, [links.log, links.identity, links.sqrt]),
            (sm.families.Gamma, [links.log, links.identity,
                                 links.inverse_power]),
            (sm.families.Gaussian, [links.identity, links.log,
                                    links.inverse_power]),
            (sm.families.InverseGaussian, [links.log, links.identity,
                                           links.inverse_power,
                                           links.inverse_squared]),
            (sm.families.NegativeBinomial, [links.log, links.inverse_power,
                                            links.inverse_squared,
                                            links.identity])]

    n = 100
    p = 3
    exog = np.random.normal(size=(n, p))
    exog[:, 0] = 1

    skip_one = False
    for family_class, family_links in fams:
        for link in family_links:
            for binom_version in [0, 1]:
                if (family_class != sm.families.Binomial
                        and binom_version == 1):
                    continue

                if (family_class, link) == (sm.families.Poisson,
                                            links.identity):
                    lin_pred = 20 + exog.sum(1)
                elif (family_class, link) == (sm.families.Binomial,
                                              links.log):
                    lin_pred = -1 + exog.sum(1) / 8
                elif (family_class, link) == (sm.families.Poisson,
                                              links.sqrt):
                    lin_pred = 2 + exog.sum(1)
                elif (family_class, link) == (sm.families.InverseGaussian,
                                              links.log):
                    # skip_zero = True
                    lin_pred = -1 + exog.sum(1)
                elif (family_class, link) == (sm.families.InverseGaussian,
                                              links.identity):
                    lin_pred = 20 + 5 * exog.sum(1)
                    lin_pred = np.clip(lin_pred, 1e-4, np.inf)
                elif (family_class, link) == (sm.families.InverseGaussian,
                                              links.inverse_squared):
                    lin_pred = 0.5 + exog.sum(1) / 5
                    continue  # skip due to non-convergence
                elif (family_class, link) == (sm.families.InverseGaussian,
                                              links.inverse_power):
                    lin_pred = 1 + exog.sum(1) / 5
                elif (family_class, link) == (sm.families.NegativeBinomial,
                                              links.identity):
                    lin_pred = 20 + 5 * exog.sum(1)
                    lin_pred = np.clip(lin_pred, 1e-4, np.inf)
                elif (family_class, link) == (sm.families.NegativeBinomial,
                                              links.inverse_squared):
                    lin_pred = 0.1 + np.random.uniform(size=exog.shape[0])
                    continue  # skip due to non-convergence
                elif (family_class, link) == (sm.families.NegativeBinomial,
                                              links.inverse_power):
                    lin_pred = 1 + exog.sum(1) / 5
                elif (family_class, link) == (sm.families.Gaussian,
                                              links.inverse_power):
                    # adding skip because of convergence failure
                    skip_one = True  # GH#4620
                # The following fails with the identity link, because
                # endog < 0:
                # elif family_class == fam.Gamma:
                #     lin_pred = (0.5 * exog.sum(1) +
                #                 np.random.uniform(size=exog.shape[0]))
                else:
                    lin_pred = np.random.uniform(size=exog.shape[0])

                endog = gen_endog(lin_pred, family_class, link, binom_version)

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    mod_irls = sm.GLM(endog, exog,
                                      family=family_class(link=link()))
                    rslt_irls = mod_irls.fit(method="IRLS")

                if (family_class, link) not in [
                        (sm.families.Poisson, links.sqrt),
                        (sm.families.Gamma, links.inverse_power),
                        (sm.families.InverseGaussian, links.identity)]:
                    # GH#4620
                    check_score_hessian(rslt_irls)

                # Try with and without starting values.
                for max_start_irls, start_params in [(0, rslt_irls.params),
                                                     (3, None)]:
                    # TODO: skip convergence failures for now
                    if max_start_irls > 0 and skip_one:
                        continue
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        mod_gradient = sm.GLM(
                            endog, exog, family=family_class(link=link()))
                        rslt_gradient = mod_gradient.fit(
                            max_start_irls=max_start_irls,
                            start_params=start_params,
                            method="newton", maxiter=300)

                    assert_allclose(rslt_gradient.params, rslt_irls.params,
                                    rtol=1e-6, atol=5e-5)
                    assert_allclose(rslt_gradient.llf, rslt_irls.llf,
                                    rtol=1e-6, atol=1e-6)
                    assert_allclose(rslt_gradient.scale, rslt_irls.scale,
                                    rtol=1e-6, atol=1e-6)

                    # Get the standard errors using expected information.
                    ehess = mod_gradient.hessian(rslt_gradient.params,
                                                 observed=False)
                    gradient_bse = np.sqrt(-np.diag(np.linalg.inv(ehess)))
                    assert_allclose(gradient_bse, rslt_irls.bse,
                                    rtol=1e-6, atol=5e-5)

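# ``check_score_hessian`` above is defined elsewhere in this test module.
# Below is a minimal sketch of the kind of consistency check it plausibly
# performs, using statsmodels' numerical-differentiation helpers; the real
# helper may differ in tolerances and in where it evaluates (this sketch is
# illustrative, not the actual implementation):
def _check_score_hessian_sketch(results):
    from statsmodels.tools.numdiff import approx_fprime, approx_hess

    model = results.model
    # Evaluate away from the MLE, where the score would be ~0 and the
    # comparison uninformative.
    params = results.params * 0.98
    # Analytic score vs. numerical gradient of the log-likelihood.
    score = model.score(params)
    score_num = approx_fprime(params, model.loglike, centered=True)
    assert_allclose(score, score_num, rtol=1e-4, atol=1e-4)
    # Analytic observed hessian vs. numerical second derivative.
    hess = model.hessian(params, observed=True)
    hess_num = approx_hess(params, model.loglike)
    assert_allclose(hess, hess_num, rtol=1e-4, atol=1e-4)
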
def test_wtd_gradient_irls():
    # Compare the results when using gradient optimization and IRLS.
    # TODO: Find working examples for inverse_squared link
    np.random.seed(87342)

    fam = sm.families
    lnk = sm.families.links
    families = [(fam.Binomial, [lnk.logit, lnk.probit, lnk.cloglog,
                                lnk.log, lnk.cauchy]),
                (fam.Poisson, [lnk.log, lnk.identity, lnk.sqrt]),
                (fam.Gamma, [lnk.log, lnk.identity, lnk.inverse_power]),
                (fam.Gaussian, [lnk.identity, lnk.log, lnk.inverse_power]),
                (fam.InverseGaussian, [lnk.log, lnk.identity,
                                       lnk.inverse_power,
                                       lnk.inverse_squared]),
                (fam.NegativeBinomial, [lnk.log, lnk.inverse_power,
                                        lnk.inverse_squared, lnk.identity])]

    n = 100
    p = 3
    exog = np.random.normal(size=(n, p))
    exog[:, 0] = 1

    skip_one = False
    for family_class, family_links in families:
        for link in family_links:
            for binom_version in [0, 1]:
                method = 'bfgs'

                if family_class != fam.Binomial and binom_version == 1:
                    continue
                elif family_class == fam.Binomial and link == lnk.cloglog:
                    # Can't get gradient to converge with var_weights here
                    continue
                elif family_class == fam.Binomial and link == lnk.log:
                    # Can't get gradient to converge with var_weights here
                    continue
                elif (family_class, link) == (fam.Poisson, lnk.identity):
                    lin_pred = 20 + exog.sum(1)
                elif (family_class, link) == (fam.Binomial, lnk.log):
                    lin_pred = -1 + exog.sum(1) / 8
                elif (family_class, link) == (fam.Poisson, lnk.sqrt):
                    lin_pred = -2 + exog.sum(1)
                elif (family_class, link) == (fam.Gamma, lnk.log):
                    # Can't get gradient to converge with var_weights here
                    continue
                elif (family_class, link) == (fam.Gamma, lnk.identity):
                    # Can't get gradient to converge with var_weights here
                    continue
                elif (family_class, link) == (fam.Gamma, lnk.inverse_power):
                    # Can't get gradient to converge with var_weights here
                    continue
                elif (family_class, link) == (fam.Gaussian, lnk.log):
                    # Can't get gradient to converge with var_weights here
                    continue
                elif (family_class, link) == (fam.Gaussian,
                                              lnk.inverse_power):
                    # Can't get gradient to converge with var_weights here
                    continue
                elif (family_class, link) == (fam.InverseGaussian, lnk.log):
                    # Can't get gradient to converge with var_weights here
                    lin_pred = -1 + exog.sum(1)
                    continue
                elif (family_class, link) == (fam.InverseGaussian,
                                              lnk.identity):
                    # Can't get gradient to converge with var_weights here
                    lin_pred = 20 + 5 * exog.sum(1)
                    lin_pred = np.clip(lin_pred, 1e-4, np.inf)
                    continue
                elif (family_class, link) == (fam.InverseGaussian,
                                              lnk.inverse_squared):
                    lin_pred = 0.5 + exog.sum(1) / 5
                    continue  # skip due to non-convergence
                elif (family_class, link) == (fam.InverseGaussian,
                                              lnk.inverse_power):
                    lin_pred = 1 + exog.sum(1) / 5
                    method = 'newton'
                elif (family_class, link) == (fam.NegativeBinomial,
                                              lnk.identity):
                    lin_pred = 20 + 5 * exog.sum(1)
                    lin_pred = np.clip(lin_pred, 1e-3, np.inf)
                    method = 'newton'
                elif (family_class, link) == (fam.NegativeBinomial,
                                              lnk.inverse_squared):
                    lin_pred = 0.1 + np.random.uniform(size=exog.shape[0])
                    continue  # skip due to non-convergence
                elif (family_class, link) == (fam.NegativeBinomial,
                                              lnk.inverse_power):
                    # Can't get gradient to converge with var_weights here
                    lin_pred = 1 + exog.sum(1) / 5
                    continue
                else:
                    lin_pred = np.random.uniform(size=exog.shape[0])

                endog = gen_endog(lin_pred, family_class, link, binom_version)
                if binom_version == 0:
                    wts = np.ones_like(endog)
                    tmp = np.random.randint(
                        2, 5, size=(endog > endog.mean()).sum())
                    wts[endog > endog.mean()] = tmp
                else:
                    wts = np.ones(shape=endog.shape[0])
                    y = endog[:, 0] / endog.sum(axis=1)
                    tmp = np.random.gamma(2, size=(y > y.mean()).sum())
                    wts[y > y.mean()] = tmp

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    mod_irls = sm.GLM(endog, exog, var_weights=wts,
                                      family=family_class(link=link()))
                    rslt_irls = mod_irls.fit(method="IRLS", atol=1e-10,
                                             tol_criterion='params')

                # Try with and without starting values.
                for max_start_irls, start_params in ((0, rslt_irls.params),
                                                     (3, None)):
                    # TODO: skip convergence failures for now
                    if max_start_irls > 0 and skip_one:
                        continue
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        mod_gradient = sm.GLM(
                            endog, exog, var_weights=wts,
                            family=family_class(link=link()))
                        rslt_gradient = mod_gradient.fit(
                            max_start_irls=max_start_irls,
                            start_params=start_params,
                            method=method)

                    assert_allclose(rslt_gradient.params, rslt_irls.params,
                                    rtol=1e-6, atol=5e-5)
                    assert_allclose(rslt_gradient.llf, rslt_irls.llf,
                                    rtol=1e-6, atol=1e-6)
                    assert_allclose(rslt_gradient.scale, rslt_irls.scale,
                                    rtol=1e-6, atol=1e-6)

                    # Get the standard errors using expected information.
                    ehess = mod_gradient.hessian(rslt_gradient.params,
                                                 observed=False)
                    gradient_bse = np.sqrt(-np.diag(np.linalg.inv(ehess)))
                    assert_allclose(gradient_bse, rslt_irls.bse,
                                    rtol=1e-6, atol=5e-5)

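# Note on the ``bse`` comparison at the end of both gradient tests: IRLS is
# Fisher scoring, so ``rslt_irls.bse`` is based on the *expected* information
# matrix, while a newton/bfgs fit reports *observed*-information standard
# errors.  Recomputing
#
#     bse_j = sqrt([(-E[H])^{-1}]_{jj}),
#     E[H]  = model.hessian(params, observed=False),
#
# puts the gradient fit on the same expected-information footing before
# asserting against the IRLS result.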