def test_zero_penalty():
    x, y, poly = multivariate_sample_data()
    alphas = [0, 0]
    gam_gs = GLMGam(y, smoother=poly, alpha=alphas)
    gam_gs_res = gam_gs.fit()
    y_est_gam = gam_gs_res.predict()

    glm = GLM(y, poly.basis).fit()
    y_est = glm.predict()

    assert_allclose(y_est, y_est_gam)
def init(cls):
    cls.res2 = cls.mod2.fit()
    mod = GLM(cls.endog, cls.exogc,
              offset=0.5 * cls.exog[:, cls.idx_c].squeeze())
    mod.exog_names[:] = ['const', 'x2', 'x3', 'x4']
    cls.res1 = mod.fit()
    cls.idx_p_uc = np.arange(cls.exogc.shape[1])
def ppglmfit(X, Y):
    '''
    The GLM solver in statsmodels is very general. It accepts any link
    function and expects that, if you want a constant term in your model,
    you have already added a column of ones to your design matrix. This
    wrapper simplifies using GLM to fit the common case of a Poisson
    point-process model, where the constant term has not been explicitly
    added to the design matrix.

    Args:
        X: N_observations x N_features design matrix.
        Y: Binary point process observations.

    Returns:
        μ, B: the offset and parameter estimates for the GLM model.
    '''
    if mean(Y) > 0.1:
        print('Caution: spike rate very high, is Poisson assumption valid?')
    if sum(Y) < 100:
        print('Caution: fewer than 100 spikes to fit model')
    # add constant column to X, if the first column is not constant
    if not all(X[:, 0] == X[0, 0]):
        X = hstack([ones((shape(X)[0], 1), dtype=X.dtype), X])
    poisson_model = GLM(Y, X, family=Poisson())
    poisson_results = poisson_model.fit()
    M = poisson_results.params
    return M[0], M[1:]
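# A minimal usage sketch for ppglmfit. It assumes the names used inside the
# function (mean, sum, all, hstack, ones, shape, GLM, Poisson) are already in
# scope, e.g. via `from numpy import *` and the statsmodels imports implied
# above. The simulated spike train and design matrix below are hypothetical,
# purely for illustration.
import numpy as np

n_obs = 1000
X_demo = np.random.rand(n_obs, 2)                      # two hypothetical features
rate = np.exp(-2.0 + X_demo @ np.array([0.5, -0.3]))   # latent Poisson rate
Y_demo = (np.random.poisson(rate) > 0).astype(float)   # binarized point process
mu_hat, B_hat = ppglmfit(X_demo, Y_demo)
print('offset:', mu_hat, 'weights:', B_hat)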
def setup_class(cls):
    df = data_bin
    res = GLM(df['constrict'], df[['const', 'log_rate', 'log_volumne']],
              family=families.Binomial()).fit(attach_wls=True, atol=1e-10)

    cls.infl1 = res.get_influence()
    cls.infl0 = MLEInfluence(res)
def test_cov_params():
    np.random.seed(0)
    n = 1000
    x = np.random.uniform(0, 1, (n, 2))
    x = x - x.mean()
    y = x[:, 0] * x[:, 0] + np.random.normal(0, .01, n)
    y -= y.mean()

    bsplines = BSplines(x, degree=[3] * 2, df=[10] * 2, constraints='center')
    alpha = [0, 0]
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(method='pirls', max_start_irls=0,
                              disp=0, maxiter=5000)
    glm = GLM(y, bsplines.basis)
    res_glm = glm.fit()

    assert_allclose(res_glm.cov_params(), res_glm_gam.cov_params(),
                    rtol=0.0025)

    alpha = 1e-13
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(method='pirls', max_start_irls=0,
                              disp=0, maxiter=5000)
    assert_allclose(res_glm.cov_params(), res_glm_gam.cov_params(),
                    atol=1e-10)

    res_glm_gam = glm_gam.fit(method='bfgs', max_start_irls=0,
                              disp=0, maxiter=5000, maxfun=5000)
    assert_allclose(res_glm.cov_params(), res_glm_gam.cov_params(),
                    rtol=1e-4, atol=1e-8)
def test_poisson_residuals():
    nobs, k_exog = 100, 5
    np.random.seed(987125)
    x = np.random.randn(nobs, k_exog - 1)
    x = add_constant(x)

    y_true = x.sum(1) / 2
    y = y_true + 2 * np.random.randn(nobs)
    exposure = 1 + np.arange(nobs) // 4

    yp = np.random.poisson(np.exp(y_true) * exposure)
    yp[10:15] += 10

    fam = sm.families.Poisson()
    mod_poi_e = GLM(yp, x, family=fam, exposure=exposure)
    res_poi_e = mod_poi_e.fit()

    mod_poi_w = GLM(yp / exposure, x, family=fam, var_weights=exposure)
    res_poi_w = mod_poi_w.fit()

    assert_allclose(res_poi_e.resid_response / exposure,
                    res_poi_w.resid_response)
    assert_allclose(res_poi_e.resid_pearson, res_poi_w.resid_pearson)
    assert_allclose(res_poi_e.resid_deviance, res_poi_w.resid_deviance)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        assert_allclose(res_poi_e.resid_anscombe, res_poi_w.resid_anscombe)
        assert_allclose(res_poi_e.resid_anscombe_unscaled,
                        res_poi_w.resid_anscombe)
def setup_class(cls):
    cls.res2 = results_st.results_poisson_hc1
    mod = GLM(endog, exog, family=families.Poisson())
    cls.res1 = mod.fit(cov_type='HC1')

    cls.bse_rob = cls.res1.bse
    cls.corr_fact = cls.get_correction_factor(cls.res1, sub_kparams=False)
def setup_class(cls):
    cls.cov_type = 'cluster'

    mod1 = GLM(endog, exog, family=families.Gaussian())
    cls.res1 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))

    mod2 = OLS(endog, exog)
    cls.res2 = mod2.fit(cov_type='cluster', cov_kwds=dict(groups=group))
def setup_class(cls):
    cls.cov_type = 'HC0'

    mod1 = GLM(endog, exog, family=families.Gaussian())
    cls.res1 = mod1.fit(cov_type='HC0')

    mod2 = OLS(endog, exog)
    cls.res2 = mod2.fit(cov_type='HC0')
def test_glm(self):
    # preliminary, getting started with basic test for GLM.get_prediction
    from statsmodels.genmod.generalized_linear_model import GLM

    res_wls = self.res_wls
    mod_wls = res_wls.model
    y, X, wi = mod_wls.endog, mod_wls.exog, mod_wls.weights

    w_sqrt = np.sqrt(wi)  # notation: wi is weights, `w` is var
    mod_glm = GLM(y * w_sqrt, X * w_sqrt[:, None])

    # compare using t distribution
    res_glm = mod_glm.fit(use_t=True)
    pred_glm = res_glm.get_prediction()
    sf_glm = pred_glm.summary_frame()

    pred_res_wls = res_wls.get_prediction()
    sf_wls = pred_res_wls.summary_frame()
    n_compare = 30  # in glm with predict wendog
    assert_allclose(sf_glm.values[:n_compare],
                    sf_wls.values[:n_compare, :4])

    # compare using normal distribution
    res_glm = mod_glm.fit()  # default use_t=False
    pred_glm = res_glm.get_prediction()
    sf_glm = pred_glm.summary_frame()

    res_wls = mod_wls.fit(use_t=False)
    pred_res_wls = res_wls.get_prediction()
    sf_wls = pred_res_wls.summary_frame()
    assert_allclose(sf_glm.values[:n_compare],
                    sf_wls.values[:n_compare, :4])

    # function for parameter transformation
    # should be separate test method
    from statsmodels.genmod._prediction import params_transform_univariate

    rates = params_transform_univariate(res_glm.params, res_glm.cov_params())

    rates2 = np.column_stack((np.exp(res_glm.params),
                              res_glm.bse * np.exp(res_glm.params),
                              np.exp(res_glm.conf_int())))
    assert_allclose(rates.summary_frame().values, rates2, rtol=1e-13)

    from statsmodels.genmod.families import links

    # with identity transform
    pt = params_transform_univariate(res_glm.params, res_glm.cov_params(),
                                     link=links.identity())

    assert_allclose(pt.tvalues, res_glm.tvalues, rtol=1e-13)
    assert_allclose(pt.se_mean, res_glm.bse, rtol=1e-13)
    ptt = pt.t_test()
    assert_allclose(ptt[0], res_glm.tvalues, rtol=1e-13)
    assert_allclose(ptt[1], res_glm.pvalues, rtol=1e-13)

    # prediction with exog and no weights does not error
    res_glm = mod_glm.fit()
    pred_glm = res_glm.get_prediction(X)
def init(cls):
    cov_type = 'HC0'
    cls.res2 = cls.mod2.fit(cov_type=cov_type)
    mod = GLM(cls.endog, cls.exogc,
              offset=0.5 * cls.exog[:, cls.idx_c].squeeze(),
              var_weights=cls.aweights)
    mod.exog_names[:] = ['const', 'x2', 'x3', 'x4']
    cls.res1 = mod.fit(cov_type=cov_type)
    cls.idx_p_uc = np.arange(cls.exogc.shape[1])
def setup_class(cls):
    endog_bin = (endog > endog.mean()).astype(int)
    cls.cov_type = 'cluster'

    mod1 = GLM(endog_bin, exog, family=families.Binomial())
    cls.res1 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))

    mod1 = smd.Logit(endog_bin, exog)
    cls.res2 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))
def setup_class(cls):
    endog_bin = (endog > endog.mean()).astype(int)
    cls.cov_type = 'cluster'

    # CDFLink (normal CDF) with the Binomial family corresponds to Probit
    mod1 = GLM(endog_bin, exog, family=families.Binomial(link=links.CDFLink()))
    cls.res1 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))

    mod1 = smd.Probit(endog_bin, exog)
    cls.res2 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))
def setup_class(cls):
    cls.res2 = results_st.results_poisson_hc1
    mod = GLM(endog, exog, family=families.Poisson())
    cls.res1 = mod.fit()

    # res_hc0_ = cls.res1.get_robustcov_results('HC1')
    get_robustcov_results(cls.res1._results, 'HC1', use_self=True)
    cls.bse_rob = cls.res1.bse

    cls.corr_fact = cls.get_correction_factor(cls.res1, sub_kparams=False)
def setup_class(cls):
    cls.cov_type = 'HAC'

    kwds = {'maxlags': 2}
    mod1 = GLM(endog, exog, family=families.Gaussian())
    cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

    mod2 = OLS(endog, exog)
    cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds)
def setup_class(cls):
    cls.res2 = results_st.results_poisson_hc1
    mod = GLM(endog, exog, family=families.Poisson())
    cls.res1 = mod.fit(cov_type='HC1')

    cls.bse_rob = cls.res1.bse
    nobs, k_vars = mod.exog.shape
    corr_fact = nobs / float(nobs - 1.)
    # for bse we need sqrt of correction factor
    cls.corr_fact = np.sqrt(1. / corr_fact)
def _initialize(cls):
    y, x = cls.y, cls.x

    modp = GLM(y, x, family=family.Poisson())
    cls.res2 = modp.fit()

    mod = GLMPenalized(y, x, family=family.Poisson(), penal=cls.penalty)
    mod.pen_weight = 0
    cls.res1 = mod.fit(method='bfgs', maxiter=100, disp=0)

    cls.atol = 5e-6
def setup_class(cls):
    endog_bin = (endog > endog.mean()).astype(int)
    cls.cov_type = 'cluster'

    mod1 = GLM(endog_bin, exog,
               family=families.Binomial(link=links.probit()))
    cls.res1 = mod1.fit(method='newton',
                        cov_type='cluster', cov_kwds=dict(groups=group))

    mod1 = smd.Probit(endog_bin, exog)
    cls.res2 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))
    cls.rtol = 1e-6
def setup_class(cls):
    yi = np.array([0, 2, 14, 19, 30])
    ni = 40 * np.ones(len(yi))
    xi = np.arange(1, len(yi) + 1)
    exog = np.column_stack((np.ones(len(yi)), xi))
    endog = np.column_stack((yi, ni - yi))

    res = GLM(endog, exog, family=families.Binomial()).fit()

    cls.infl1 = res.get_influence()
    cls.infl0 = MLEInfluence(res)
    cls.cd_rtol = 5e-5
def setup_class(cls):
    cls.cov_type = 'HAC'

    # check kernel specified as string
    kwds = {'kernel': 'bartlett', 'maxlags': 2}
    mod1 = GLM(endog, exog, family=families.Gaussian())
    cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

    mod2 = OLS(endog, exog)
    kwds2 = {'maxlags': 2}
    cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds2)
def setup_class(cls):
    cls.res2 = results_st.results_poisson_hc1
    mod = GLM(endog, exog, family=families.Poisson())
    cls.res1 = mod.fit()

    # res_hc0_ = cls.res1.get_robustcov_results('HC1')
    get_robustcov_results(cls.res1._results, 'HC1', use_self=True)
    cls.bse_rob = cls.res1.bse

    nobs, k_vars = mod.exog.shape
    corr_fact = nobs / float(nobs - 1.)
    # for bse we need sqrt of correction factor
    cls.corr_fact = np.sqrt(1. / corr_fact)
def setup_class(cls):
    cls.cov_type = 'HAC'

    kwds = {'kernel': sw.weights_uniform, 'maxlags': 2}
    mod1 = GLM(endog, exog, family=families.Gaussian())
    cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

    # check kernel as string
    mod2 = OLS(endog, exog)
    kwds2 = {'kernel': 'uniform', 'maxlags': 2}
    cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds2)
def _initialize(cls):
    y, x = cls.y, cls.x

    modp = GLM(y, x[:, :cls.k_nonzero], family=family.Poisson())
    cls.res2 = modp.fit()

    mod = GLMPenalized(y, x, family=family.Poisson(), penal=cls.penalty)
    mod.pen_weight *= 1.5  # same as discrete Poisson
    mod.penal.tau = 0.05
    cls.res1 = mod.fit(method='bfgs', maxiter=100)

    cls.exog_index = slice(None, cls.k_nonzero, None)
    cls.atol = 5e-3
def _initialize(cls):
    y, x = cls.y, cls.x

    modp = GLM(y, x[:, :cls.k_nonzero], family=family.Binomial())
    cls.res2 = modp.fit(disp=0)

    mod = GLMPenalized(y, x, family=family.Binomial(), penal=cls.penalty)
    mod.pen_weight *= .5
    mod.penal.tau = 0.05
    cls.res1 = mod.fit(method='bfgs', maxiter=100, disp=0)

    cls.exog_index = slice(None, cls.k_nonzero, None)
    cls.atol = 5e-3
def setup_class(cls):
    cls.cov_type = 'HAC'

    kwds = {'kernel': sw.weights_uniform, 'maxlags': 2}
    mod1 = GLM(endog, exog, family=families.Gaussian())
    cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

    mod2 = OLS(endog, exog)
    cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds)

    # for debugging
    cls.res3 = mod2.fit(cov_type='HAC', cov_kwds={'maxlags': 2})
def setup_class(cls):
    cls.cov_type = 'hac-panel'
    # time index is just made up to have a test case
    groups = np.repeat(np.arange(5), 7)[:-1]
    mod1 = GLM(endog.copy(), exog.copy(), family=families.Gaussian())
    kwds = dict(groups=pd.Series(groups),  # check for #3606
                maxlags=2,
                kernel=sw.weights_uniform,
                use_correction='hac',
                df_correction=False)
    cls.res1 = mod1.fit(cov_type='hac-panel', cov_kwds=kwds)

    mod2 = OLS(endog, exog)
    cls.res2 = mod2.fit(cov_type='hac-panel', cov_kwds=kwds)
def setup_class(cls):
    cls.idx = slice(None)  # params sequence same as Stata
    # res1ul = Logit(data.endog, data.exog).fit(method="newton", disp=0)
    cls.res2 = reslogit.results_constraint2

    mod1 = GLM(spector_data.endog, spector_data.exog,
               family=families.Binomial())

    constr = 'x1 - x3 = 0'
    cls.res1m = mod1.fit_constrained(constr, atol=1e-10)

    R, q = cls.res1m.constraints.coefs, cls.res1m.constraints.constants
    cls.res1 = fit_constrained(mod1, R, q, fit_kwds={'atol': 1e-10})
    cls.constraints_rq = (R, q)
def setup_class(cls):
    cls.res2 = results_st.results_poisson_clu
    mod = GLM(endog, exog, family=families.Poisson())
    cls.res1 = res1 = mod.fit()

    get_robustcov_results(cls.res1._results, 'cluster',
                          groups=group,
                          use_correction=True,
                          df_correction=True,  # TODO has no effect
                          use_t=False,  # True,
                          use_self=True)
    cls.bse_rob = cls.res1.bse

    cls.corr_fact = cls.get_correction_factor(cls.res1)
def setup_class(cls):
    cls.cov_type = 'hac-groupsum'
    # time index is just made up to have a test case
    time = np.tile(np.arange(7), 5)[:-1]
    mod1 = GLM(endog, exog, family=families.Gaussian())
    kwds = dict(time=pd.Series(time),  # check for #3606
                maxlags=2,
                use_correction='hac',
                df_correction=False)
    cls.res1 = mod1.fit(cov_type='hac-groupsum', cov_kwds=kwds)
    cls.res1b = mod1.fit(cov_type='nw-groupsum', cov_kwds=kwds)

    mod2 = OLS(endog, exog)
    cls.res2 = mod2.fit(cov_type='hac-groupsum', cov_kwds=kwds)
def setup_class(cls):
    cls.cov_type = 'hac-panel'
    # time index is just made up to have a test case
    time = np.tile(np.arange(7), 5)[:-1]
    mod1 = GLM(endog.copy(), exog.copy(), family=families.Gaussian())
    kwds = dict(time=time,
                maxlags=2,
                kernel=sw.weights_uniform,
                use_correction='hac',
                df_correction=False)
    cls.res1 = mod1.fit(cov_type='hac-panel', cov_kwds=kwds)
    cls.res1b = mod1.fit(cov_type='nw-panel', cov_kwds=kwds)

    mod2 = OLS(endog, exog)
    cls.res2 = mod2.fit(cov_type='hac-panel', cov_kwds=kwds)
def __init__(self):
    '''
    Test Binomial family with canonical logit link using star98 dataset.
    '''
    self.decimal_resids = DECIMAL_1
    self.decimal_bic = DECIMAL_2

    from statsmodels.datasets.star98 import load
    from .results.results_glm import Star98
    data = load()
    data.exog = add_constant(data.exog, prepend=False)
    self.res1 = GLM(data.endog, data.exog,
                    family=sm.families.Binomial()).fit()
    # NOTE: if you want to replicate with RModel
    # res2 = RModel(data.endog[:, 0]/trials, data.exog, r.glm,
    #               family=r.binomial, weights=trials)
    self.res2 = Star98()
def __init__(self):
    '''
    Tests Poisson family with canonical log link.

    Test results were obtained by R.
    '''
    from .results.results_glm import Cpunish
    from statsmodels.datasets.cpunish import load
    self.data = load()
    self.data.exog[:, 3] = np.log(self.data.exog[:, 3])
    self.data.exog = add_constant(self.data.exog, prepend=False)
    self.res1 = GLM(self.data.endog, self.data.exog,
                    family=sm.families.Poisson()).fit()
    self.res2 = Cpunish()
    # compare with discrete, start close to save time
    modd = discrete.Poisson(self.data.endog, self.data.exog)
    self.resd = modd.fit(start_params=self.res1.params * 0.9, disp=False)
def _delta_hat_estimation(self, temp_y, temp_x, temp_t):
    """Estimates delta to correct treatment estimation"""
    H_a = []
    for idx, treatment in enumerate(np.asarray(temp_t)):
        if treatment == 1:
            H_a.append(1 / self.pi_hat1[idx])
        elif treatment == 0:
            H_a.append(-1 / self.pi_hat0[idx])
    H_a = np.array(H_a)

    # Create GLM using H_a as exog and y_hat_a as a forced offset
    targeting_model = GLM(endog=np.asarray(temp_y), exog=H_a,
                          offset=np.asarray(self.y_hat_a)).fit()
    return targeting_model.params[0]
def setup_class(cls):
    from statsmodels.base._constraints import fit_constrained

    cls.res2 = results.results_noexposure_constraint
    # 2 is dropped baseline for categorical
    cls.idx = [7, 3, 4, 5, 6, 0, 1]

    # example without offset
    formula = 'deaths ~ logpyears + smokes + C(agecat)'
    mod = GLM.from_formula(formula, data=data, family=families.Poisson())

    constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
    lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
    cls.res1 = fit_constrained(mod, lc.coefs, lc.constants,
                               fit_kwds={'atol': 1e-10})
    cls.constraints = lc
    cls.res1m = mod.fit_constrained(constr, atol=1e-10)
def setup_class(cls):
    fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
    # faking aweights by using normalized freq_weights
    fweights = np.array(fweights)
    wsum = fweights.sum()
    nobs = len(cpunish_data.endog)
    aweights = fweights / wsum * nobs

    cls.corr_fact = np.sqrt((wsum - 1.) / wsum)
    cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                   family=sm.families.Poisson(),
                   freq_weights=fweights
                   ).fit(cov_type='HC0')  # , cov_kwds={'use_correction':False})
    # compare with discrete, start close to save time
    # modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
    cls.res2 = res_stata.results_poisson_fweight_hc1
def __init__(self):
    '''
    Test Gaussian family with canonical identity link
    '''
    # Test Precisions
    self.decimal_resids = DECIMAL_3
    self.decimal_params = DECIMAL_2
    self.decimal_bic = DECIMAL_0
    self.decimal_bse = DECIMAL_3

    from statsmodels.datasets.longley import load
    self.data = load()
    self.data.exog = add_constant(self.data.exog, prepend=False)
    self.res1 = GLM(self.data.endog, self.data.exog,
                    family=sm.families.Gaussian()).fit()
    from .results.results_glm import Longley
    self.res2 = Longley()
def test_warnings_raised():
    weights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
    # faking aweights by using normalized freq_weights
    weights = np.array(weights)

    gid = np.arange(1, 17 + 1) // 2
    cov_kwds = {'groups': gid, 'use_correction': False}

    with pytest.warns(SpecificationWarning):
        res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                   family=sm.families.Poisson(),
                   freq_weights=weights
                   ).fit(cov_type='cluster', cov_kwds=cov_kwds)
        res1.summary()

    with pytest.warns(SpecificationWarning):
        res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                   family=sm.families.Poisson(),
                   var_weights=weights
                   ).fit(cov_type='cluster', cov_kwds=cov_kwds)
        res1.summary()
def test_est_unregularized_naive():
    # tests that the shape of all the intermediate steps
    # remains correct for unregularized naive estimation,
    # does this for OLS and GLM
    np.random.seed(435265)
    X = np.random.normal(size=(50, 3))
    y = np.random.randint(0, 2, size=50)
    beta = np.random.normal(size=3)

    mod = OLS(y, X)
    res = _est_unregularized_naive(mod, 0, 2, fit_kwds={"alpha": 0.5})
    assert_equal(res.shape, beta.shape)

    mod = GLM(y, X, family=Binomial())
    res = _est_unregularized_naive(mod, 0, 2, fit_kwds={"alpha": 0.5})
    assert_equal(res.shape, beta.shape)
def setup_class(cls):
    fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
    # faking aweights by using normalized freq_weights
    fweights = np.array(fweights)
    wsum = fweights.sum()
    nobs = len(cpunish_data.endog)
    aweights = fweights / wsum * nobs

    cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                   family=sm.families.Poisson(),
                   var_weights=aweights
                   ).fit()
    # Need to copy to avoid inplace adjustment
    from copy import copy
    cls.res2 = copy(res_stata.results_poisson_aweight_nonrobust)
    cls.res2.resids = cls.res2.resids.copy()

    # Need to adjust resids for pearson and deviance to add weights
    cls.res2.resids[:, 3:5] *= np.sqrt(aweights[:, np.newaxis])
def __init__(self):
    '''
    Tests Gamma family with canonical inverse link (power -1)
    '''
    # Test Precisions
    self.decimal_aic_R = -1  # TODO: off by about 1, we are right with Stata
    self.decimal_resids = DECIMAL_2

    from statsmodels.datasets.scotland import load
    from .results.results_glm import Scotvote
    data = load()
    data.exog = add_constant(data.exog)
    res1 = GLM(data.endog, data.exog,
               family=sm.families.Gamma()).fit()
    self.res1 = res1
    # res2 = RModel(data.endog, data.exog, r.glm, family=r.Gamma)
    res2 = Scotvote()
    res2.aic_R += 2  # R does not count degree of freedom for scale with gamma
    self.res2 = res2
def init(self):
    nobs = self.nobs
    y_true, x, exog = self.y_true, self.x, self.exog
    if not hasattr(self, 'scale'):
        scale = 1
    else:
        scale = self.scale

    f = self.family

    self.mu_true = mu_true = f.link.inverse(y_true)

    np.random.seed(8765993)
    # y_obs = np.asarray([stats.poisson.rvs(p) for p in mu], float)
    y_obs = self.rvs(mu_true, scale=scale, size=nobs)  # this should work
    m = GAM(y_obs, x, family=f)  # TODO: y_obs is twice, in __init__ and fit
    m.fit(y_obs, maxiter=100)
    res_gam = m.results
    self.res_gam = res_gam  # attached for debugging
    self.mod_gam = m  # attached for debugging

    res_glm = GLM(y_obs, exog, family=f).fit()

    # Note: there still are some naming inconsistencies
    self.res1 = res1 = Dummy()  # for gam model
    # res2 = Dummy()  # for benchmark
    self.res2 = res2 = res_glm  # reuse existing glm results, will add additional

    # eta in GLM terminology
    res2.y_pred = res_glm.model.predict(res_glm.params, exog, linear=True)
    res1.y_pred = res_gam.predict(x)
    res1.y_predshort = res_gam.predict(x[:10])  # , linear=True)

    # mu
    res2.mu_pred = res_glm.model.predict(res_glm.params, exog, linear=False)
    res1.mu_pred = res_gam.mu

    # parameters
    slopes = [i for ss in m.smoothers for i in ss.params[1:]]

    const = res_gam.alpha + sum([ss.params[1] for ss in m.smoothers])
    res1.params = np.array([const] + slopes)
def test_score_test_OLS():
    # nicer example than Longley
    from statsmodels.regression.linear_model import OLS
    np.random.seed(5)
    nobs = 100
    sige = 0.5
    x = np.random.uniform(0, 1, size=(nobs, 5))
    x[:, 0] = 1
    beta = 1. / np.arange(1., x.shape[1] + 1)
    y = x.dot(beta) + sige * np.random.randn(nobs)

    res_ols = OLS(y, x).fit()
    res_olsc = OLS(y, x[:, :-2]).fit()
    co = res_ols.compare_lm_test(res_olsc, demean=False)

    res_glm = GLM(y, x[:, :-2], family=sm.families.Gaussian()).fit()
    co2 = res_glm.model.score_test(res_glm.params, exog_extra=x[:, -2:])
    # difference in df_resid versus nobs in scale, see #1786
    assert_allclose(co[0] * 97 / 100., co2[0], rtol=1e-13)
def setup_class(cls):
    fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
    # faking aweights by using normalized freq_weights
    fweights = np.array(fweights)
    wsum = fweights.sum()
    nobs = len(cpunish_data.endog)
    aweights = fweights / wsum * nobs

    # This is really close when corr_fact = (wsum - 1.) / wsum, but to
    # avoid having to loosen the precision of the assert_allclose, I'm
    # doing this manually. It's *possible* that lowering the IRLS
    # convergence criterion in Stata and here will make this less sketchy.
    cls.corr_fact = np.sqrt((wsum - 1.) / wsum) * 0.98518473599905609
    cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                   family=sm.families.Poisson(),
                   var_weights=aweights
                   ).fit(cov_type='HC0')  # , cov_kwds={'use_correction':False})
    # compare with discrete, start close to save time
    # modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
    cls.res2 = res_stata.results_poisson_aweight_hc1
def test_basic(self):
    res1 = self.res1
    res2 = self.res2
    assert_allclose(self.eff, res2.TE, rtol=1e-13)
    assert_allclose(self.var_eff, res2.seTE**2, rtol=1e-13)

    assert_allclose(res1.mean_effect_fe, res2.TE_fixed, rtol=1e-13)
    # R meta does not adjust sd FE for HKSJ
    assert_allclose(res1.sd_eff_w_fe, res2.seTE_fixed, rtol=1e-13)
    assert_allclose(res1.q, res2.Q, rtol=1e-13)
    assert_allclose(res1.tau2, res2.tau2, rtol=1e-10)

    assert_allclose(res1.mean_effect_re, res2.TE_random, rtol=1e-13)
    assert_allclose(res1.sd_eff_w_re_hksj, res2.seTE_random, rtol=1e-13)

    th = res1.test_homogeneity()
    q, pv = th
    df = th.df
    assert_allclose(q, res2.Q, rtol=1e-13)
    assert_allclose(pv, res2.pval_Q, rtol=1e-13)
    assert_allclose(df, res2.df_Q, rtol=1e-13)

    assert_allclose(res1.i2, res2.I2, rtol=1e-13)
    assert_allclose(res1.h2, res2.H**2, rtol=1e-13)

    ci = res1.conf_int(use_t=True)  # fe, re, fe_wls, re_wls
    # R meta does not adjust FE for HKSJ, still uses normal dist
    # assert_allclose(ci[0][0], res2.lower_fixed, atol=1e-10)
    # assert_allclose(ci[0][1], res2.upper_fixed, atol=1e-10)
    assert_allclose(ci[3][0], res2.lower_random, rtol=1e-13)
    assert_allclose(ci[3][1], res2.upper_random, rtol=1e-10)

    ci = res1.conf_int(use_t=False)  # fe, re, fe_wls, re_wls
    assert_allclose(ci[0][0], res2.lower_fixed, rtol=1e-13)
    assert_allclose(ci[0][1], res2.upper_fixed, rtol=1e-13)

    weights = 1 / self.var_eff
    mod_glm = GLM(self.eff, np.ones(len(self.eff)),
                  var_weights=weights)
    res_glm = mod_glm.fit()
    assert_allclose(res_glm.params, res2.TE_fixed, rtol=1e-13)

    weights = 1 / (self.var_eff + res1.tau2)
    mod_glm = GLM(self.eff, np.ones(len(self.eff)),
                  var_weights=weights)
    res_glm = mod_glm.fit()
    assert_allclose(res_glm.params, res2.TE_random, rtol=1e-13)
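# A small standalone sketch (with hypothetical numbers) of the equivalence the
# GLM comparison above relies on: a Gaussian GLM of the effects on a constant,
# with var_weights equal to the inverse variances, reproduces the
# inverse-variance weighted (fixed-effect) mean.
import numpy as np
from statsmodels.genmod.generalized_linear_model import GLM

eff = np.array([0.2, 0.5, 0.1, 0.4])        # hypothetical study effects
var_eff = np.array([0.04, 0.09, 0.01, 0.16])  # hypothetical study variances
w = 1 / var_eff
fe_mean = (w * eff).sum() / w.sum()           # inverse-variance weighted mean
res = GLM(eff, np.ones(len(eff)), var_weights=w).fit()
np.testing.assert_allclose(res.params[0], fe_mean, rtol=1e-12)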
def setup_class(cls):
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families
    from statsmodels.base._constraints import fit_constrained

    cls.res2 = results.results_exposure_constraint
    cls.idx = [6, 2, 3, 4, 5, 0]  # 2 is dropped baseline for categorical

    # example with offset
    formula = 'deaths ~ smokes + C(agecat)'
    mod = GLM.from_formula(formula, data=data,
                           family=families.Poisson(),
                           offset=np.log(data['pyears'].values))

    constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
    lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
    cls.res1 = fit_constrained(mod, lc.coefs, lc.constants)
    cls.constraints = lc
    cls.res1m = mod.fit_constrained(constr)._results
def setup_class(cls):
    fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
    # faking aweights by using normalized freq_weights
    fweights = np.array(fweights)
    wsum = fweights.sum()
    nobs = len(cpunish_data.endog)
    aweights = fweights / wsum * nobs

    gid = np.arange(1, 17 + 1) // 2
    n_groups = len(np.unique(gid))

    # no wnobs yet in sandwich covariance calculation
    cls.corr_fact = 1 / np.sqrt(n_groups / (n_groups - 1))  # np.sqrt((wsum - 1.) / wsum)
    cov_kwds = {'groups': gid, 'use_correction': False}
    cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                   family=sm.families.Poisson(),
                   freq_weights=fweights
                   ).fit(cov_type='cluster', cov_kwds=cov_kwds)
    # compare with discrete, start close to save time
    # modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
    cls.res2 = res_stata.results_poisson_fweight_clu1
def __init__(self):
    '''
    Tests the Inverse Gaussian family in GLM.

    Notes
    -----
    Used the rndivgx.ado file provided by Hardin and Hilbe to
    generate the data. Results are read from model_results, which
    were obtained by running R_ig.s
    '''
    # Test Precisions
    self.decimal_aic_R = DECIMAL_0
    self.decimal_loglike = DECIMAL_0

    from .results.results_glm import InvGauss
    res2 = InvGauss()
    res1 = GLM(res2.endog, res2.exog,
               family=sm.families.InverseGaussian()).fit()
    self.res1 = res1
    self.res2 = res2
def compute_chi2_null_test(model_results, data, dep_var, max_iter, l2_weight):
    """
    Compute difference from null model using deviance:
    P(null) - P(model) ~ chi_2
    """
    null_formula = '%s ~ 1' % (dep_var)
    null_model = GLM.from_formula(null_formula, data,
                                  family=Binomial(link=logit()))
    null_model_results = null_model.fit_regularized(maxiter=max_iter,
                                                    method='elastic_net',
                                                    alpha=l2_weight,
                                                    L1_wt=0.0)
    model_loglike = model_results.model.loglike(model_results.params)
    null_model_loglike = null_model_results.model.loglike(
        null_model_results.params)
    llr = -2 * (null_model_loglike - model_loglike)
    model_df = model_results.model.df_model
    p_val = chi2.sf(llr, model_df)
    return llr, model_df, p_val
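# A minimal usage sketch for compute_chi2_null_test, assuming the same names
# it already relies on are in scope (GLM, Binomial, logit from statsmodels and
# chi2 from scipy.stats). The data frame, column names, and tuning values below
# are hypothetical, chosen only to illustrate the call.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({'x1': rng.normal(size=200), 'x2': rng.normal(size=200)})
df['outcome'] = (rng.uniform(size=200) < 1 / (1 + np.exp(-df['x1']))).astype(int)

# fit the full (regularized) model the same way the null model is fit above
full_model = GLM.from_formula('outcome ~ x1 + x2', df,
                              family=Binomial(link=logit()))
full_results = full_model.fit_regularized(maxiter=100, method='elastic_net',
                                          alpha=0.01, L1_wt=0.0)
llr, df_model, p_val = compute_chi2_null_test(full_results, df, 'outcome',
                                              max_iter=100, l2_weight=0.01)
print(llr, df_model, p_val)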
def test_est_regularized_debiased():
    # tests that the shape of all the intermediate steps
    # remains correct for regularized debiased estimation,
    # does this for OLS and GLM
    np.random.seed(435265)
    X = np.random.normal(size=(50, 3))
    y = np.random.randint(0, 2, size=50)
    beta = np.random.normal(size=3)

    mod = OLS(y, X)
    res = _est_regularized_debiased(mod, 0, 2, fit_kwds={"alpha": 0.5})
    bhat = res[0]
    grad = res[1]
    ghat_l = res[2]
    that_l = res[3]

    assert_(isinstance(res, tuple))
    assert_equal(bhat.shape, beta.shape)
    assert_equal(grad.shape, beta.shape)
    assert_(isinstance(ghat_l, list))
    assert_(isinstance(that_l, list))
    assert_equal(len(ghat_l), len(that_l))
    assert_equal(ghat_l[0].shape, (2,))
    assert_(isinstance(that_l[0], float))

    mod = GLM(y, X, family=Binomial())
    res = _est_regularized_debiased(mod, 0, 2, fit_kwds={"alpha": 0.5})
    bhat = res[0]
    grad = res[1]
    ghat_l = res[2]
    that_l = res[3]

    assert_(isinstance(res, tuple))
    assert_equal(bhat.shape, beta.shape)
    assert_equal(grad.shape, beta.shape)
    assert_(isinstance(ghat_l, list))
    assert_(isinstance(that_l, list))
    assert_equal(len(ghat_l), len(that_l))
    assert_equal(ghat_l[0].shape, (2,))
    assert_(isinstance(that_l[0], float))
def setup_class(cls):
    vs = Independence()
    family = families.Poisson()
    np.random.seed(987126)
    Y = np.exp(1 + np.random.normal(size=100))
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.random.randint(0, 4, size=100)

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

    mod1 = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                            family=family, cov_struct=vs)
    cls.result1 = mod1.fit()

    mod2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=D, family=family)
    cls.result2 = mod2.fit(disp=False)
def __init__(self, model, taylor):
    self.model = model
    self.stats = model.dm_statistics if hasattr(model, "dm_statistics") else None
    self.dm = pd.DataFrame(
        {
            lev: t.data[:, i]
            for t in model.fixed_terms.values()
            for i, lev in enumerate(t.levels)
        }
    )
    self.priors = {}
    missing = "drop" if self.model.dropna else "none"
    self.mle = GLM(
        endog=self.model.y.data,
        exog=self.dm,
        family=self.model.family.smfamily(),
        missing=missing,
    ).fit()
    self.taylor = taylor
    with open(join(dirname(__file__), "config", "derivs.txt"), "r") as file:
        self.deriv = [next(file).strip("\n") for x in range(taylor + 1)]
def _fit_mle(self):
    """Fits MLE of the common part of the model.

    This used to be called in the class instantiation, but there is no need
    to fit the GLM when there are no automatic priors. So this method is
    only called when needed.
    """
    missing = "drop" if self.model.dropna else "none"
    try:
        self.mle = GLM(
            endog=self.model.response.data,
            exog=self.dm,
            family=self.model.family.smfamily(self.model.family.smlink),
            missing=missing,
        ).fit()
    except PerfectSeparationError as error:
        msg = "Perfect separation detected, automatic priors are not available. "
        msg += "Please indicate priors manually."
        raise PerfectSeparationError(msg) from error
    except:
        print("Unexpected error:", sys.exc_info()[0])
        raise
def setup_class(cls):
    vs = Independence()
    family = families.Gaussian()
    np.random.seed(987126)
    Y = np.random.normal(size=100)
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.kron(np.arange(20), np.ones(5))

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

    md = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                          family=family, cov_struct=vs)
    cls.result1 = md.fit()

    cls.result2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=D).fit()
def test_warnings_raised():
    if sys.version_info < (3, 4):
        raise SkipTest

    weights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
    # faking aweights by using normalized freq_weights
    weights = np.array(weights)

    gid = np.arange(1, 17 + 1) // 2
    cov_kwds = {'groups': gid, 'use_correction': False}
    with warnings.catch_warnings(record=True) as w:
        res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                   family=sm.families.Poisson(),
                   freq_weights=weights
                   ).fit(cov_type='cluster', cov_kwds=cov_kwds)
        res1.summary()
        assert len(w) >= 1

    with warnings.catch_warnings(record=True) as w:
        res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                   family=sm.families.Poisson(),
                   var_weights=weights
                   ).fit(cov_type='cluster', cov_kwds=cov_kwds)
        res1.summary()
        assert len(w) >= 1
def fit_scores(self, balance=True, nmodels=None, k=3):
    if not self.formula:
        # use all columns in the model (untransformed)
        self.formula = '{} ~ {}'.format(self.yvar, '+'.join(self.xvars))
        if self.stepwise:
            print("Optimizing Formula via forward stepwise selection...")
            # use all columns + transformed columns in model
            self.formula, self.swdata = self.forward_stepwise(
                self.balanced_sample(), self.yvar, k=k)
    if balance:
        if nmodels is None:
            # fit multiple models based on imbalance severity
            # (rounded up to nearest ten)
            minor, major = [
                self.data[self.data[self.yvar] == i]
                for i in (self.minority, self.majority)
            ]
            nmodels = int(np.ceil((len(major) / len(minor)) / 10) * 10)
        self.nmodels = nmodels
        for i in range(nmodels):
            progress(i + 1, nmodels,
                     prestr="Fitting {} Models on Balanced Samples...".format(nmodels))
            # sample from majority to create a balanced dataset
            df = self.balanced_sample()
            y_samp, X_samp = patsy.dmatrices(self.formula, data=df,
                                             return_type='dataframe')
            glm = GLM(y_samp, X_samp, family=sm.families.Binomial())
            res = glm.fit()
            self.model_accurracy.append(
                self._scores_to_accuracy(res, X_samp, y_samp))
            self.models.append(res)
        print("\nAverage Accuracy:",
              "{}%".format(round(np.mean(self.model_accurracy) * 100, 2)))
    else:
        # ignore any imbalance and fit one model
        self.nmodels = 1
        print('\nFitting 1 (Unbalanced) Model...')
        glm = GLM(self.y, self.X, family=sm.families.Binomial())
        res = glm.fit()
        self.model_accurracy.append(
            self._scores_to_accuracy(res, self.X, self.y))
        self.models.append(res)
        print("Accuracy", round(np.mean(self.model_accurracy[0]) * 100, 2))
def __init__(self):
    '''
    Test Negative Binomial family with log link
    '''
    # Test Precision
    self.decimal_resid = DECIMAL_1
    self.decimal_params = DECIMAL_3
    self.decimal_resids = -1  # 1 % mismatch at 0
    self.decimal_fittedvalues = DECIMAL_1

    from statsmodels.datasets.committee import load
    self.data = load()
    self.data.exog[:, 2] = np.log(self.data.exog[:, 2])
    interaction = self.data.exog[:, 2] * self.data.exog[:, 1]
    self.data.exog = np.column_stack((self.data.exog, interaction))
    self.data.exog = add_constant(self.data.exog)
    self.res1 = GLM(self.data.endog, self.data.exog,
                    family=sm.families.NegativeBinomial()).fit()
    from .results.results_glm import Committee
    res2 = Committee()
    res2.aic_R += 2  # R does not count a degree of freedom for the scale
    self.res2 = res2
def setup_class(cls):
    # adjusted for Gamma, not in test_gee.py
    vs = Independence()
    family = families.Gamma(link=links.log)
    np.random.seed(987126)
    # Y = np.random.normal(size=100)**2
    Y = np.exp(0.1 + np.random.normal(size=100))  # log-normal
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.random.randint(0, 4, size=100)

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

    mod1 = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                            family=family, cov_struct=vs)
    cls.result1 = mod1.fit()

    mod2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=D, family=family)
    cls.result2 = mod2.fit(disp=False)
def test_calc_wdesign_mat():
    # separately tests that _calc_wdesign_mat
    # returns sensible results
    #
    # regression test
    np.random.seed(435265)
    X = np.random.normal(size=(3, 3))
    y = np.random.randint(0, 2, size=3)
    beta = np.random.normal(size=3)

    mod = OLS(y, X)
    dmat = _calc_wdesign_mat(mod, beta, {})
    assert_allclose(dmat, np.array([[1.306314, -0.024897, 1.326498],
                                    [-0.539219, -0.483028, -0.703503],
                                    [-3.327987, 0.524541, -0.139761]]),
                    atol=1e-6, rtol=0)

    mod = GLM(y, X, family=Binomial())
    dmat = _calc_wdesign_mat(mod, beta, {})
    assert_allclose(dmat, np.array([[0.408616, -0.007788, 0.41493],
                                    [-0.263292, -0.235854, -0.343509],
                                    [-0.11241, 0.017718, -0.004721]]),
                    atol=1e-6, rtol=0)
def setupClass(cls):
    self = cls  # alias
    fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
    # faking aweights by using normalized freq_weights
    fweights = np.array(fweights)
    wsum = fweights.sum()
    nobs = len(cpunish_data.endog)
    aweights = fweights / wsum * nobs

    self.res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                    family=sm.families.Poisson(),
                    var_weights=aweights).fit()
    # compare with discrete, start close to save time
    modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)

    # Need to copy to avoid inplace adjustment
    from copy import copy
    self.res2 = copy(res_stata.results_poisson_aweight_nonrobust)
    self.res2.resids = self.res2.resids.copy()

    # Need to adjust resids for pearson and deviance to add weights
    self.res2.resids[:, 3:5] *= np.sqrt(aweights[:, np.newaxis])
def __init__(self):
    from .results.results_glm import Lbw
    self.res2 = Lbw()
    self.res1 = GLM(self.res2.endog, self.res2.exog,
                    family=sm.families.Binomial()).fit()