def __init__(self):
    """
    Fit a Negative Binomial GLM on the committee data.

    The third exog column is replaced by its log, the product of the third
    and second columns is appended as an interaction term, and a constant
    is added as the last column.  The fit is stored in ``res1`` and the
    ``Committee`` reference results (with adjusted ``aic_R``) in ``res2``.
    """
    from statsmodels.datasets.committee import load
    from .results.results_glm import Committee

    # Test Precision
    self.decimal_resid = DECIMAL_1
    self.decimal_params = DECIMAL_3
    self.decimal_resids = -1  # 1 % mismatch at 0
    self.decimal_fittedvalues = DECIMAL_1

    dataset = load()
    self.data = dataset
    dataset.exog[:, 2] = np.log(dataset.exog[:, 2])
    interaction = dataset.exog[:, 2] * dataset.exog[:, 1]
    design = np.column_stack((dataset.exog, interaction))
    dataset.exog = add_constant(design, prepend=False)

    nb_model = GLM(dataset.endog, dataset.exog,
                   family=sm.families.NegativeBinomial())
    self.res1 = nb_model.fit()

    expected = Committee()
    # They don't count a degree of freedom for the scale
    expected.aic_R += 2
    self.res2 = expected
def setup_class(cls):
    """
    Fit a Poisson GLM with cluster-robust covariance.

    Stores the fit in ``res1``, the reference results in ``res2``, the
    robust standard errors in ``bse_rob`` and the square root of the
    small-sample correction factor in ``corr_fact``.
    """
    cls.res2 = results_st.results_poisson_clu

    mod = GLM(endog, exog, family=families.Poisson())
    cluster_options = dict(
        groups=group,
        use_correction=True,
        df_correction=True,  # TODO has no effect
    )
    cls.res1 = mod.fit(cov_type='cluster',
                       cov_kwds=cluster_options,
                       use_t=False,  # True,
                       )

    # The model results, t_test, ... should also work without
    # normalized_cov_params, see #2209
    # Note: this cannot be set on the wrapper res1, it has to be set on
    # res1._results
    cls.res1._results.normalized_cov_params = None

    cls.bse_rob = cls.res1.bse

    nobs, _ = mod.exog.shape
    n_params = len(cls.res1.params)
    # n_groups = len(np.unique(group))
    # for bse we need sqrt of correction factor
    cls.corr_fact = np.sqrt((nobs - 1.) / float(nobs - n_params))
def __init__(self):
    """
    Fit a Gaussian GLM with inverse-power link on simulated data.

    The design matrix (constant, x, x**2) is stored in ``X``, the simulated
    response in ``y_inv``, the fit in ``res1`` and the ``GaussianInverse``
    reference results in ``res2``.
    """
    from .results.results_glm import GaussianInverse

    # Test Precisions
    self.decimal_bic = DECIMAL_1
    self.decimal_aic_R = DECIMAL_1
    self.decimal_aic_Stata = DECIMAL_3
    self.decimal_loglike = DECIMAL_1
    self.decimal_resids = DECIMAL_3

    nobs = 100
    x = np.arange(nobs)
    np.random.seed(54321)
    # This draw is not used by the fit, but it has to stay: it advances the
    # random state before the noise for ``y_inv`` is drawn.
    np.random.randn(nobs)

    self.X = np.c_[np.ones((nobs, 1)), x, x**2]
    inverse_mean = (1. + .02 * x + .001 * x**2)**-1
    self.y_inv = inverse_mean + .001 * np.random.randn(nobs)

    inverse_family = sm.families.Gaussian(sm.families.links.inverse_power)
    self.res1 = GLM(self.y_inv, self.X, family=inverse_family).fit()
    self.res2 = GaussianInverse()
def setup_class(cls):
    """
    Fit a frequency-weighted Poisson GLM with cluster-robust covariance.

    Stores the fit in ``res1``, the reference results in ``res2`` and the
    correction factor derived from the number of groups in ``corr_fact``.
    """
    import warnings

    fweights = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3])

    gid = np.arange(1, 17 + 1) // 2
    n_groups = len(np.unique(gid))

    # no wnobs yet in sandwich covariance calculation
    # (alternative that was considered: np.sqrt((wsum - 1.) / wsum))
    cls.corr_fact = 1 / np.sqrt(n_groups / (n_groups - 1))

    cov_kwds = {'groups': gid, 'use_correction': False}

    # ``pytest.warns(None)`` is deprecated in pytest 7 and rejected by
    # pytest 8; silence warnings from model setup and fit with the
    # standard library instead.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        mod = GLM(cpunish_data.endog, cpunish_data.exog,
                  family=sm.families.Poisson(), freq_weights=fweights)
        cls.res1 = mod.fit(cov_type='cluster', cov_kwds=cov_kwds)

    cls.res2 = res_stata.results_poisson_fweight_clu1
def _initialize(cls):
    """
    Fit the reference and the penalized Gaussian GLM.

    ``res2`` is an unpenalized GLM on the first ``k_nonzero`` columns of
    ``x``; ``res1`` is a ``GLMPenalized`` fit on all columns.  Both use HC0
    covariance and the bfgs optimizer.
    """
    # adding 10 to avoid strict rtol at predicted values close to zero
    y = cls.y + 10
    x = cls.x
    n_nonzero = cls.k_nonzero
    fit_options = dict(cov_type='HC0', method='bfgs', maxiter=100, disp=0)

    reference = GLM(y, x[:, :n_nonzero], family=family.Gaussian())
    cls.res2 = reference.fit(**fit_options)

    penalized = GLMPenalized(y, x, family=family.Gaussian(),
                             penal=cls.penalty)
    penalized.pen_weight *= 1.5  # same as discrete Poisson
    penalized.penal.tau = 0.05
    cls.res1 = penalized.fit(**fit_options)

    cls.exog_index = slice(None, n_nonzero, None)
    cls.atol = 5e-6
    cls.rtol = 1e-6
def setup_class(cls):
    """
    Fit a constrained Binomial GLM with HC0 covariance in two ways.

    ``res1m`` comes from ``GLM.fit_constrained`` with a string constraint;
    ``res1`` comes from the ``fit_constrained`` function using the
    constraint matrices taken from ``res1m``.  ``res2`` holds the reference
    results.
    """
    cls.idx = slice(None)  # params sequence same as Stata
    # res1ul = Logit(data.endog, data.exog).fit(method="newton", disp=0)
    cls.res2 = reslogit.results_constraint2_robust

    mod1 = GLM(spector_data.endog, spector_data.exog,
               family=families.Binomial())

    # not used to match Stata for HC
    # nobs, k_params = mod1.exog.shape
    # k_params -= 1   # one constraint

    # scaling factor looks like nobs / (nobs - 1) and
    # not (nobs - 1.) / (nobs - k_params)
    robust_options = {'cov_type': 'HC0',
                      'cov_kwds': {'scaling_factor': 32/31}}

    cls.res1m = mod1.fit_constrained('x1 - x3 = 0', atol=1e-10,
                                     **robust_options)

    constraints = cls.res1m.constraints
    R, q = constraints.coefs, constraints.constants
    cls.res1 = fit_constrained(mod1, R, q,
                               fit_kwds=dict(robust_options, atol=1e-10))
    cls.constraints_rq = (R, q)
def _get_intercept_stats(self, add_slopes=True):
    """
    Return ``(mu, sigma)`` for the intercept prior.

    An intercept-only GLM is fitted to the response.  ``mu`` is its
    parameter estimate and ``sigma`` is the square root of its parameter
    variance times the number of observations.

    Parameters
    ----------
    add_slopes : bool
        If true and the model has more than one common term, ``mu`` and
        ``sigma`` are adjusted using the means and sigmas of the priors in
        ``self.priors`` and the predictor means in ``self.stats["mean_x"]``.
    """
    response = self.model.response.data
    family = self.model.family

    # start with mean and variance of Y on the link scale
    null_fit = GLM(
        endog=response,
        exog=np.repeat(1, len(response)),
        family=family.smfamily(family.smlink),
        missing="drop" if self.model.dropna else "none",
    ).fit()
    mu = null_fit.params
    # multiply SE by sqrt(N) to turn it into (approx.) sigma(Y) on link scale
    sigma = (null_fit.cov_params()[0] * len(null_fit.mu)) ** 0.5

    if len(self.model.common_terms) <= 1 or not add_slopes:
        return mu, sigma

    # modify mu and sigma based on means and sigmas of slope priors.
    slope_priors = list(self.priors.values())
    means = np.array([prior["mu"] for prior in slope_priors])
    sigmas = np.array([prior["sigma"] for prior in slope_priors])

    # add to intercept prior
    mean_x = self.stats["mean_x"][list(self.priors.keys())]
    mu -= np.dot(means, mean_x)
    sigma = (sigma ** 2 + np.dot(sigmas ** 2, mean_x ** 2)) ** 0.5
    return mu, sigma
def setupClass(cls):
    """
    Fit a Poisson GLM with variance weights and HC0 covariance.

    The variance weights are frequency weights normalized to sum to the
    number of observations.  Stores the fit in ``res1``, the reference
    results in ``res2`` and a hand-tuned correction factor in
    ``corr_fact``.
    """
    # faking aweights by using normalized freq_weights
    fweights = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3])
    wsum = fweights.sum()
    nobs = len(cpunish_data.endog)
    aweights = fweights / wsum * nobs

    # This is really close when corr_fact = (wsum - 1.) / wsum, but to
    # avoid having to loosen the precision of assert_allclose, the factor
    # is adjusted manually.  It is *possible* that lowering the IRLS
    # convergence criterion in Stata and here would make this less sketchy.
    cls.corr_fact = np.sqrt((wsum - 1.) / wsum) * 0.98518473599905609

    weighted_model = GLM(cpunish_data.endog, cpunish_data.exog,
                         family=sm.families.Poisson(),
                         var_weights=aweights)
    cls.res1 = weighted_model.fit(cov_type='HC0')
    # , cov_kwds={'use_correction':False})

    # compare with discrete, start close to save time
    # modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
    cls.res2 = res_stata.results_poisson_aweight_hc1
def test_calc_wdesign_mat():
    """
    Regression test for ``_calc_wdesign_mat``.

    Checks separately that the function returns the expected matrix for an
    OLS model and for a Binomial GLM built from the same seeded data.
    """
    np.random.seed(435265)
    X = np.random.normal(size=(3, 3))
    y = np.random.randint(0, 2, size=3)
    beta = np.random.normal(size=3)

    expected_ols = np.array([[1.306314, -0.024897, 1.326498],
                             [-0.539219, -0.483028, -0.703503],
                             [-3.327987, 0.524541, -0.139761]])
    expected_glm = np.array([[0.408616, -0.007788, 0.41493],
                             [-0.263292, -0.235854, -0.343509],
                             [-0.11241, 0.017718, -0.004721]])

    cases = (
        (OLS(y, X), expected_ols),
        (GLM(y, X, family=Binomial()), expected_glm),
    )
    for mod, expected in cases:
        dmat = _calc_wdesign_mat(mod, beta, {})
        assert_allclose(dmat, expected, atol=1e-6, rtol=0)
def test_influence_glm_bernoulli():
    """
    Compare GLM influence measures for a Binomial fit with the SAS results.

    The example uses Finney's data and is used in Pregibon 1981.
    """
    df = data_bin
    results_sas = np.asarray(results_sas_df)

    model = GLM(df['constrict'], df[['const', 'log_rate', 'log_volumne']],
                family=families.Binomial())
    res = model.fit(attach_wls=True, atol=1e-10)
    infl = res.get_influence(observed=False)

    k_vars = 3
    sas_dfbetas = results_sas[:, 5:8]
    assert_allclose(infl.dfbetas, sas_dfbetas, atol=1e-4)
    assert_allclose(infl.d_params, sas_dfbetas * res.bse.values, atol=1e-4)

    cooks_d = infl.cooks_distance[0]
    assert_allclose(cooks_d * k_vars, results_sas[:, 8], atol=6e-5)
    assert_allclose(infl.hat_matrix_diag, results_sas[:, 4], atol=6e-5)

    c_bar = cooks_d * k_vars * (1 - infl.hat_matrix_diag)
    assert_allclose(c_bar, results_sas[:, 9], atol=6e-5)
def setupClass(cls):
    """
    Fit a Poisson GLM with variance weights and default covariance.

    The variance weights are frequency weights normalized to sum to the
    number of observations.  ``res1`` holds the fit; ``res2`` holds a copy
    of the reference results whose Pearson and deviance residual columns
    are rescaled by the square root of the weights.
    """
    from copy import copy

    # faking aweights by using normalized freq_weights
    fweights = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3])
    wsum = fweights.sum()
    nobs = len(cpunish_data.endog)
    aweights = fweights / wsum * nobs

    cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                   family=sm.families.Poisson(),
                   var_weights=aweights).fit()

    # Need to copy to avoid inplace adjustment of the shared reference
    # results: shallow-copy the holder, then copy the array that is
    # modified below.
    cls.res2 = copy(res_stata.results_poisson_aweight_nonrobust)
    cls.res2.resids = cls.res2.resids.copy()

    # Need to adjust resids for pearson and deviance to add weights
    cls.res2.resids[:, 3:5] *= np.sqrt(aweights[:, np.newaxis])
def setup_class(cls):
    """Fit an unweighted Poisson GLM (``res1``) and attach the reference results (``res2``)."""
    poisson_model = GLM(cpunish_data.endog, cpunish_data.exog,
                        family=sm.families.Poisson())
    cls.res1 = poisson_model.fit()
    cls.res2 = res_stata.results_poisson_none_nonrobust
plt.rc("figure", figsize=(16, 8))
plt.rc("font", size=14)

import statsmodels.stats.tests.test_influence

# The example data ships in the "results" directory next to the
# test_influence module.
test_module = statsmodels.stats.tests.test_influence.__file__
cur_dir = os.path.abspath(os.path.dirname(test_module))

file_name = "binary_constrict.csv"
file_path = os.path.join(cur_dir, "results", file_name)
df = pd.read_csv(file_path, index_col=0)

res = GLM(
    df["constrict"],
    df[["const", "log_rate", "log_volumne"]],
    family=families.Binomial(),
).fit(attach_wls=True, atol=1e-10)
print(res.summary())

# ## get the influence measures
#
# GLMResults has a `get_influence` method similar to OLSResults, that
# returns an instance of the GLMInfluence class. This class has methods and
# (cached) attributes to inspect influence and outlier measures.
#
# These measures are based on a one-step approximation to the results
# for deleting one observation. One-step approximations are usually accurate
# for small changes but underestimate the magnitude of large changes. Even
# though large changes are underestimated, they still show clearly the
# effect of influential observations
def fit_scores(self, balance=True, nmodels=None):
    """
    Fit the logistic regression model(s) used for propensity scores.

    Fitted results are appended to ``self.models`` and their accuracies to
    ``self.model_accuracy``; both lists are reset when refitting.

    Parameters
    ----------
    balance : bool
        Should balanced datasets be used? (n_control == n_test)
    nmodels : int
        How many models should be fit?
        Score becomes the average of the <nmodels> models if nmodels > 1.
        When omitted in balanced mode, the count is derived from the
        majority/minority size ratio.

    Returns
    -------
    None
    """
    # reset models if refitting
    if self.models:
        self.models = []
    if self.model_accuracy:
        self.model_accuracy = []

    if not self.formula:
        # use all columns in the model
        self.formula = "{} ~ {}".format(self.yvar, "+".join(self.xvars))

    if not balance:
        # ignore any imbalance and fit one model
        print("Fitting 1 (Unbalanced) Model...")
        # print("self.y", self.y)
        # print("self.X", self.X)
        glm = GLM(self.y, self.X, family=sm.families.Binomial())
        res = glm.fit()
        self.model_accuracy.append(
            self._scores_to_accuracy(res, self.X, self.y))
        self.models.append(res)
        print("\nAccuracy", round(np.mean(self.model_accuracy[0]) * 100, 2))
        return

    if nmodels is None:
        # fit multiple models based on imbalance severity: the
        # majority/minority ratio is divided by ten, rounded up, and
        # multiplied by ten again
        minor, major = [
            self.data[self.data[self.yvar] == label]
            for label in (self.minority, self.majority)
        ]
        nmodels = int(np.ceil((len(major) / len(minor)) / 10) * 10)
    self.nmodels = nmodels

    fitted = 0
    failures = 0
    while fitted < nmodels and failures < 5:
        uf_progress(fitted + 1, nmodels,
                    prestr="Fitting Models on Balanced Samples")

        # sample from majority to create balance dataset
        df = self.balanced_sample()
        df = pd.concat(
            [
                uf_drop_static_cols(df[df[self.yvar] == label],
                                    yvar=self.yvar)
                for label in (1, 0)
            ],
            sort=True,
        )
        y_samp, X_samp = patsy.dmatrices(self.formula, data=df,
                                         return_type="dataframe")
        X_samp.drop(self.yvar, axis=1, errors="ignore", inplace=True)
        # print("y_samp:",y_samp)
        # print("X_samp:",X_samp)
        glm = GLM(y_samp, X_samp, family=sm.families.Binomial())

        try:
            res = glm.fit()
            # print("GLM", res.summary())
            self.model_accuracy.append(
                self._scores_to_accuracy(res, X_samp, y_samp))
            self.models.append(res)
            fitted += 1
        except Exception as e:
            # count failures to avoid an infinite loop for a
            # misspecified matrix
            failures += 1
            print("Error: {}".format(e))

    print(
        "\nAverage Accuracy:",
        "{}%".format(round(np.mean(self.model_accuracy) * 100, 2)),
    )
def test_glm(self):
    """
    Basic checks of ``GLM.get_prediction`` against the WLS results.

    Preliminary test: a Gaussian GLM on the weight-scaled data is compared
    with ``self.res_wls``, followed by checks of
    ``params_transform_univariate`` and of list-valued ``exog`` input.
    """
    from statsmodels.genmod._prediction import params_transform_univariate
    from statsmodels.genmod.families import links
    from statsmodels.genmod.generalized_linear_model import GLM

    res_wls = self.res_wls
    mod_wls = res_wls.model
    y, X, wi = mod_wls.endog, mod_wls.exog, mod_wls.weights

    w_sqrt = np.sqrt(wi)  # notation wi is weights, `w` is var
    mod_glm = GLM(y * w_sqrt, X * w_sqrt[:, None])

    n_compare = 30  # in glm with predict wendog

    # compare using t distribution
    res_glm = mod_glm.fit(use_t=True)
    sf_glm = res_glm.get_prediction().summary_frame()
    sf_wls = res_wls.get_prediction().summary_frame()
    assert_allclose(sf_glm.values[:n_compare],
                    sf_wls.values[:n_compare, :4])

    # compare using normal distribution
    res_glm = mod_glm.fit()  # default use_t=False
    sf_glm = res_glm.get_prediction().summary_frame()
    res_wls = mod_wls.fit(use_t=False)
    sf_wls = res_wls.get_prediction().summary_frame()
    assert_allclose(sf_glm.values[:n_compare],
                    sf_wls.values[:n_compare, :4])

    # function for parameter transformation
    # should be separate test method
    rates = params_transform_univariate(res_glm.params,
                                        res_glm.cov_params())
    rates2 = np.column_stack(
        (np.exp(res_glm.params),
         res_glm.bse * np.exp(res_glm.params),
         np.exp(res_glm.conf_int())))
    assert_allclose(rates.summary_frame().values, rates2, rtol=1e-13)

    # with identity transform
    pt = params_transform_univariate(res_glm.params, res_glm.cov_params(),
                                     link=links.identity())
    assert_allclose(pt.tvalues, res_glm.tvalues, rtol=1e-13)
    assert_allclose(pt.se_mean, res_glm.bse, rtol=1e-13)
    ptt = pt.t_test()
    assert_allclose(ptt[0], res_glm.tvalues, rtol=1e-13)
    assert_allclose(ptt[1], res_glm.pvalues, rtol=1e-13)

    # prediction with exog and no weights does not error
    res_glm = mod_glm.fit()
    res_glm.get_prediction(X)

    # check that list works, issue 4437: once with a single row (the
    # column means), once with the last two rows of exog
    list_cases = (
        (res_glm.model.exog.mean(0), [0]),
        (res_glm.model.exog[-2:], [0, 1]),
    )
    for x, expected_index in list_cases:
        pred_array = res_glm.get_prediction(x)
        ci_array = pred_array.conf_int()
        pred_list = res_glm.get_prediction(x.tolist())
        ci_list = pred_list.conf_int()
        assert_allclose(pred_list.se_mean, pred_array.se_mean, rtol=1e-13)
        assert_allclose(ci_list, ci_array, rtol=1e-13)
        res_df = pred_list.summary_frame()
        assert_equal(res_df.index.values, expected_index)
plt.legend(loc='upper left')
plt.title('gam.GAM Poisson')

# One subplot per regressor (z, x1, x2), filling positions 2-4 of the 2x2
# grid.  Each panel plots p (black), yp (labelled 'true') and y_pred
# (labelled 'GAM') against the regressor, sorted by the regressor.
counter = 2
for ii, xx in zip(['z', 'x1', 'x2'], [z, x[:, 0], x[:, 1]]):
    sortidx = np.argsort(xx)
    #plt.figure()
    plt.subplot(2, 2, counter)
    plt.plot(xx[sortidx], p[sortidx], 'k.', alpha=0.5)
    plt.plot(xx[sortidx], yp[sortidx], 'b.', label='true')
    plt.plot(xx[sortidx], y_pred[sortidx], 'r.', label='GAM')
    plt.legend(loc='upper left')
    plt.title('gam.GAM Poisson ' + ii)
    counter += 1

# NOTE(review): assumed to sit after the loop rather than inside it; the
# original indentation is not recoverable from this view - confirm.
res = GLM(p, exog_reduced, family=f).fit()

# plot component, compared to true component
x1 = x[:, 0]
x2 = x[:, 1]
f1 = exog[:, :order + 1].sum(1) - 1  #take out constant
f2 = exog[:, order + 1:].sum(1) - 1
plt.figure()
# Note: need to correct for the constant, which is not uniquely
# attributed to either component
#plt.plot(x1, m.smoothers[0](x1)-m.smoothers[0].params[0]+1, 'r')
# better would be to subtract f(0): m.smoothers[0](np.array([0]))
plt.plot(x1, f1, linewidth=2)
plt.plot(x1, m.smoothers[0](x1) - m.smoothers[0].params[0], 'r')

plt.figure()
plt.plot(x2, f2, linewidth=2)
def init(cls):
    """Fit the reference model (``res2``) and a GLM constrained to ``x1=0.5`` (``res1``)."""
    cls.res2 = cls.mod2.fit()

    constrained_model = GLM(cls.endog, cls.exog)
    # rename the columns in place so the constraint string can refer to 'x1'
    constrained_model.exog_names[:] = ['const', 'x1', 'x2', 'x3', 'x4']
    cls.res1 = constrained_model.fit_constrained('x1=0.5')
# Out-of-sample evaluation of the model fitted above: RMSE and summed
# ``poisson(...)`` score on the test set (predictions clipped away from 0).
results_oos = m.predict(points=X_test[:, 0:2], P=X_test[:, 2:],
                        exog_scale=m.exog_scale, exog_resid=m.exog_resid)
rmse_oos = np.sqrt(np.mean((y_test - results_oos.predictions)**2))
lik_oos = np.sum(poisson(x=y_test,
                         mu=np.clip(results_oos.predictions,
                                    a_min=0.0001, a_max=None)))
row = pd.Series({'RMSE_IS': rmse_is, 'LIK_IS': lik_is,
                 'RMSE_OOS': rmse_oos, 'LIK_OOS': lik_oos},
                name='GWR (count)')
# ``DataFrame.append`` was removed in pandas 2.0; concatenating the one-row
# frame built from the named Series adds the same labelled row.
results = pd.concat([results, row.to_frame().T])

# =============================================================================
# Linear Kriging with count features
# =============================================================================
X_train, X_test, _, y_train, y_test, _, _, _, _, _ = data_pipeline(
    feature_engineering=True, feature_type='count')

# Poisson GLM on the feature columns (from column 2 onwards).
m = GLM(endog=y_train.reshape((-1,)), exog=X_train[:, 2:],
        family=Poisson(link=sm.genmod.families.links.log))
# NOTE(review): this rebinds ``results``, which held the accumulated metrics
# table up to this point - confirm the table is not needed afterwards.
results = m.fit()
res = y_train.reshape((-1,)) - results.fittedvalues
#
# Gaussian process on the first two columns, fitted to the GLM residuals.
kernel = ConstantKernel() * RBF(10, (1e-2, 1e2)) + WhiteKernel()
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=1)
gp.fit(X_train[:, 0:2], res)
#
# Combined prediction: GLM prediction plus the GP prediction of the residual.
pred_oos = (m.predict(exog=X_test[:, 2:], params=results.params)
            + gp.predict(X_test[:, 0:2]))
pred_is = (m.predict(exog=X_train[:, 2:], params=results.params)
           + gp.predict(X_train[:, 0:2]))
#
rmse_oos = np.sqrt(np.mean((pred_oos - y_test.reshape((-1,)))**2))
lik_oos = np.sum(poisson(x=y_test.reshape((-1,)),
                         mu=np.clip(pred_oos, a_min=0.0001, a_max=None)))
def _scale_group_specific(self, term):
    """
    Set the default sigma of the prior of a group specific term.

    Nothing is done unless the ``sigma`` argument of the term's prior is a
    HalfNormal.  Otherwise a sigma is derived from the intercept stats (for
    constant terms), from an existing common-effect prior, or from a GLM
    refit that includes the term's data, and is written into the prior.
    """
    sigma_prior = term.prior.args["sigma"]
    # these default priors are only defined for HalfNormal priors
    if sigma_prior.name != "HalfNormal":
        return

    sigma_corr = term.prior.scale

    # recreate the corresponding common effect data
    common_data = term.data.sum(axis=1)

    if term.constant:
        # handle intercepts and cell means
        _, sigma = self._get_intercept_stats()
        sigma *= sigma_corr
    else:
        # handle slopes
        matches = [
            name
            for name in self.dm.columns  # pylint: disable=not-an-iterable
            if np.array_equal(common_data, self.dm[name].values)
        ]

        if matches and matches[0] in self.priors:
            # there IS a corresponding common effect
            sigma = self.priors[matches[0]]["sigma"]
        else:
            # there IS NOT a corresponding common effect
            if matches:
                # corner case: the corresponding common effect technically
                # exists, but the parameterization differs between the
                # common- and group-specific-effect specification.  Usually
                # this means the common effects use cell-means coding but
                # the group specific effects use k-1 coding
                group = term.name.split("|")[1]
                columns = [
                    other.data.sum(1)
                    for other in self.model.group_specific_terms.values()
                    if other.name.split("|")[-1] == group
                ]
                labels = ["_" + str(i) for i in range(len(columns))]
                exog = pd.DataFrame(columns, index=labels).T
            else:
                # the usual case: add the group specific effect data as a
                # common effect in the design matrix
                extra = pd.DataFrame(common_data)
                # things break if column names are integers (the default)
                extra = extra.rename(
                    columns={
                        col: "_" + str(col)
                        for col in extra.columns  # pylint: disable=not-an-iterable
                    }
                )
                exog = self.dm.join(extra)

            # this will replace self.mle (which is missing predictors)
            full_mod = GLM(
                endog=self.model.response.data,
                exog=exog,
                family=self.model.family.smfamily(),
                missing="drop" if self.model.dropna else "none",
            ).fit()
            sigma = self._get_slope_stats(exog=exog,
                                          predictor=common_data,
                                          full_mod=full_mod,
                                          sigma_corr=sigma_corr)

    # set the prior sigma.
    sigma_prior.update(sigma=np.squeeze(np.atleast_1d(sigma)))
def local_fdr(zscores, null_proportion=1.0, null_pdf=None, deg=7,
              nbins=30):
    """
    Calculate local FDR values for a list of Z-scores.

    Parameters
    ----------
    zscores : array-like
        A vector of Z-scores.  Any sequence accepted by ``numpy.asarray``
        works, including plain lists.
    null_proportion : float
        The assumed proportion of true null hypotheses
    null_pdf : function mapping reals to positive reals
        The density of null Z-scores; if None, use standard normal
    deg : integer
        The maximum exponent in the polynomial expansion of the
        density of non-null Z-scores
    nbins : integer
        The number of bins for estimating the marginal density
        of Z-scores.

    Returns
    -------
    fdr : ndarray
        A vector of FDR values, clipped to the interval [0, 1]

    References
    ----------
    B Efron (2008).  Microarrays, Empirical Bayes, and the Two-Groups
    Model.  Statistical Science 23:1, 1-22.

    Examples
    --------
    Basic use (the null Z-scores are taken to be standard normal):

    >>> from statsmodels.stats.multitest import local_fdr
    >>> import numpy as np
    >>> zscores = np.random.randn(30)
    >>> fdr = local_fdr(zscores)

    Use a Gaussian null distribution estimated from the data:

    >>> null = EmpiricalNull(zscores)
    >>> fdr = local_fdr(zscores, null_pdf=null.pdf)
    """

    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod.generalized_linear_model import families
    from statsmodels.regression.linear_model import OLS

    # The arithmetic below (``zscores**2``) needs an array; a plain list
    # would raise a TypeError.
    zscores = np.asarray(zscores)

    # Bins for Poisson modeling of the marginal Z-score density
    minz = zscores.min()
    maxz = zscores.max()
    bins = np.linspace(minz, maxz, nbins)

    # Bin counts
    zhist = np.histogram(zscores, bins)[0]

    # Bin centers
    zbins = (bins[:-1] + bins[1:]) / 2

    # The design matrix at bin centers
    dmat = np.vander(zbins, deg + 1)

    # Use this to get starting values for Poisson regression
    md = OLS(np.log(1 + zhist), dmat).fit()

    # Poisson regression
    md = GLM(zhist, dmat, family=families.Poisson()).fit(
        start_params=md.params)

    # The design matrix for all Z-scores
    dmat_full = np.vander(zscores, deg + 1)

    # The height of the estimated marginal density of Z-scores,
    # evaluated at every observed Z-score.
    fz = md.predict(dmat_full) / (len(zscores) * (bins[1] - bins[0]))

    # The null density.
    if null_pdf is None:
        f0 = np.exp(-0.5 * zscores**2) / np.sqrt(2 * np.pi)
    else:
        f0 = null_pdf(zscores)

    # The local FDR values
    fdr = null_proportion * f0 / fz

    fdr = np.clip(fdr, 0, 1)

    return fdr
def fit(self, flow_df, relevance_column=constants.RELEVANCE):
    """
    Fit the gravity model parameters to the flows in `flow_df`.

    Can fit globally or singly constrained gravity models using a
    Generalized Linear Model (GLM) with a Poisson regression.  The fitted
    origin exponent, destination exponent and deterrence-function argument
    are stored on the instance; the temporary training data ``X`` and ``y``
    are deleted again before returning.

    Parameters
    ----------
    flow_df : FlowDataFrame
        where the flows are stored and with info about the spatial
        tessellation.
    relevance_column : str
        name of the tessellation column holding the relevance of each
        location: float, number of opportunities at the location (e.g.,
        population or total number of visits).  Missing values are
        replaced by 0.

    Returns
    -------
    None

    References
    ----------
    .. [1] Agresti, Alan.
        "Categorical data analysis."
        Vol. 482. John Wiley & Sons, 2003.
    .. [2] Flowerdew, Robin, and Murray Aitkin.
        "A method of fitting the gravity model based on the Poisson
        distribution."
        Journal of regional science 22.2 (1982): 191-202.
    """
    tessellation = flow_df.tessellation
    self.lats_lngs = tessellation.geometry.apply(
        utils.get_geom_centroid, args=[True]).values
    self.weights = tessellation[relevance_column].fillna(0).values
    self.tileid2index = {
        tileid: i
        for i, tileid in enumerate(tessellation[constants.TILE_ID].values)
    }

    # independent (X) and dependent (y) variables, filled row by row
    self.X, self.y = [], []
    flow_df.apply(self._update_training_set, axis=1)

    # Perform GLM fit
    poisson_family = sm.genmod.families.family.Poisson(
        link=sm.genmod.families.links.log)
    poisson_results = GLM(self.y, self.X, family=poisson_family).fit()
    params = poisson_results.params

    # Set best fit parameters
    if self._gravity_type == 'globally constrained':
        self._origin_exp, self._destination_exp = params[1], params[2]
        self._deterrence_func_args = [params[3]]
    else:  # if singly constrained
        self._origin_exp = 1.
        self._destination_exp = params[-2]
        self._deterrence_func_args = [params[-1]]

    # we delete the instance variables we do not need anymore
    del self.X
    del self.y
from numpy.testing import assert_allclose

assert_allclose(pred_res2.se_obs, prstd, rtol=1e-13)
assert_allclose(ci2, np.column_stack((iv_l, iv_u)), rtol=1e-13)

# print() call instead of the Python 2 print statement, which is a
# SyntaxError on Python 3 and inconsistent with the calls below
print(pred_res2.summary_frame().head())

# WLS prediction summary based on the normal distribution
res_wls_n = mod_wls.fit(use_t=False)
pred_wls_n = res_wls_n.get_prediction()
print(pred_wls_n.summary_frame().head())

from statsmodels.genmod.generalized_linear_model import GLM

# GLM on the data divided by sqrt(w), for comparison with the WLS output
w_sqrt = np.sqrt(w)
mod_glm = GLM(y / w_sqrt, X / w_sqrt[:, None])

res_glm = mod_glm.fit()
pred_glm = res_glm.get_prediction()
print(pred_glm.summary_frame().head())

res_glm_t = mod_glm.fit(use_t=True)
pred_glm_t = res_glm_t.get_prediction()
print(pred_glm_t.summary_frame().head())

rates = params_transform_univariate(res_glm.params, res_glm.cov_params())
print('\nRates exp(params)')
print(rates.summary_frame())

# The same quantities assembled by hand: exp(params), bse * exp(params)
# and the exponentiated confidence interval.
rates2 = np.column_stack(
    (np.exp(res_glm.params),
     res_glm.bse * np.exp(res_glm.params),
     np.exp(res_glm.conf_int())))
def mod(y, x):
    """Return an unfitted GLM with Binomial family for response ``y`` and design ``x``."""
    binomial = families.Binomial()
    return GLM(y, x, family=binomial)
U_Const = statsmodels.tools.add_constant(U) # In[85]: from statsmodels.discrete.discrete_model import Poisson mpr = Poisson(V, U_Const) res_mpr = mpr.fit() # In[93]: from statsmodels.genmod.generalized_linear_model import GLM from statsmodels.genmod import families mod = GLM(V, U_Const, family=families.Poisson()) res = mod.fit() print(res.summary()) # ### La surdispersion # In[95]: #Surdispersion print(res.pearson_chi2 / res.df_resid) # #### On voit bien que le rapport de la pearson chi2_dll /residual deviance est superieur à 1 ,d'ou l'existence de la surdispersion # ### Frequence de nombre de Zero dans les données
def __init__(self):
    """
    Prepare the Binomial GLM case.

    Runs ``setup_class``, then sets ``mod`` to a factory that builds a
    Binomial-family GLM and ``y`` to the binary response ``y_bin``.
    """
    # setup_class is called explicitly here (original note: "why does nose
    # do it properly")
    self.setup_class()

    from statsmodels.genmod import families
    from statsmodels.genmod.generalized_linear_model import GLM

    self.y = self.y_bin
    self.mod = lambda y, x: GLM(y, x, family=families.Binomial())
import pandas as pd

from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families

import statsmodels.stats.tests.test_influence

# The example data ships in the "results" directory next to the
# test_influence module.
test_module = statsmodels.stats.tests.test_influence.__file__
cur_dir = os.path.abspath(os.path.dirname(test_module))

file_name = 'binary_constrict.csv'
file_path = os.path.join(cur_dir, 'results', file_name)
df = pd.read_csv(file_path, index_col=0)

res = GLM(
    df['constrict'],
    df[['const', 'log_rate', 'log_volumne']],
    family=families.Binomial(),
).fit(attach_wls=True, atol=1e-10)
print(res.summary())

# ## get the influence measures
#
# GLMResults has a `get_influence` method similar to OLSResults, that
# returns an instance of the GLMInfluence class. This class has methods and
# (cached) attributes to inspect influence and outlier measures.
#
# These measures are based on a one-step approximation to the results
# for deleting one observation. One-step approximations are usually accurate
# for small changes but underestimate the magnitude of large changes. Even
# though large changes are underestimated, they still show clearly the
# effect of influential observations
def _get_slope_stats(self, exog, predictor, sigma_corr, full_mod=None,
                     points=4):
    """
    Approximate the prior sigma of a slope from its log-likelihood profile.

    The log-likelihood is evaluated at ``points`` values of the slope
    between 0 and its estimate, a quartic approximation is fitted to those
    values, and the result is combined with moments of a rescaled beta
    distribution through a Taylor expansion of order ``self.taylor``.

    Parameters
    ----------
    exog : pandas.DataFrame
        Design matrix (assumed; it is accessed through ``columns``,
        ``shape`` and ``exog[name].values``).
    predictor : array-like
        Data of the predictor of interest.  The matching column of ``exog``
        is located by value equality.
    sigma_corr : float
        Scale of the prior; the variance of the beta distribution of
        correlations is ``sigma_corr**2 / 4``.
    full_mod : statsmodels.genmod.generalized_linear_model.GLM
        Statsmodels GLM to replace MLE model. For when ``'predictor'`` is
        not in the common part of the model.  Defaults to ``self.mle``.
    points : int
        Number of points to use for LL approximation.

    Returns
    -------
    The square root of the summed Taylor-expansion terms.
    """
    # np.math was only an alias of the standard-library module and was
    # removed in NumPy 2.0; use the standard library directly.
    import math

    if full_mod is None:
        full_mod = self.mle

    # figure out which column of exog to drop for the null model
    keeps = [
        i for i, x in enumerate(list(exog.columns))
        if not np.array_equal(predictor, exog[x].values.flatten())
    ]
    i = [x for x in range(exog.shape[1]) if x not in keeps][0]

    # get log-likelihood values from beta=0 to beta=MLE
    values = np.linspace(0.0, full_mod.params[i], points)

    # if there are multiple predictors, use statsmodels to optimize the LL
    if keeps:
        null = [
            GLM(endog=self.model.response.data,
                exog=exog,
                family=self.model.family.smfamily()).fit_constrained(
                    str(exog.columns[i]) + "=" + str(val))
            for val in values[:-1]
        ]
        null = np.append(null, full_mod)
        log_likelihood = np.array([x.llf for x in null])
    # if just a single predictor, use statsmodels to evaluate the LL
    else:
        null = [
            self.model.family.smfamily().loglike(
                np.squeeze(self.model.response.data), val * predictor)
            for val in values[:-1]
        ]
        log_likelihood = np.append(null, full_mod.llf)

    # compute params of quartic approximation to log-likelihood
    # c: intercept, d: shift parameter
    # a: quartic coefficient, b: quadratic coefficient
    intercept, shift_parameter = log_likelihood[-1], -(
        full_mod.params[i].item())
    X = np.array([(values + shift_parameter)**4,
                  (values + shift_parameter)**2]).T
    coef_a, coef_b = np.squeeze(
        np.linalg.multi_dot([
            np.linalg.inv(np.dot(X.T, X)), X.T,
            (log_likelihood[:, None] - intercept)
        ]))

    # m, v: mean and variance of beta distribution of correlations
    # p, q: corresponding shape parameters of beta distribution
    mean = 0.5
    variance = sigma_corr**2 / 4
    p = mean * (mean * (1 - mean) / variance - 1)
    q = (1 - mean) * (mean * (1 - mean) / variance - 1)

    # function to return central moments of rescaled beta distribution
    def moment(k):
        return (2 * p / (p + q))**k * hyp2f1(p, -k, p + q, (p + q) / p)

    # evaluate the derivatives of beta = f(correlation).
    # dict 'point' gives points about which to Taylor expand. We want to
    # expand about the mean (generally 0), but some of the derivatives
    # do not exist at 0. Evaluating at a point very close to 0 (e.g., .001)
    # generally gives good results, but the higher order the expansion, the
    # further from 0 we need to evaluate the derivatives, or they blow up.
    point = dict(zip(range(1, 14), 2**np.linspace(-1, 5, 13) / 100))
    vals = dict(a=coef_a,
                b=coef_b,
                n=len(self.model.response.data),
                r=point[self.taylor])
    # NOTE(review): eval runs the expression strings held in self.deriv;
    # they are presumably shipped with the package - confirm they can never
    # come from user input.
    _deriv = [eval(x, globals(), vals) for x in self.deriv]  # pylint: disable=eval-used

    # compute and return the approximate sigma
    def term(i, j):
        return (1 / math.factorial(i) * 1 / math.factorial(j) *
                _deriv[i] * _deriv[j] *
                (moment(i + j) - moment(i) * moment(j)))

    terms = [
        term(i, j) for i in range(1, self.taylor + 1)
        for j in range(1, self.taylor + 1)
    ]
    return np.array(terms).sum()**0.5
def setup_class(cls):
    """
    Fit a Poisson GLM and derive the cluster-robust results from it.

    ``res2`` holds the reference results, ``res1`` the default fit;
    ``get_robust_clu`` is then called on the class.
    """
    cls.res2 = results_st.results_poisson_clu
    # The ``smd.Poisson`` model that used to be built here was overwritten
    # on the next line before it was ever used, so it has been removed.
    mod = GLM(endog, exog, family=families.Poisson())
    cls.res1 = mod.fit()
    cls.get_robust_clu()
def __init__(self):
    """
    Fit a Binomial GLM on the data carried by the ``Lbw`` reference results.

    ``res2`` holds the reference results, including the ``endog`` and
    ``exog`` used for the fit stored in ``res1``.
    """
    # Explicit relative import: the implicit form ``from results...`` does
    # not resolve inside a package on Python 3, and the other result
    # classes are imported through ``.results.results_glm``.
    from .results.results_glm import Lbw

    self.res2 = Lbw()
    self.res1 = GLM(self.res2.endog, self.res2.exog,
                    family=sm.families.Binomial()).fit()
def init(cls):
    """Fit the reference model and a var-weighted GLM constrained to ``x1=0.5``, both with HC0 covariance."""
    robust = 'HC0'
    cls.res2 = cls.mod2.fit(cov_type=robust)

    weighted_model = GLM(cls.endog, cls.exog, var_weights=cls.aweights)
    # rename the columns in place so the constraint string can refer to 'x1'
    weighted_model.exog_names[:] = ['const', 'x1', 'x2', 'x3', 'x4']
    cls.res1 = weighted_model.fit_constrained('x1=0.5', cov_type=robust)