def general_logistic_regression():
    '''Example General Logistic Regression, Example 7.4.1, p. 135'''

    # Get the data
    inFile = r'GLM_data/Table 7.5 Embryogenic anthers.xls'
    df = get_data(inFile)

    # Define the variables so that they match Dobson
    df['n_y'] = df['n'] - df['y']
    df['newstor'] = df['storage'] - 1
    df['x'] = np.log(df['centrifuge'])

    # Model 1
    model1 = smf.glm('n_y + y ~ newstor*x', data=df,
                     family=sm_families.Binomial()).fit()
    print(model1.summary())

    # Model 2
    model2 = smf.glm('n_y + y ~ newstor+x', data=df,
                     family=sm_families.Binomial()).fit()
    print(model2.summary())

    # Model 3
    model3 = smf.glm('n_y + y ~ x', data=df,
                     family=sm_families.Binomial()).fit()
    print(model3.summary())
def logistic_regression():
    '''Logistic regression example, Chapter 7.3, p. 130

    [tbd]: the cloglog values are inconsistent with those mentioned in the
    book. This is probably due to the specific definitions of "loglog" and
    "cloglog" in the respective languages.
    '''
    inFile = r'GLM_data/Table 7.2 Beetle mortality.xls'
    df = get_data(inFile)

    # adjust the unusual column names in the Excel file
    colNames = [name.split(',')[1].lstrip() for name in df.columns.values]
    df.columns = colNames

    # fit the model
    df['tested'] = df['n']
    df['killed'] = df['y']
    df['survived'] = df['tested'] - df['killed']
    model = smf.glm('survived + killed ~ x', data=df,
                    family=sm_families.Binomial()).fit()
    print(model.summary())

    print('-' * 65)
    print('Equivalent solution:')

    model = smf.glm('I(n - y) + y ~ x', data=df,
                    family=sm_families.Binomial()).fit()
    print(model.summary())

    # The fitted number of survivors can be obtained by
    fits = df['n'] * (1 - model.fittedvalues)
    print('Fits Logit:')
    print(fits)

    # The fits for other link functions are:
    model_probit = smf.glm('I(n - y) + y ~ x', data=df,
                           family=sm_families.Binomial(
                               sm_families.links.probit)).fit()
    print(model_probit.summary())

    fits_probit = df['n'] * (1 - model_probit.fittedvalues)
    print('Fits Probit:')
    print(fits_probit)

    model_cll = smf.glm('I(n - y) + y ~ x', data=df,
                        family=sm_families.Binomial(
                            sm_families.links.cloglog)).fit()
    print(model_cll.summary())
    fits_cll = df['n'] * (1 - model_cll.fittedvalues)
    print('Fits Extreme Value:')
    print(fits_cll)
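# Note on the docstring's [tbd] above: "cloglog" and "loglog" are easy to
# confuse across languages. A minimal numeric sketch of the two transforms
# (definitions as used by statsmodels; purely illustrative):
import numpy as np

p = np.array([0.1, 0.5, 0.9])
cloglog = np.log(-np.log(1 - p))   # complementary log-log: log(-log(1 - p))
loglog = -np.log(-np.log(p))       # log-log: -log(-log(p))
print(cloglog)
print(loglog)
# For small p the two differ substantially, which could explain fitted
# values that disagree with the book if the links are swapped.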
def __init__(self, endog, exog, exog_vc, ident, vcp_p=1, fe_p=2,
             fep_names=None, vcp_names=None, vc_names=None):

    super(BinomialBayesMixedGLM, self).__init__(
        endog, exog, exog_vc=exog_vc, ident=ident, vcp_p=vcp_p,
        fe_p=fe_p, family=families.Binomial(), fep_names=fep_names,
        vcp_names=vcp_names, vc_names=vc_names)

    if not np.all(np.unique(endog) == np.r_[0, 1]):
        msg = "endog values must be 0 and 1, and not all identical"
        raise ValueError(msg)
def from_formula(cls, formula, vc_formulas, data, vcp_p=1, fe_p=2,
                 vc_names=None):

    fam = families.Binomial()
    x = _BayesMixedGLM.from_formula(formula, vc_formulas, data,
                                    family=fam, vcp_p=vcp_p, fe_p=fe_p,
                                    vc_names=vc_names)

    return BinomialBayesMixedGLM(endog=x.endog, exog_fe=x.exog_fe,
                                 exog_vc=x.exog_vc, ident=x.ident,
                                 vcp_p=x.vcp_p, fe_p=x.fe_p,
                                 fep_names=x.fep_names,
                                 vcp_names=x.vcp_names,
                                 vc_names=x.vc_names)
def setup_class(cls):
    df = data_bin
    res = GLM(df['constrict'], df[['const', 'log_rate', 'log_volumne']],
              family=families.Binomial()).fit(attach_wls=True, atol=1e-10)

    cls.infl1 = res.get_influence()
    cls.infl0 = MLEInfluence(res)
def setup_class(cls):
    cls.idx = slice(None)  # params sequence same as Stata
    # res1ul = Logit(data.endog, data.exog).fit(method="newton", disp=0)
    cls.res2 = reslogit.results_constraint2_robust

    mod1 = GLM(spector_data.endog, spector_data.exog,
               family=families.Binomial())

    # not used to match Stata for HC
    # nobs, k_params = mod1.exog.shape
    # k_params -= 1   # one constraint
    cov_type = 'HC0'
    cov_kwds = {'scaling_factor': 32 / 31}
    # looks like nobs / (nobs - 1) and not (nobs - 1.) / (nobs - k_params)
    constr = 'x1 - x3 = 0'
    cls.res1m = mod1.fit_constrained(constr, cov_type=cov_type,
                                     cov_kwds=cov_kwds, atol=1e-10)

    R, q = cls.res1m.constraints.coefs, cls.res1m.constraints.constants
    cls.res1 = fit_constrained(mod1, R, q, fit_kwds={
        'atol': 1e-10,
        'cov_type': cov_type,
        'cov_kwds': cov_kwds
    })
    cls.constraints_rq = (R, q)
def fit_logistic(X_hold, Y_hold, Firth=False, resBase=None, LRtest=True):
    """
    Fits a logistic regression model using the standard method (when
    Firth = False) or Firth's method (when Firth = True).
    resBase is the result of a previous call to a regression that is used
    to store data for Firth's method.
    LRtest indicates if the likelihood ratio test should be reported.
    """
    if not Firth:
        res = GLM(Y_hold, X_hold, family=families.Binomial()).fit()  # XXX Confirm this with logistic using older XXXX
        # AICc adjustment
        res.aicc = statsmodels.tools.eval_measures.aicc(
            res.llf, nobs=res.nobs, df_modelwc=res.df_model + 1)
        # Correct BIC
        res.bic = statsmodels.tools.eval_measures.bic(
            res.llf, nobs=res.nobs, df_modelwc=res.df_model + 1)
    else:
        if resBase is None:
            sys.stderr.write('resBase must be provided to do Firth regression\n')
            sys.exit(1)
        elif type(resBase) is not statsmodels.genmod.generalized_linear_model.GLMResultsWrapper:
            sys.stderr.write('resBase must be type statsmodels.genmod.generalized_linear_model.GLMResultsWrapper\n')
            sys.exit(2)
        else:
            res = resBase

        # Do Firth's logistic regression
        (rint, rbeta, rbse, rfitll, pi) = fit_firth(Y_hold, X_hold,
                                                    start_vec=None)

        if LRtest:
            # LRT against the intercept-only model: drop every column of
            # X_hold except the first (the intercept)
            n_cols = int(np.size(X_hold) / len(X_hold))
            null_X = np.delete(arr=X_hold, obj=range(n_cols)[1:n_cols],
                               axis=1)
            (null_intercept, null_beta, null_bse, null_fitll,
             null_pi) = fit_firth(Y_hold, null_X, start_vec=None)
            lrstat = -2. * (null_fitll - rfitll)
            lrt_pvalue = 1.
            if lrstat > 0.:  # non-convergence otherwise
                lrt_pvalue = stats.chi2.sf(lrstat, 1)
            res.llnull = null_fitll
            res.lrstat = lrstat
            res.lrt_pval = lrt_pvalue

        # AICc adjustment for the Firth model
        aicc = statsmodels.tools.eval_measures.aicc(
            rfitll, nobs=len(Y_hold), df_modelwc=np.shape(X_hold)[1])
        # AIC
        aic = statsmodels.tools.eval_measures.aic(
            rfitll, nobs=len(Y_hold), df_modelwc=np.shape(X_hold)[1])
        # BIC
        bic = statsmodels.tools.eval_measures.bic(
            rfitll, nobs=len(Y_hold), df_modelwc=np.shape(X_hold)[1])

        # Store parameters, standard errors, likelihoods, and statistics
        rint = np.array([rint])
        rbeta = np.array(rbeta)
        res.params = np.concatenate([rint, rbeta])
        res.bse = rbse
        res.llf = rfitll
        res.aicc = aicc
        res.aic = aic
        res.bic = bic
        # Get Wald p-values for the parameters
        res.pvalues = 1. - chi2.cdf(x=(res.params / res.bse) ** 2, df=1)
        # Add predicted y
        res.predict = pi

    return res
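# A hypothetical usage sketch for fit_logistic on synthetic data; it assumes
# the same imports as the function above (numpy, GLM, families,
# statsmodels.tools) are in scope, and all variable names are illustrative.
import numpy as np
from statsmodels.tools import add_constant

rng = np.random.default_rng(0)
X = add_constant(rng.normal(size=(200, 2)))              # intercept + 2 predictors
p = 1 / (1 + np.exp(-(0.5 + X[:, 1] - 0.8 * X[:, 2])))   # true probabilities
y = (rng.uniform(size=200) < p).astype(float)            # 0/1 outcomes

res = fit_logistic(X, y)                                 # standard (non-Firth) fit
print(res.params, res.aicc, res.bic)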
def iterate_logistic(X_hold, Y_hold, fixed_columns=[0], Firth=False):
    """
    Fits logistic regressions to the provided data, always including the
    fixed_columns and adding each remaining column in turn.
    Firth specifies if Firth regression should be used.
    Returns matrices of fitted betas, pvalues, aic, aicc (second-order AIC),
    and bic. See the usage sketch after this function.
    """
    l = np.size(fixed_columns) + 1
    k = np.shape(X_hold)[1]
    betas = np.zeros([k, l])
    pvalues = np.zeros([k, l])
    aic = np.zeros([k, 1])
    aicc = np.zeros([k, 1])
    bic = np.zeros([k, 1])

    # Fit constant
    if Firth:
        # keep only the intercept column
        n_cols = int(np.size(X_hold) / len(X_hold))
        null_X = np.delete(arr=X_hold, obj=range(n_cols)[1:n_cols], axis=1)
        (null_intercept, null_beta, null_bse, null_fitll,
         null_pi) = fit_firth(Y_hold, null_X, start_vec=None)
        # Using this as a way to return a model in the same class as GLM.
        res = GLM(Y_hold, null_X, family=families.Binomial()).fit()
        # AICc adjustment for the Firth model
        res.aicc = statsmodels.tools.eval_measures.aicc(
            null_fitll, nobs=res.nobs, df_modelwc=res.df_model + 1)
        # AIC
        res.aic = statsmodels.tools.eval_measures.aic(
            null_fitll, nobs=res.nobs, df_modelwc=res.df_model + 1)
        # BIC
        res.bic = statsmodels.tools.eval_measures.bic(
            null_fitll, nobs=res.nobs, df_modelwc=res.df_model + 1)
        # Store parameters, standard errors, likelihoods, and statistics
        res.params = np.array([null_intercept])
        # Get Wald p-values for the parameters
        res.pvalues = 1. - chi2.cdf(x=(res.params / null_bse) ** 2, df=1)
    else:
        res = fit_logistic(X_hold[:, 0], Y_hold)
    betas[0, :] = res.params
    pvalues[0, :] = res.pvalues
    aic[0] = res.aic
    aicc[0] = res.aicc
    bic[0] = res.bic

    # Set variable for use later
    resBase = copy.deepcopy(res)

    NAN = ~np.isnan(X_hold).any(axis=0)
    for i in range(1, k):
        if NAN[i]:
            if i not in fixed_columns:
                columns = fixed_columns.copy()
                columns.append(i)
                res = fit_logistic(X_hold[:, columns], Y_hold, Firth=Firth,
                                   resBase=resBase, LRtest=False)
                betas[i, :] = res.params
                pvalues[i, :] = res.pvalues
                aic[i] = res.aic
                aicc[i] = res.aicc
                bic[i] = res.bic
    return betas, pvalues, aic, aicc, bic
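# Sketch of screening candidate predictors with iterate_logistic, reusing the
# synthetic X and y from the fit_logistic example above; column 0 (the
# intercept) stays fixed, every other column is added one at a time.
betas, pvalues, aic, aicc, bic = iterate_logistic(X, y, fixed_columns=[0])
print(pvalues[1:, -1])   # Wald p-value of each candidate column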
def setup_class(cls):
    endog_bin = (endog > endog.mean()).astype(int)
    cls.cov_type = 'cluster'

    mod1 = GLM(endog_bin, exog, family=families.Binomial())
    cls.res1 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))

    mod1 = smd.Logit(endog_bin, exog)
    cls.res2 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))
def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1, fe_p=2,
             fep_names=None, vcp_names=None, vc_names=None):

    super(BinomialBayesMixedGLM, self).__init__(
        endog=endog, exog_fe=exog_fe, exog_vc=exog_vc, ident=ident,
        vcp_p=vcp_p, fe_p=fe_p, family=families.Binomial(),
        fep_names=fep_names, vcp_names=vcp_names, vc_names=vc_names)
def setup_class(cls):
    df = data_bin
    mod = GLM(df['constrict'], df[['const', 'log_rate', 'log_volumne']],
              family=families.Binomial())
    res = mod.fit(method="newton", tol=1e-10)

    from statsmodels.discrete.discrete_model import Logit
    mod2 = Logit(df['constrict'], df[['const', 'log_rate', 'log_volumne']])
    res2 = mod2.fit(method="newton", tol=1e-10)

    cls.infl1 = res.get_influence()
    cls.infl0 = res2.get_influence()
def setup_class(cls):
    endog_bin = (endog > endog.mean()).astype(int)
    cls.cov_type = 'cluster'

    mod1 = GLM(endog_bin, exog,
               family=families.Binomial(link=links.probit()))
    cls.res1 = mod1.fit(method='newton',
                        cov_type='cluster', cov_kwds=dict(groups=group))

    mod1 = smd.Probit(endog_bin, exog)
    cls.res2 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))
    cls.rtol = 1e-6
def senility_and_WAIS():
    '''Another example of logistic regression, Chapter 7.8, p. 143

    [tbd]: I don't understand how the "Binomial model" (grouped response)
    is supposed to work, in either language. See the sketch after this
    function for one possible reading.
    '''
    inFile = r'GLM_data/Table 7.8 Senility and WAIS.xls'
    df = get_data(inFile)

    # ungrouped
    model = smf.glm('s ~ x', data=df, family=sm_families.Binomial()).fit()
    print(model.summary())
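# A possible grouped ("Binomial model") formulation for the same data, as a
# sketch of what the docstring's [tbd] might intend: aggregate the 0/1
# responses s at each distinct WAIS score x into success/failure counts and
# put both counts on the left-hand side of the formula.
grouped = df.groupby('x', as_index=False).agg(s=('s', 'sum'), n=('s', 'count'))
model_grouped = smf.glm('s + I(n - s) ~ x', data=grouped,
                        family=sm_families.Binomial()).fit()
print(model_grouped.summary())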
def setup_class(cls):
    yi = np.array([0, 2, 14, 19, 30])
    ni = 40 * np.ones(len(yi))
    xi = np.arange(1, len(yi) + 1)
    exog = np.column_stack((np.ones(len(yi)), xi))
    endog = np.column_stack((yi, ni - yi))

    res = GLM(endog, exog, family=families.Binomial()).fit()

    cls.infl1 = res.get_influence()
    cls.infl0 = MLEInfluence(res)
    cls.cd_rtol = 5e-5
def setup_class(cls):
    cls.idx = slice(None)  # params sequence same as Stata
    # res1ul = Logit(data.endog, data.exog).fit(method="newton", disp=0)
    cls.res2 = reslogit.results_constraint2

    mod1 = GLM(spector_data.endog, spector_data.exog,
               family=families.Binomial())

    constr = 'x1 - x3 = 0'
    cls.res1m = mod1.fit_constrained(constr, atol=1e-10)

    R, q = cls.res1m.constraints.coefs, cls.res1m.constraints.constants
    cls.res1 = fit_constrained(mod1, R, q, fit_kwds={'atol': 1e-10})
    cls.constraints_rq = (R, q)
def setup_class(cls):
    cls.idx = slice(None)  # params sequence same as Stata
    cls.res2 = reslogit.results_constraint2

    mod1 = GLM(spector_data.endog, spector_data.exog,
               family=families.Binomial())

    constr = 'x1 - x3 = 0'
    cls.res1m = mod1.fit_constrained(constr, atol=1e-10)

    # patsy compatible constraints
    R, q = cls.res1m.constraints.coefs, cls.res1m.constraints.constants
    cls.res1 = fit_constrained(mod1, R, q, fit_kwds={'atol': 1e-10})
    cls.constraints_rq = (R, q)
def setup_class(cls):
    cls.idx = slice(None)  # params sequence same as Stata, but Stata
    # reports param = nan and we have param = value = 0

    cls.res2 = reslogit.results_constraint1

    mod1 = GLM(spector_data.endog, spector_data.exog,
               family=families.Binomial())

    constr = 'x1 = 2.8'
    cls.res1m = mod1.fit_constrained(constr)
    R, q = cls.res1m.constraints
    cls.res1 = fit_constrained(mod1, R, q)
def reply_analysis_report(data_input_path, data_output_path):
    reply_analysis = prepare_data_reply_analysis(data_input_path,
                                                 data_output_path)
    score = reply_analysis.assign(
        tweet_negative_score=lambda df: df.apply(
            lambda x: x["tweet_score"]
            if x["tweet_label"] == "NEGATIVE" else 1 - x["tweet_score"],
            axis=1),
        trump_negative_score=lambda df: df.apply(
            lambda x: x["trump_score"]
            if x["trump_label"] == "NEGATIVE" else 1 - x["trump_score"],
            axis=1))

    # column names must match those created by assign() above
    logits = scipy.special.logit(
        score[["tweet_negative_score", "trump_negative_score"]])
    ax = logits.plot(kind="scatter",
                     x="trump_negative_score",
                     y="tweet_negative_score",
                     alpha=0.1)
    # DataFrame.plot returns an Axes; save its figure
    ax.figure.savefig("plots/logit_sentiment_score.png")
    print(
        "Naive Sentiment Score Calculation",
        scipy.stats.pearsonr(logits["trump_negative_score"],
                             logits["tweet_negative_score"]))

    tmp_data = reply_analysis[~reply_analysis["trump_label"].isnull()]
    x = sm.add_constant(tmp_data[[
        # "created_at_trump_day",
        "created_at_trump_month",
        "created_at_trump_year",
        "followers_count_norm",
        "friends_count_norm",
        "listed_count_norm",
        "statuses_count_norm"
    ]])
    res = GLM(tmp_data['trump_label'].astype("category").cat.codes,
              x,
              family=families.Binomial()).fit(attach_wls=True, atol=1e-10)
    print(res.summary())

    print(
        "ATE",
        backdoor_binary_respose_ate(reply_analysis, "trump_label", "NEGATIVE",
                                    "tweet_label", "NEGATIVE",
                                    "trump_created_at", "tweet_created_at",
                                    datetime.timedelta(minutes=0), "1D"))
def probit_fit(x, resp):
    '''Probit fit with 95% CIs'''
    # binomial GLM with probit link (the link is passed to the family,
    # not to GLM itself)
    model = GLM(resp, add_constant(x),
                family=families.Binomial(link=families.links.probit()))
    mod_result = model.fit(disp=0)
    xt = np.linspace(np.min(x), np.max(x), 100)
    r_hat = mod_result.predict(add_constant(xt))
    pred_summ = mod_result.get_prediction(
        add_constant(xt)).summary_frame(alpha=0.05)
    ci_5, ci_95 = pred_summ['mean_ci_lower'], pred_summ['mean_ci_upper']
    return mod_result.params, r_hat, (xt, ci_5, ci_95)
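# Hypothetical call of probit_fit on a synthetic dose-response data set;
# all names below are illustrative only.
import numpy as np

rng = np.random.default_rng(1)
dose = np.repeat(np.linspace(0, 5, 10), 20)        # 10 dose levels, 20 each
p_true = 1 / (1 + np.exp(-(dose - 2.5)))           # true response curve
resp = (rng.uniform(size=dose.size) < p_true).astype(float)

params, r_hat, (xt, ci_5, ci_95) = probit_fit(dose, resp)
print(params)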
def test_influence_glm_bernoulli():
    # example uses Finney's data and is used in Pregibon 1981
    df = data_bin
    results_sas = np.asarray(results_sas_df)

    res = GLM(df['constrict'], df[['const', 'log_rate', 'log_volumne']],
              family=families.Binomial()).fit(attach_wls=True, atol=1e-10)

    infl = res.get_influence(observed=False)
    k_vars = 3

    assert_allclose(infl.dfbetas, results_sas[:, 5:8], atol=1e-4)
    assert_allclose(infl.d_params, results_sas[:, 5:8] * res.bse.values,
                    atol=1e-4)
    assert_allclose(infl.cooks_distance[0] * k_vars, results_sas[:, 8],
                    atol=6e-5)
    assert_allclose(infl.hat_matrix_diag, results_sas[:, 4], atol=6e-5)

    c_bar = infl.cooks_distance[0] * 3 * (1 - infl.hat_matrix_diag)
    assert_allclose(c_bar, results_sas[:, 9], atol=6e-5)
def from_formula(cls, formula, vc_formulas, data, vcp_p=1, fe_p=2):

    fam = families.Binomial()
    x = _BayesMixedGLM.from_formula(
        formula, vc_formulas, data, family=fam, vcp_p=vcp_p, fe_p=fe_p)

    # Copy over to the intended class structure
    mod = BinomialBayesMixedGLM(
        x.endog, x.exog, exog_vc=x.exog_vc, ident=x.ident, vcp_p=x.vcp_p,
        fe_p=x.fe_p, fep_names=x.fep_names, vcp_names=x.vcp_names,
        vc_names=x.vc_names)
    mod.data = x.data

    return mod
def setup_class(cls):
    vs = Independence()
    family = families.Binomial()

    np.random.seed(987126)
    Y = 1 * (np.random.normal(size=100) < 0)
    X1 = np.random.normal(size=100)
    X2 = np.random.normal(size=100)
    X3 = np.random.normal(size=100)
    groups = np.random.randint(0, 4, size=100)

    D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

    mod1 = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                            family=family, cov_struct=vs)
    cls.result1 = mod1.fit()

    mod2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=D, family=family)
    cls.result2 = mod2.fit(disp=False)
def test_score_binomial(miss_frac):
    np.random.seed(23424)
    n, p = 100, 5
    for d in range(1, 5):
        # Generate the data
        icept = np.linspace(3, 5, p)
        fac = np.random.normal(size=(p, d))
        fac, _, _ = np.linalg.svd(fac, full_matrices=False)
        sc = np.random.normal(size=(n, d))
        lp = np.dot(sc, fac.T) + icept
        mu = 1 / (1 + np.exp(-lp))
        endog = (np.random.uniform(size=(n, p)) < mu).astype(np.float64)

        valid = (np.random.uniform(size=(n, p)) > miss_frac).astype(bool)

        pca = GPCA(endog, d, family=families.Binomial(), valid=valid)
        par = np.concatenate((icept, fac.ravel()))
        grad = pca.score(par)
        ngrad = nd.Gradient(pca.loglike)(par)
        assert_allclose(grad, ngrad, rtol=1e-4, atol=1e-4)
def __init__(self):
    self.setup_class()  # why does nose do it properly
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families
    self.mod = lambda y, x: GLM(y, x, family=families.Binomial())
    self.y = self.y_bin
def exercise7d1():
    '''Logistic regression example, Chapter 7.3, p. 130

    [tbd]: the cloglog values are inconsistent with those mentioned in the
    book. This is probably due to the specific definitions of "loglog" and
    "cloglog" in the respective languages.
    '''
    inFile = r'GLM_data/Table 7.11 Hiroshima deaths.xls'
    df = get_data(inFile)
    df['radBin'] = np.array([0, 1, 10, 50, 100, 200])

    # adjust the unusual column names in the Excel file
    colNames = list(df.columns.values)
    colNames[2] = 'other'
    colNames[3] = 'total'
    df.columns = colNames
    df['n'] = df['total']
    df['y'] = df['leukemia']

    model = smf.glm('other + leukemia ~ radBin', data=df,
                    family=sm_families.Binomial()).fit()
    print(model.summary())

    print('-' * 65)
    print('Equivalent solution:')

    model = smf.glm('I(n - y) + y ~ radBin', data=df,
                    family=sm_families.Binomial()).fit()
    print(model.summary())

    # The fitted number of survivors can be obtained by
    fits = df['n'] * (1 - model.fittedvalues)
    print('Fits Logit:')
    print(fits)

    # The fits for other link functions are:
    model_probit = smf.glm('I(n - y) + y ~ radBin', data=df,
                           family=sm_families.Binomial(
                               sm_families.links.probit)).fit()
    print(model_probit.summary())

    fits_probit = df['n'] * (1 - model_probit.fittedvalues)
    print('Fits Probit:')
    print(fits_probit)

    model_cll = smf.glm('I(n - y) + y ~ radBin', data=df,
                        family=sm_families.Binomial(
                            sm_families.links.cloglog)).fit()
    print(model_cll.summary())
    fits_cll = df['n'] * (1 - model_cll.fittedvalues)
    print('Fits Extreme Value:')
    print(fits_cll)

    # Logit fit with approximate 2-sigma bounds
    x = np.arange(201)
    eta = model.params[0] + model.params[1] * x
    y = np.exp(eta) / (1 + np.exp(eta))
    etaUp = (model.params[0] + 2 * model.bse[0]
             + (model.params[1] + 2 * model.bse[1]) * x)
    yUp = np.exp(etaUp) / (1 + np.exp(etaUp))
    etaLo = (model.params[0] - 2 * model.bse[0]
             + (model.params[1] - 2 * model.bse[1]) * x)
    yLo = np.exp(etaLo) / (1 + np.exp(etaLo))

    plt.plot(df['radBin'], df['leukemia'] / df['total'], '.')
    plt.plot(x, 1 - y, linewidth=0.5, color='black', label='Logit Fit')
    plt.plot(x, 1 - yUp, '--', linewidth=0.5, color='red',
             label='2 Sigma Bounds')
    plt.plot(x, 1 - yLo, '--', linewidth=0.5, color='red')
    plt.title('Logit Fit')
    plt.xlabel('Radiation Dose')
    plt.ylabel('Probability of death from Leukemia')
    plt.legend()

    # Probit fit
    plt.figure()
    y = norm.cdf(model_probit.params[0] + model_probit.params[1] * x)
    yUp = norm.cdf(model_probit.params[0] + 2 * model_probit.bse[0]
                   + (model_probit.params[1] + 2 * model_probit.bse[1]) * x)
    yLo = norm.cdf(model_probit.params[0] - 2 * model_probit.bse[0]
                   + (model_probit.params[1] - 2 * model_probit.bse[1]) * x)
    plt.plot(df['radBin'], df['leukemia'] / df['total'], '.')
    plt.plot(x, 1 - y, linewidth=0.5, color='black', label='Probit Fit')
    plt.plot(x, 1 - yUp, '--', linewidth=0.5, color='red',
             label='2 Sigma Bounds')
    plt.plot(x, 1 - yLo, '--', linewidth=0.5, color='red')
    plt.title('Probit Fit')
    plt.xlabel('Radiation Dose')
    plt.ylabel('Probability of death from Leukemia')
    plt.legend()

    # Complementary log-log fit
    plt.figure()
    y = 1 - np.exp(-np.exp(model_cll.params[0] + model_cll.params[1] * x))
    yUp = 1 - np.exp(-np.exp(model_cll.params[0] + 2 * model_cll.bse[0]
                             + (model_cll.params[1]
                                + 2 * model_cll.bse[1]) * x))
    yLo = 1 - np.exp(-np.exp(model_cll.params[0] - 2 * model_cll.bse[0]
                             + (model_cll.params[1]
                                - 2 * model_cll.bse[1]) * x))
    plt.plot(df['radBin'], df['leukemia'] / df['total'], '.')
    plt.plot(x, 1 - y, linewidth=0.5, color='black', label='CLL Fit')
    plt.plot(x, 1 - yUp, '--', linewidth=0.5, color='red',
             label='2 Sigma Bounds')
    plt.plot(x, 1 - yLo, '--', linewidth=0.5, color='red')
    plt.title('CLL Fit')
    plt.xlabel('Radiation Dose')
    plt.ylabel('Probability of death from Leukemia')
    plt.legend()
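# The hand-built 2-sigma bands above shift the intercept and slope together
# and ignore their covariance. A sketch of letting statsmodels compute proper
# pointwise confidence intervals on the mean response instead, using the
# logit model from the function above (assumes pandas is imported as pd):
pred_frame = model.get_prediction(
    pd.DataFrame({'radBin': x})).summary_frame(alpha=0.05)
plt.fill_between(x, 1 - pred_frame['mean_ci_upper'],
                 1 - pred_frame['mean_ci_lower'], alpha=0.3, label='95% CI')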
def __init__(self, fam, nb_theta=None, mult_n=None):
    if fam == "poi":
        self.family = smf.Poisson()
    elif fam == "nb":
        if nb_theta is None:
            raise GlmpcaError(
                "Negative binomial dispersion parameter 'nb_theta' must be specified"
            )
        self.family = smf.NegativeBinomial(alpha=1 / nb_theta)
    elif fam in ("mult", "bern"):
        self.family = smf.Binomial()
        if fam == "mult" and mult_n is None:
            raise GlmpcaError(
                "Multinomial sample size parameter vector 'mult_n' must be specified"
            )
    else:
        raise GlmpcaError("unrecognized family type")

    # variance function, determined by GLM family
    vfunc = self.family.variance
    # inverse link func, mu as a function of linear predictor R
    ilfunc = self.family.link.inverse
    # derivative of inverse link function, dmu/dR
    hfunc = self.family.link.inverse_deriv

    self.glmpca_fam = fam
    if fam == "poi":
        def infograd(Y, R):
            M = ilfunc(R)  # ilfunc = exp
            return {"grad": (Y - M), "info": M}
    elif fam == "nb":
        def infograd(Y, R):
            M = ilfunc(R)  # ilfunc = exp
            W = 1 / vfunc(M)
            return {"grad": (Y - M) * W * M, "info": W * (M**2)}
        self.nb_theta = nb_theta
    elif fam == "mult":
        def infograd(Y, R):
            P = ilfunc(R)  # ilfunc = expit, P very small probabilities
            return {"grad": Y - (mult_n * P), "info": mult_n * vfunc(P)}
        self.mult_n = mult_n
    elif fam == "bern":
        def infograd(Y, R):
            P = ilfunc(R)
            return {"grad": Y - P, "info": vfunc(P)}
    else:  # this is not actually used but keeping for future reference
        # this is the most generic formula for a GLM but computationally slow
        raise GlmpcaError("invalid fam")

        def infograd(Y, R):
            M = ilfunc(R)
            W = 1 / vfunc(M)
            H = hfunc(R)
            return {"grad": (Y - M) * W * H, "info": W * (H**2)}
    self.infograd = infograd

    # create deviance function
    if fam == "mult":
        def dev_func(Y, R):
            return mat_binom_dev(Y, ilfunc(R), mult_n)
    else:
        def dev_func(Y, R):
            return self.family.deviance(Y, ilfunc(R))
    self.dev_func = dev_func
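# Illustrative check that the generic GLM formulas in the unused branch above
# reduce to the Bernoulli shortcut: with a logit link, W = 1/(P*(1-P)) and
# h = dmu/dR = P*(1-P), so (Y - M)*W*H collapses to Y - P and W*H**2 to
# P*(1-P). A minimal numeric sketch:
import numpy as np

R = np.linspace(-2, 2, 5)            # linear predictor
P = 1 / (1 + np.exp(-R))             # inverse logit
Y = np.array([0., 1., 1., 0., 1.])
W = 1 / (P * (1 - P))                # 1 / variance function
H = P * (1 - P)                      # derivative of the inverse link

assert np.allclose((Y - P) * W * H, Y - P)
assert np.allclose(W * H**2, P * (1 - P))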
def mod(y, x):
    return GLM(y, x, family=families.Binomial())
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families

import statsmodels.stats.tests.test_influence

test_module = statsmodels.stats.tests.test_influence.__file__
cur_dir = os.path.abspath(os.path.dirname(test_module))

file_name = 'binary_constrict.csv'
file_path = os.path.join(cur_dir, 'results', file_name)
df = pd.read_csv(file_path, index_col=0)

res = GLM(df['constrict'], df[['const', 'log_rate', 'log_volumne']],
          family=families.Binomial()).fit(attach_wls=True, atol=1e-10)
print(res.summary())

# ## Get the influence measures
#
# GLMResults has a `get_influence` method similar to OLSResults, which
# returns an instance of the GLMInfluence class. This class has methods and
# (cached) attributes to inspect influence and outlier measures.
#
# These measures are based on a one-step approximation to the results of
# deleting one observation. One-step approximations are usually accurate
# for small changes but underestimate the magnitude of large changes. Even
# though large changes are underestimated, they still clearly show the
# effect of influential observations.
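# Collecting the influence measures described above into one table:
# get_influence returns a GLMInfluence instance, and summary_frame gathers
# the per-observation statistics (this mirrors the statsmodels influence
# example).
infl = res.get_influence(observed=False)
summ_df = infl.summary_frame()
print(summ_df.sort_values("cooks_d", ascending=False)[:10])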
plt.rc("figure", figsize=(16, 8)) plt.rc("font", size=14) import statsmodels.stats.tests.test_influence test_module = statsmodels.stats.tests.test_influence.__file__ cur_dir = cur_dir = os.path.abspath(os.path.dirname(test_module)) file_name = "binary_constrict.csv" file_path = os.path.join(cur_dir, "results", file_name) df = pd.read_csv(file_path, index_col=0) res = GLM( df["constrict"], df[["const", "log_rate", "log_volumne"]], family=families.Binomial(), ).fit(attach_wls=True, atol=1e-10) print(res.summary()) # ## get the influence measures # # GLMResults has a `get_influence` method similar to OLSResults, that # returns and instance of the GLMInfluence class. This class has methods and # (cached) attributes to inspect influence and outlier measures. # # This measures are based on a one-step approximation to the the results # for deleting one observation. One-step approximations are usually accurate # for small changes but underestimate the magnitude of large changes. Event # though large changes are underestimated, they still show clearly the # effect of influential observations #
if example == 1:
    print("normal")
    m = AdditiveModel(d)
    m.fit(y)
    x = np.linspace(-2, 2, 50)
    print(m)

import scipy.stats
import time

if example == 2:
    print("binomial")
    mod_name = 'Binomial'
    f = families.Binomial()
    # b = np.asarray([scipy.stats.bernoulli.rvs(p) for p in f.link.inverse(y)])
    b = np.asarray([scipy.stats.bernoulli.rvs(p) for p in f.link.inverse(z)])
    b.shape = y.shape
    m = GAM(b, d, family=f)
    toc = time.time()
    m.fit(b)
    tic = time.time()
    print(tic - toc)
    # for plotting
    yp = f.link.inverse(y)
    p = b

if example == 3:
    print("Poisson")