def test_custom_models(self, sdata):
    """Smoke test: IPTW accepts a custom denominator model and its weights feed a GEE.

    The test makes no assertions; it passes as long as none of the calls raise.
    """
    covariates = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'

    # Weights come from an IPTW object whose denominator model is replaced by
    # an L1-penalized logistic regression
    weighter = IPTW(sdata, treatment='art', standardize='unexposed', stabilized=True)
    weighter.regression_models(
        covariates,
        custom_model_denominator=LogisticRegression(penalty='l1', C=1.0, random_state=203),
    )
    weighter.fit()
    sdata['iptw'] = weighter.Weight

    # Estimating GEE
    linear_risk = sm.families.family.Binomial(sm.families.links.identity)
    smf.gee('dead ~ art', sdata['id'], sdata,
            cov_struct=sm.cov_struct.Independence(),
            family=linear_risk,
            weights=sdata['iptw']).fit()
def outcome_model(self, model, print_results=True):
    """Fit the outcome model (also referred to as the Q-model).

    A binomial family is used when ``self.outcome_type == 'binary'`` and a
    Gaussian family otherwise. Without weights the model is a GLM; with
    weights it is a GEE grouped on the dataframe index. On success
    ``self.model_fit`` is set to True.

    Parameters
    ----------
    model : str
        Right-hand side of the model formula, e.g. 'var1 + var2 + var3 + var4'.
        Variables must exist in the stored dataframe and the form should
        contain the exposure.
    print_results : bool, optional
        Whether to print the fitted model summary. Default is True
    """
    # The default links (logit for Binomial, identity for Gaussian) are relied on
    # instead of passing link classes, which recent statsmodels releases reject
    if self.outcome_type == 'binary':
        linkdist = sm.families.family.Binomial()
    else:
        linkdist = sm.families.family.Gaussian()

    # Modeling the outcome
    formula = self.outcome + ' ~ ' + model
    if self._weights is None:
        m = smf.glm(formula, self.gf, family=linkdist)
    else:
        m = smf.gee(formula, self.gf.index, self.gf, family=linkdist,
                    weights=self.gf[self._weights])

    # NOTE: this assignment replaces the bound method ``outcome_model`` on the
    # instance with the fitted results; the attribute name is kept as-is because
    # code outside this block may read it
    self.outcome_model = m.fit()

    # Printing results of the model and if any observations were dropped
    if print_results:
        print(self.outcome_model.summary())

    self.model_fit = True
def modeler(model, lists, linkdist=sm.families.family.Poisson()):
    """Fit a GEE to the module-level ``df`` and append summary statistics to ``lists``.

    Reads the module-level names ``df`` (data, clustered on its 'id' column)
    and ``true_direct_ve`` (the value the confidence interval is checked
    against). If anything in the estimation fails, NaN is appended to every
    storage list instead.

    Parameters
    ----------
    model : str
        Model formula passed to ``smf.gee``. The confidence limits are taken
        from the row labelled 'Vac', while the point estimate and standard
        error are taken from the second parameter.
    lists : sequence of list
        Four storage lists, appended to in place: estimate, standard error,
        coverage indicator (1/0) and confidence limit ratio.
    linkdist : statsmodels family, optional
        Family used for the GEE. Default is Poisson (the default instance is
        created once, when the function is defined).
    """
    try:
        # Modified Poisson Regression Model
        ind = sm.cov_struct.Independence()
        log = smf.gee(model, 'id', df, family=linkdist, cov_struct=ind).fit()

        # Estimated Direct Effect and Standard Error (explicitly positional:
        # integer keys on a label-indexed Series are deprecated in pandas)
        dvebeta = log.params.iloc[1]
        dvese = log.bse.iloc[1]

        # Estimated Confidence Intervals (columns of conf_int() are labelled 0 and 1)
        vac_limits = log.conf_int().loc['Vac']
        dlcl = vac_limits[0]
        ducl = vac_limits[1]
        dciv = 1 if dlcl < true_direct_ve < ducl else 0
        dclr = np.exp(ducl) / np.exp(dlcl)
        row = (dvebeta, dvese, dciv, dclr)
    except Exception:
        # If model doesn't converge, add NaN to list. Only Exception is caught
        # so that KeyboardInterrupt / SystemExit still stop the simulation.
        row = (np.nan, np.nan, np.nan, np.nan)

    # Adding results to the end of storage lists
    lists[0].append(row[0])
    lists[1].append(row[1])
    lists[2].append(row[2])
    lists[3].append(row[3])
def outcome_model(self, model, print_results=True):
    """Fit the outcome model (the Q-model) and store it on ``self._outcome_model``.

    The family follows ``self.outcome_type``: 'binary' gives Binomial, 'normal'
    gives Gaussian and anything else gives Poisson. A GLM is fit when no
    weights are set; otherwise a GEE grouped on the dataframe index is fit
    with the stored weight column.

    Parameters
    ----------
    model : str
        Variables to include in the model for predicting the outcome. Must be
        contained within the input pandas dataframe when initialized. Model
        form should contain the exposure, i.e. 'art + age + male'
    print_results : bool, optional
        Whether to print the regression results to the terminal. Default is True
    """
    families = sm.families.family
    family_cls = (families.Binomial if self.outcome_type == 'binary'
                  else families.Gaussian if self.outcome_type == 'normal'
                  else families.Poisson)
    distribution = family_cls()

    # Modeling the outcome
    formula = self.outcome + ' ~ ' + model
    if self._weights is not None:
        unfitted = smf.gee(formula, self.gf.index, self.gf, family=distribution,
                           weights=self.gf[self._weights])
    else:
        unfitted = smf.glm(formula, self.gf, family=distribution)
    self._outcome_model = unfitted.fit()

    # Printing results of the model and if any observations were dropped
    if print_results:
        print(self._outcome_model.summary())
def censoring_model(self, model, restriction=None, print_results=True):
    """Fit a binomial regression model for remaining uncensored.

    The dependent variable is the '__uncensored__' column of the stored data.
    The fitted model is saved as ``self.cens_model`` and
    ``self._censor_model_fit`` is set to True.

    Parameters
    ----------
    model : str
        Right-hand side of the model formula, following patsy standards.
        For example) 'var1 + var2 + var3 + var4'
    restriction : str, optional
        Expression, evaluated with ``eval``, that selects the rows the model is
        fit to. The dataframe must be referred to as 'g'.
        For example) "g['art']==1"
    print_results : bool, optional
        Whether to print the fitted model summary. Default is True
    """
    # The local must be called `g`: restriction expressions are written against that name
    g = self.gf.copy()
    if restriction is not None:
        g = g.loc[eval(restriction)].copy()

    binomial = sm.families.family.Binomial()
    formula = '__uncensored__ ~ ' + model
    if self._weights is not None:
        # Weighted g-formula
        unfitted = smf.gee(formula, self.idvar, g, weights=g[self._weights], family=binomial)
    else:
        # Unweighted g-formula
        unfitted = smf.glm(formula, g, family=binomial)
    self.cens_model = unfitted.fit()

    if print_results:
        print(self.cens_model.summary())

    self._censor_model_fit = True
def test_match_sas_smr_u_stabilized(self, sdata):
    """Stabilized weights standardized to the unexposed reproduce the SAS risk difference."""
    sas_rd = -0.080048197
    sas_rd_ci = -0.153567335, -0.006529058

    model = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    ipt = IPTW(sdata, treatment='art', standardize='unexposed', stabilized=True)
    ipt.regression_models(model)
    ipt.fit()
    sdata['iptw'] = ipt.Weight

    # Estimating GEE
    ind = sm.cov_struct.Independence()
    f = sm.families.family.Binomial(sm.families.links.identity)
    linrisk = smf.gee('dead ~ art', sdata['id'], sdata, cov_struct=ind, family=f,
                      weights=sdata['iptw']).fit()

    # The treatment coefficient is the second parameter. `.iloc` makes the positional
    # lookup explicit; integer keys on a label-indexed Series are deprecated in pandas.
    limits = linrisk.conf_int()
    npt.assert_allclose(linrisk.params.iloc[1], sas_rd, rtol=1e-5)
    npt.assert_allclose((limits[0].iloc[1], limits[1].iloc[1]), sas_rd_ci, rtol=1e-4)
def exposure_model(self, model, restriction=None, print_results=True):
    """Fit a binomial regression model for the exposure.

    The fitted model is saved as ``self.exp_model`` and
    ``self._exposure_model_fit`` is set to True.

    Parameters
    ----------
    model : str
        Variables used to predict the exposure. Must be contained within the
        stored dataframe. Format follows patsy standards.
        For example) 'var1 + var2 + var3 + var4'
    restriction : str, optional
        Expression, evaluated with ``eval``, that selects the rows the model is
        fit to. The dataframe must be referred to as 'g'.
        For example) "g['art']==1"
    print_results : bool, optional
        Whether to print the fitted model summary. Default is True
    """
    # The local must be called `g`: restriction expressions are written against that name
    g = self.gf.copy()
    if restriction is not None:
        g = g.loc[eval(restriction)].copy()

    binomial = sm.families.family.Binomial()
    formula = self.exposure + ' ~ ' + model
    if self._weights is not None:
        # Weighted g-formula
        unfitted = smf.gee(formula, self.idvar, g, weights=g[self._weights], family=binomial)
    else:
        # Unweighted g-formula
        unfitted = smf.glm(formula, g, family=binomial)
    self.exp_model = unfitted.fit()

    if print_results:
        print(self.exp_model.summary())

    self._exposure_model_fit = True
def test_match_sas_unstabilized(self, sdata):
    """Unstabilized weights reproduce the SAS weight sum and risk difference."""
    sas_w_sum = 1038.051
    sas_rd = -0.081519085
    sas_rd_ci = -0.156199938, -0.006838231

    model = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    ipt = IPTW(sdata, treatment='art', stabilized=False)
    ipt.regression_models(model)
    ipt.fit()
    sdata['iptw'] = ipt.Weight
    npt.assert_allclose(np.sum(sdata.dropna()['iptw']), sas_w_sum, rtol=1e-4)

    # Estimating GEE
    ind = sm.cov_struct.Independence()
    f = sm.families.family.Binomial(sm.families.links.identity)
    linrisk = smf.gee('dead ~ art', sdata['id'], sdata, cov_struct=ind, family=f,
                      weights=sdata['iptw']).fit()

    # The treatment coefficient is the second parameter. `.iloc` makes the positional
    # lookup explicit; integer keys on a label-indexed Series are deprecated in pandas.
    limits = linrisk.conf_int()
    npt.assert_allclose(linrisk.params.iloc[1], sas_rd, rtol=1e-5)
    npt.assert_allclose((limits[0].iloc[1], limits[1].iloc[1]), sas_rd_ci, rtol=1e-4)
def exposure_model(self, model, restriction=None, print_results=True):
    """Fit the logistic regression model for the exposure.

    The fitted model is saved as ``self.exp_model`` and
    ``self._exposure_model_fit`` is set to True.

    Parameters
    ----------
    model : str
        Variables used to predict the exposure. Must be contained within the
        stored dataframe. Format is the same as the functional form.
        Example) 'var1 + var2 + var3 + var4'
    restriction : str, optional
        Expression, evaluated with ``eval``, that selects the rows the model is
        fit to. Useful for Intent-to-Treat model fitting. The dataframe must be
        referred to as 'g'. Example) "g['art']==1"
    print_results : bool, optional
        Whether to print the logistic regression results to the terminal.
        Default is True
    """
    # The local must be called `g`: restriction expressions are written against that name
    g = self.gf.copy()
    if restriction is not None:
        g = g.loc[eval(restriction)].copy()

    # Logit is the Binomial default link. Relying on the default avoids passing a
    # link class, which recent statsmodels releases reject.
    linkdist = sm.families.family.Binomial()

    formula = self.exposure + ' ~ ' + model
    if self._weights is None:
        # Unweighted g-formula
        self.exp_model = smf.glm(formula, g, family=linkdist).fit()
    else:
        # Weighted g-formula
        self.exp_model = smf.gee(formula, self.idvar, g, weights=g[self._weights],
                                 family=linkdist).fit()

    if print_results:
        print(self.exp_model.summary())

    self._exposure_model_fit = True
def RR(formula, idvar, df, printOutput=True):
    """Perform relative risk regression for dichotomous outcomes.

    Uses a working Poisson model fit by GEE and its empirical ("robust")
    variance estimator. Nothing is returned; the results are stored as
    attributes on the function object:

    * ``RR.rrs`` - exponentiated coefficients, first parameter row dropped
    * ``RR.ci`` - exponentiated confidence limits ('LCL', 'UCL'), first row dropped

    Parameters
    ----------
    formula : str
        A formula expression for the model.
    idvar : str
        An identifier for each independent observation of the data
        (typically a row).
    df : pandas.DataFrame
        The data.
    printOutput : bool, optional
        Whether the function should print the output. Default is True

    Examples
    --------
    import pandas as pd
    carrot = pd.read_stata("https://stats.idre.ucla.edu/stat/stata/faq/eyestudy.dta")
    RR("lenses ~ carrot + gender + latitude", "id", carrot)
    # Note: this choice of reference category is different than the IDRE analysis

    References
    ----------
    Lumley, T., Kronmal, R., & Ma, S. (2006). Relative risk regression in
    medical research: models, contrasts, estimators, and algorithms.
    """
    results = smf.gee(formula, idvar, df, family=sm.families.Poisson()).fit()
    if printOutput:
        print("Relative Risk Regression")
        print(
            "-----------------------------------------------------------------------------------"
        )
        print(results.summary())

    fits = results.fittedvalues
    if printOutput:
        # Count fitted values above one once, vectorized, instead of
        # iterating the values in Python twice
        n_above_one = (fits > 1).sum()
        print("Additional diagnostics:")
        print(n_above_one, "observations have fitted probabilities greater than one")
        print((n_above_one / len(fits)) * 100,
              "% of observations have fitted probabilities greater than one")
        print(
            "=============================================================================="
        )
        print("Relative Risk:")

    RRs = np.exp(results.params).to_frame().rename(columns={0: 'RR'})
    RR.rrs = RRs.drop(RRs.index[[0]])  # drop the first parameter row
    if printOutput:
        print(RR.rrs)
        print(
            "------------------------------------------------------------------------------"
        )
        print("95% Confidence Intervals for Relative Risk:")

    CIs = np.exp(results.conf_int().rename(columns={0: 'LCL', 1: 'UCL'}))
    RR.ci = CIs.drop(CIs.index[[0]])
    if printOutput:
        print(RR.ci)
        print(
            "=============================================================================="
        )
def test_missing():
    """GEE built from a formula drops missing rows like an explicitly cleaned GEE (gh-1877)."""
    # gh-1877
    data = [['id', 'al', 'status', 'fake', 'grps'],
            ['4A', 'A', 1, 1, 0],
            ['5A', 'A', 1, 2.0, 1],
            ['6A', 'A', 1, 3, 2],
            ['7A', 'A', 1, 2.0, 3],
            ['8A', 'A', 1, 1, 4],
            ['9A', 'A', 1, 2.0, 5],
            ['11A', 'A', 1, 1, 6],
            ['12A', 'A', 1, 2.0, 7],
            ['13A', 'A', 1, 1, 8],
            ['14A', 'A', 1, 1, 9],
            ['15A', 'A', 1, 1, 10],
            ['16A', 'A', 1, 2.0, 11],
            ['17A', 'A', 1, 3.0, 12],
            ['18A', 'A', 1, 3.0, 13],
            ['19A', 'A', 1, 2.0, 14],
            ['20A', 'A', 1, 2.0, 15],
            ['2C', 'C', 0, 3.0, 0],
            ['3C', 'C', 0, 1, 1],
            ['4C', 'C', 0, 1, 2],
            ['5C', 'C', 0, 2.0, 3],
            ['6C', 'C', 0, 1, 4],
            ['9C', 'C', 0, 1, 5],
            ['10C', 'C', 0, 3, 6],
            ['12C', 'C', 0, 3, 7],
            ['14C', 'C', 0, 2.5, 8],
            ['15C', 'C', 0, 1, 9],
            ['17C', 'C', 0, 1, 10],
            ['22C', 'C', 0, 1, 11],
            ['23C', 'C', 0, 1, 12],
            ['24C', 'C', 0, 1, 13],
            ['32C', 'C', 0, 2.0, 14],
            ['35C', 'C', 0, 1, 15]]

    df = pd.DataFrame(data[1:], columns=data[0])
    # `.ix` was removed from pandas; `.loc` performs the same boolean-mask/label assignment
    df.loc[df.fake == 1, 'fake'] = np.nan
    mod = smf.gee('status ~ fake', data=df, groups='grps',
                  cov_struct=sm.cov_struct.Independence(),
                  family=sm.families.Binomial())

    # Explicit copy so adding the constant column never writes into a view of `df`
    df = df.dropna().copy()
    df['constant'] = 1

    mod2 = GEE(df.status, df[['constant', 'fake']], groups=df.grps,
               cov_struct=sm.cov_struct.Independence(),
               family=sm.families.Binomial())

    assert_equal(mod.endog, mod2.endog)
    assert_equal(mod.exog, mod2.exog)
    assert_equal(mod.groups, mod2.groups)

    res = mod.fit()
    res2 = mod2.fit()

    assert_almost_equal(res.params.values, res2.params.values)
def test_missing():
    """GEE built from a formula drops missing rows like an explicitly cleaned GEE (gh-1877)."""
    # gh-1877
    data = [['id', 'al', 'status', 'fake', 'grps'],
            ['4A', 'A', 1, 1, 0],
            ['5A', 'A', 1, 2.0, 1],
            ['6A', 'A', 1, 3, 2],
            ['7A', 'A', 1, 2.0, 3],
            ['8A', 'A', 1, 1, 4],
            ['9A', 'A', 1, 2.0, 5],
            ['11A', 'A', 1, 1, 6],
            ['12A', 'A', 1, 2.0, 7],
            ['13A', 'A', 1, 1, 8],
            ['14A', 'A', 1, 1, 9],
            ['15A', 'A', 1, 1, 10],
            ['16A', 'A', 1, 2.0, 11],
            ['17A', 'A', 1, 3.0, 12],
            ['18A', 'A', 1, 3.0, 13],
            ['19A', 'A', 1, 2.0, 14],
            ['20A', 'A', 1, 2.0, 15],
            ['2C', 'C', 0, 3.0, 0],
            ['3C', 'C', 0, 1, 1],
            ['4C', 'C', 0, 1, 2],
            ['5C', 'C', 0, 2.0, 3],
            ['6C', 'C', 0, 1, 4],
            ['9C', 'C', 0, 1, 5],
            ['10C', 'C', 0, 3, 6],
            ['12C', 'C', 0, 3, 7],
            ['14C', 'C', 0, 2.5, 8],
            ['15C', 'C', 0, 1, 9],
            ['17C', 'C', 0, 1, 10],
            ['22C', 'C', 0, 1, 11],
            ['23C', 'C', 0, 1, 12],
            ['24C', 'C', 0, 1, 13],
            ['32C', 'C', 0, 2.0, 14],
            ['35C', 'C', 0, 1, 15]]

    df = pd.DataFrame(data[1:], columns=data[0])
    # `.ix` was removed from pandas; `.loc` performs the same boolean-mask/label assignment
    df.loc[df.fake == 1, 'fake'] = np.nan
    mod = smf.gee('status ~ fake', data=df, groups='grps',
                  cov_struct=sm.cov_struct.Independence(),
                  family=sm.families.Binomial())

    # Explicit copy so adding the constant column never writes into a view of `df`
    df = df.dropna().copy()
    df['constant'] = 1

    mod2 = GEE(df.status, df[['constant', 'fake']], groups=df.grps,
               cov_struct=sm.cov_struct.Independence(),
               family=sm.families.Binomial())

    assert_equal(mod.endog, mod2.endog)
    assert_equal(mod.exog, mod2.exog)
    assert_equal(mod.groups, mod2.groups)

    res = mod.fit()
    res2 = mod2.fit()

    assert_almost_equal(res.params.values, res2.params.values)
def trend_model(df, group_var, formula):
    """Fit a binomial generalized estimating equation and return the fitted results.

    Observations are clustered by ``group_var`` so that dependency (nesting)
    within a cluster is accounted for; the original documentation refers to
    nesting within journal.
    """
    binomial = sm.families.Binomial()
    unfitted = smf.gee(formula, group_var, data=df, family=binomial)
    return unfitted.fit()
def generalized_estimating_equation_example():
    """Fit a Poisson GEE with exchangeable correlation to the MASS 'epil' data and print it."""
    epilepsy = sm.datasets.get_rdataset('epil', package='MASS').data

    # Observations are grouped by the 'subject' column
    fitted = smf.gee(
        'y ~ age + trt + base',
        'subject',
        epilepsy,
        cov_struct=sm.cov_struct.Exchangeable(),
        family=sm.families.Poisson(),
    ).fit()
    print(fitted.summary())
def outcome_model(self, model, continuous_distribution='gaussian', print_results=True):
    r"""Specify and fit the outcome model.

    For a binary outcome the model is a logistic regression

    .. math::

        \widehat{\Pr}(Y|A,L) = logit^{-1}(\widehat{\beta_0} + \widehat{\beta_1} A + \widehat{\beta} L)

    After fitting, predicted outcomes with the exposure set to 1 and to 0 are
    written to the '_pY1_' and '_pY0_' columns of ``self.df`` and
    ``self._fit_outcome_`` is set to True.

    Parameters
    ----------
    model : str
        Independent variables to predict the outcome. For example,
        'var1 + var2 + var3 + var4'
    continuous_distribution : str, optional
        Distribution to use for continuous outcomes. Options are 'gaussian'
        (or 'normal') for normal distributions and 'poisson' for Poisson
        distributions. Ignored when the outcome is not continuous
    print_results : bool, optional
        Whether to print the fitted model results. Default is True (prints
        results)

    Raises
    ------
    ValueError
        If the outcome is continuous and the distribution is not supported
    """
    self._out_model = self._outcome + ' ~ ' + model

    if not self._continuous_outcome:
        family = sm.families.family.Binomial()
    elif continuous_distribution in ('gaussian', 'normal'):
        family = sm.families.family.Gaussian()
    elif continuous_distribution == 'poisson':
        family = sm.families.family.Poisson()
    else:
        raise ValueError("Only 'gaussian' and 'poisson' distributions are supported")

    # A GEE grouped on the dataframe index is used only when weights are present
    if self._weight_ is not None:
        fitted = smf.gee(self._out_model, self.df.index, self.df,
                         weights=self.df[self._weight_], family=family).fit()
    else:
        fitted = smf.glm(self._out_model, self.df, family=family).fit()

    if print_results:
        print('\n----------------------------------------------------------------')
        print('MODEL: ' + self._out_model)
        print('-----------------------------------------------------------------')
        print(fitted.summary())

    # Predict under "everyone exposed" first, then "no one exposed"
    for level, column in ((1, '_pY1_'), (0, '_pY0_')):
        counterfactual = self.df.copy()
        counterfactual[self._exposure] = level
        self.df[column] = fitted.predict(counterfactual)

    self._fit_outcome_ = True
def outcome_model(self, model, restriction=None, print_results=True):
    """Fit the regression model for the outcome.

    Without weights a binomial GLM is fit, or a multinomial logit when
    ``self._competing_event`` is set. With weights a binomial GEE grouped on
    ``self.idvar`` is fit. The result is saved as ``self.out_model`` and
    ``self._outcome_model_fit`` is set to True.

    Parameters
    ----------
    model : str
        Variables to include in the model for predicting the outcome. Must be
        contained within the stored dataframe. Format follows patsy standards.
        For example) 'var1 + var2 + var3 + var4'
    restriction : str, optional
        Expression, evaluated with ``eval``, that selects the rows the model is
        fit to. The dataframe must be referred to as 'g'.
        For example) "g['art']==1"
    print_results : bool, optional
        Whether to print the fitted model summary. Default is True

    Raises
    ------
    ValueError
        If weights are used together with competing events
    """
    # The local must be called `g`: restriction expressions are written against that name
    g = self.gf.copy()
    if restriction is not None:
        g = g.loc[eval(restriction)].copy()

    binomial = sm.families.family.Binomial()
    weighted = self._weights is not None
    if weighted and self._competing_event:
        raise ValueError(
            "The weighted MonteCarloGFormula is not supported for competing events"
        )

    formula = self.outcome + ' ~ ' + model
    if weighted:
        # Weighted g-formula
        self.out_model = smf.gee(formula, self.idvar, g, weights=g[self._weights],
                                 family=binomial).fit()
    elif self._competing_event:
        # Unweighted g-formula with competing events
        self.out_model = sm.MNLogit.from_formula(formula, g).fit()
    else:
        # Unweighted g-formula
        self.out_model = smf.glm(formula, g, family=binomial).fit()

    if print_results:
        print(self.out_model.summary())

    self._outcome_model_fit = True
def test_match_iptw_continuous(self, cdata):
    """StochasticIPTW under p=1 versus p=0 matches the IPTW marginal structural model."""
    model = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    cdata = cdata.dropna().copy()

    # Estimating Marginal Structural Model
    ipt = IPTW(cdata, treatment='art', stabilized=False)
    ipt.regression_models(model)
    ipt.fit()
    cdata['iptw'] = ipt.Weight
    ind = sm.cov_struct.Independence()
    f = sm.families.family.Gaussian()
    linrisk = smf.gee('cd4_wk45 ~ art', cdata['id'], cdata, cov_struct=ind, family=f,
                      weights=cdata['iptw']).fit()

    # Estimating 'Stochastic Treatment' (same covariate set as the IPTW model above)
    sipw = StochasticIPTW(cdata, treatment='art', outcome='cd4_wk45')
    sipw.treatment_model(model=model, print_results=False)
    sipw.fit(p=1.0)
    r_all = sipw.marginal_outcome
    sipw.fit(p=0.0)
    r_non = sipw.marginal_outcome

    # The treatment coefficient is the second parameter. `.iloc` makes the positional
    # lookup explicit; integer keys on a label-indexed Series are deprecated in pandas.
    npt.assert_allclose(linrisk.params.iloc[1], r_all - r_non, atol=1e-4)
def fit(self, X, y=None):
    """Fit a Poisson GEE with exchangeable correlation on ``X`` and ``y``.

    Every column of ``X`` except ``self.group_by_feature`` is a predictor of
    'y'. Observations are grouped by a column derived from the group-by
    feature. The unfitted model is stored as ``self.mod``, the fitted results
    as ``self.res`` and the distinct labels seen in ``y`` as
    ``self.class_labels``.

    Returns
    -------
    self
    """
    # Same settings as the documentation's example:
    self.fam = sm.families.Poisson()
    self.ind = sm.cov_struct.Exchangeable()

    # save the seen class labels
    self.class_labels = np.unique(y)

    predictors = [column for column in X.columns if column != self.group_by_feature]
    formula_in = 'y ~ ' + ' + '.join(predictors)

    # Working copy holding the outcome and the grouping column
    group_column = self.group_by_feature + "_group"
    data = X.copy()
    data['y'] = y
    # roughly make ten groups
    # NOTE(review): (x * 10) // 10 is approximately floor(x) - confirm this yields the intended grouping
    data[group_column] = (data[self.group_by_feature] * 10) // 10

    self.mod = smf.gee(formula_in, group_column, data,
                       cov_struct=self.ind, family=self.fam)
    self.res = self.mod.fit()
    return self
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import math
import pandas as pd
import random

# Reference example kept from the original (not executed):
# data = sm.datasets.get_rdataset("epil", "MASS").data
# print(data)
# md = smf.gee("y ~ age + trt + base", "subject", data,
#              cov_struct=sm.cov_struct.Independence(),
#              family=sm.families.Poisson())
# mdf = md.fit()
# print(mdf.summary())

# Synthetic data: x2 holds one-element lists for 0.0 .. 99.0; y is a linear
# trend in that value plus a sine term scaled by a uniform random factor.
y=[]
x2=[[xx] for xx in np.arange(0.,100.,1.)]
for kk in x2:
    y.append(0.5*kk[0]+3+10*math.sin(kk[0]/3.14)*random.random())
panddf=pd.DataFrame({'x2':x2,'y':y})

# NOTE(review): the formula argument is "x2" (no "~") and "y" is passed as the
# groups argument, unlike the reference example above, which uses
# "y ~ age + trt + base" with a separate grouping column - confirm the intent.
md2 = smf.gee("x2", "y", panddf,
              cov_struct=sm.cov_struct.Independence(),
              family=sm.families.Poisson())
mdf2 = md2.fit()
print(mdf2.summary())
"Smokers F", "Smokers M", "Hospital beds", "Life expectancy", "HDI") # plot specifications ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap="Spectral", square=True) ax.set_xticklabels(labels, size=8, rotation=25, horizontalalignment="right") ax.set_yticklabels(labels, size=8, rotation=25, verticalalignment="top") plt.title("Correlation Covid-related Variables", fontsize=18) plt.show() plt.savefig(fname="./Plots/corrmatrix.png", dpi=1080) plt.close() # gee models fam = sm.families.Gaussian() ind = sm.cov_struct.Exchangeable() mod_gee = smf.gee( "CFR_log ~ stringency_index + population_density + median_age + aged_65_older + aged_70_older + gdp_per_capita + cardiovasc_death_rate + diabetes_prevalence + female_smokers + male_smokers + hospital_beds_per_thousand + life_expectancy + human_development_index", "location", data=data, cov_struct=ind, family=fam) result_gee = mod_gee.fit() print(result_gee.summary()) mod_gee2 = smf.gee( "case_fatality_ratio ~ stringency_index + population_density + median_age + aged_65_older + aged_70_older + gdp_per_capita + cardiovasc_death_rate + diabetes_prevalence + female_smokers + male_smokers + hospital_beds_per_thousand + life_expectancy + human_development_index", "location", data=data, cov_struct=ind, family=fam) result_gee2 = mod_gee2.fit() print(result_gee2.summary()) mod_gee3 = smf.gee(
def fit(self, continuous_distribution='gaussian'):
    """Fit the specified marginal structural model using the calculated inverse probability
    of treatment weights.

    The final weight is the product of the treatment weights, the missing-data
    weights (when present) and the user-supplied weight column (when present).
    Rows with missing values are dropped before fitting. Every model is a GEE
    with independence covariance, grouped on the dataframe index.

    For a continuous outcome ``self.average_treatment_effect`` is filled. For a
    binary outcome ``self.risk_difference``, ``self.risk_ratio`` and
    ``self.odds_ratio`` are filled, using identity, log and default-link
    binomial models respectively.

    Parameters
    ----------
    continuous_distribution : str, optional
        Distribution for continuous outcomes: 'gaussian' (or 'normal') or
        'poisson'. Only used when the outcome is continuous

    Raises
    ------
    ValueError
        If no denominator model or no marginal structural model has been
        specified, or if the continuous distribution is not supported
    """
    if self.__mdenom is None:
        raise ValueError('No model has been fit to generated predicted probabilities')
    if self.ms_model is None:
        raise ValueError('No marginal structural model has been specified')
    if self._miss_flag and not self._fit_missing_:
        warnings.warn("All missing outcome data is assumed to be missing completely at random. To relax this "
                      "assumption to outcome data is missing at random please use the `missing_model()` "
                      "function", UserWarning)

    independence = sm.cov_struct.Independence()
    full_msm = self.outcome + ' ~ ' + self.ms_model

    # Full weight: treatment weights, times missingness weights, times sampling weights
    df = self.df.copy()
    full_weight = self.iptw
    if self.ipmw is not None:
        full_weight = full_weight * self.ipmw
    if self._weight_ is not None:
        full_weight = full_weight * self.df[self._weight_]
    df['_ipfw_'] = full_weight
    df = df.dropna()

    def _fit_table(family, estimate_label, se_label, back_transform=None):
        # Fit the weighted GEE and tabulate estimate, standard error and 95% limits,
        # optionally back-transforming the estimate and limits (never the standard error)
        fm = smf.gee(full_msm, df.index, df, cov_struct=independence,
                     family=family, weights=df['_ipfw_']).fit()
        estimate = np.asarray(fm.params)
        lower = np.asarray(fm.conf_int()[0])
        upper = np.asarray(fm.conf_int()[1])
        if back_transform is not None:
            estimate = back_transform(estimate)
            lower = back_transform(lower)
            upper = back_transform(upper)

        table = pd.DataFrame()
        table['labels'] = np.asarray(fm.params.index)
        table.set_index(keys=['labels'], inplace=True)
        table[estimate_label] = estimate
        table[se_label] = np.asarray(fm.bse)
        table['95%LCL'] = lower
        table['95%UCL'] = upper
        return table

    if self._continuous_outcome:
        if continuous_distribution in ('gaussian', 'normal'):
            family = sm.families.family.Gaussian()
        elif continuous_distribution == 'poisson':
            family = sm.families.family.Poisson()
        else:
            raise ValueError("Only 'gaussian' and 'poisson' distributions are supported")
        self._continuous_y_type = continuous_distribution
        self.average_treatment_effect = _fit_table(family, 'ATE', 'SE(ATE)')
        return

    # Ignoring DomainWarnings from statsmodels
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', DomainWarning)

        # Estimating Risk Difference
        identity_binomial = sm.families.family.Binomial(sm.families.links.identity())
        self.risk_difference = _fit_table(identity_binomial, 'RD', 'SE(RD)')

        # Estimating Risk Ratio
        log_binomial = sm.families.family.Binomial(sm.families.links.log())
        self.risk_ratio = _fit_table(log_binomial, 'RR', 'SE(log(RR))', np.exp)

        # Estimating Odds Ratio
        logit_binomial = sm.families.family.Binomial()
        self.odds_ratio = _fit_table(logit_binomial, 'OR', 'SE(log(OR))', np.exp)
# --- Autocorrelated-error regression on the globwarm data ---
# Rows with missing values are dropped, then nhtemp is regressed on
# columns 1-8 (positional slice 1:9) plus a constant.
globwarm = globwarm.dropna()
X = sm.add_constant(globwarm.iloc[:, 1:9])
gmod = sm.GLSAR(globwarm.nhtemp, X, rho=1)
res = gmod.iterative_fit(maxiter=6)
gmod.rho  # bare expression: only displays a value in an interactive/notebook session

# The same estimation written out by hand: refit, re-estimate rho from the
# residuals with Yule-Walker, rebuild the model, six times.
# NOTE(review): the loop extent was reconstructed from a flattened line - confirm.
gmod = sm.GLSAR(globwarm.nhtemp, X, rho=1)
for i in range(6):
    results = gmod.fit()
    print("AR coefficients: {0}".format(gmod.rho))
    rho, sigma = sm.regression.yule_walker(results.resid, order=gmod.order)
    gmod = sm.GLSAR(globwarm.nhtemp, X, rho)

# --- Oat variety data: mixed model versus GEE, both grouped by block ---
oatvar = pd.read_csv("oatvar.csv", index_col=0)
oatvar['variety'] = oatvar['variety'].astype('category')
oatvar['grams'] = oatvar['yield']  # copy of 'yield' under a new name, used in the formulas below
oatvar.head()
mmod = smf.mixedlm("grams ~ variety", oatvar, groups=oatvar['block']).fit()
mmod.summary()
ind = sm.cov_struct.Exchangeable()
gmod = smf.gee("grams ~ variety", "block", oatvar, cov_struct=ind).fit()
gmod.summary()
ind.summary()

fpe = pd.read_csv("fpe.csv", index_col=0)
fpe.head()
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Load the 'epil' dataset from the R package MASS
data = sm.datasets.get_rdataset('epil', package='MASS').data

# Poisson family with an exchangeable covariance structure
# (note: the variable is named `ind` although it holds an Exchangeable structure)
fam = sm.families.Poisson()
ind = sm.cov_struct.Exchangeable()

# Observations are grouped by the 'subject' column
mod = smf.gee("y ~ age + trt + base", "subject", data, cov_struct=ind, family=fam)
res = mod.fit()
print(res.summary())
rd_results.append(r_all - r_none) rr_results.append(r_all / r_none) print('RD 95% CI:', np.percentile(rd_results, q=[2.5, 97.5])) print('RR 95% CI:', np.percentile(rr_results, q=[2.5, 97.5])) #IPTW model = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0' df['iptw'] = ze.ipw.iptw(df, treatment='art', model_denominator=model, stabilized=True) ind = sm.cov_struct.Independence() f = sm.families.family.Binomial(sm.families.links.identity) linrisk = smf.gee('dead ~ art', df['id'], df, cov_struct=ind, family=f, weights=df['iptw']).fit() linrisk.summary() f = sm.families.family.Binomial(sm.families.links.log) log = smf.gee('dead ~ art', df['id'], df, cov_struct=ind, family=f, weights=df['iptw']).fit() log.summary() #Double-Robust sdr = SimpleDoubleRobust(df, exposure='art', outcome='dead') sdr.exposure_model( 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
def _sequential_regression(self, treatment, tmax): """Hidden function that executes the sequential regression estimation for g-formula """ # TODO allow option to include different estimation models for each time point or the same model if treatment == 'natural': # Thoughts: MC estimator needs natural course as a check. This should not apply to SR estimator raise ValueError( 'Natural course estimation is not clear to me with Sequential Regression Estimator. ' 'Therefore, "natural" is not implemented') # If custom treatment, it gets evaluated here g = self.gf if treatment not in ['all', 'none']: g['__indicator'] = np.where(eval(treatment), 1, 0) # Restricting based on tmax argument if tmax is None: pass elif tmax in list(self.gf[self.time_out].unique()): g = g.loc[g[self.time_out] <= tmax].copy() else: warnings.warn( "The t_max argument specifies a time that is not observed in the data. All times less than" "the specified t_max argument included in the estimation procedure", UserWarning) g = g.loc[g[self.time_out] <= tmax].copy() # Converting dataframe from long-to-wide for easier estimation column_labels = list( g.columns ) # Getting all column labels (important to match with formula) df = self._long_to_wide(df=g, id=self.idvar, t=self.time_out) linkdist = sm.families.family.Binomial() rt_points = sorted(list(g[self.time_out].unique()), reverse=True) # Getting all t's to backward loop t_points = sorted(list(g[self.time_out].unique()), reverse=False) # Getting all t's to forward loop # Checking for recurrent outcomes. Recurrent are not currently supported if pd.Series(df[[ self.outcome + '_' + str(t) for t in sorted(t_points, reverse=False) ]].sum(axis=1, skipna=True) > 1).any(): raise ValueError( 'Looks like your data has multiple outcomes. 
Recurrent outcomes are not currently ' 'supported') # Step 1: Creating indicator for individuals who followed counterfactual outcome treat_t_points = [] for t in t_points: # Following treatment strategy # alternative: if treat all, can do simple multiplication. if treat none, can do (1-A) simple multiplication if treatment == 'all': df['__indicator_' + str(t)] = np.where( df[self.exposure + '_' + str(t)] == 0, 0, np.nan) df['__indicator_' + str(t)] = np.where( df[self.exposure + '_' + str(t)] == 1, 1, df['__indicator_' + str(t)]) elif treatment == 'none': df['__indicator_' + str(t)] = np.where( df[self.exposure + '_' + str(t)] == 0, 1, np.nan) df['__indicator_' + str(t)] = np.where( df[self.exposure + '_' + str(t)] == 1, 0, df['__indicator_' + str(t)]) else: # custom exposure pattern pass treat_t_points.append('__indicator_' + str(t)) df['__check_' + str(t)] = df[treat_t_points + [self.outcome + '_' + str(t)]].prod( axis=1, skipna=True) # This following check carries forward the outcome under the counterfactual treatment if t_points.index(t) == 0: pass else: df['__check_' + str(t)] = np.where( df['__check_' + str(t_points[t_points.index(t) - 1])] == 1, 1, df['__check_' + str(t)]) # Step 2: Sequential Regression Estimation for t in rt_points: # 2.1) Relabel everything to match with the specified model (selecting out that timepoint is within) d_labels = {} for c in column_labels: d_labels[c + '_' + str(t)] = c g = df.filter(regex='_' + str(t)).rename( mapper=d_labels, axis=1).reset_index().copy() g[self.time_out] = t # 2.2) Fit the model to the observed data if rt_points.index(t) == 0: if self._weights is None: m = smf.glm(self.outcome + ' ~ ' + self._modelform, g, family=linkdist).fit() # GLM else: m = smf.gee(self.outcome + ' ~ ' + self._modelform, self.idvar, g, weights=df[self._weights + '_' + str(t)], family=linkdist).fit() # Weighted, so GEE if self._printseqregresults: print(m.summary()) else: # Uses previous predicted values to estimate g[self.outcome] = 
np.where( df['__pred_' + self.outcome + '_' + str(t_points[t_points.index(t) + 1])].isna(), g[self.outcome], df['__pred_' + self.outcome + '_' + str(t_points[t_points.index(t) + 1])]) if self._weights is None: m = smf.glm(self.outcome + ' ~ ' + self._modelform, g, family=linkdist).fit() # GLM else: m = smf.gee(self.outcome + ' ~ ' + self._modelform, self.idvar, g, weights=df[self._weights + '_' + str(t)], family=linkdist).fit() # Weighted, so GEE if self._printseqregresults: print(m.summary()) # 2.3) Getting Counterfactual Treatment Values if treatment == 'all': g[self.exposure] = 1 elif treatment == 'none': g[self.exposure] = 0 else: g[self.exposure] = np.where(eval(treatment), 1, 0) # Predicted values based on counterfactual treatment strategy from predicted model df['__pred_' + self.outcome + '_' + str(t)] = np.where( df[self.outcome + '_' + str(t)].isna(), np.nan, m.predict(g)) # If followed counterfactual treatment & had outcome, then always considered to have outcome past that t df['__cf_' + self.outcome + '_' + str(t)] = np.where( (df['__check_' + str(t)] == 1), 1, df['__pred_' + self.outcome + '_' + str(t)]) # Step 3) Returning estimated results if self._weights is None: return np.mean(df['__pred_' + self.outcome + '_' + str(t_points[0])]) else: return np.average( df['__pred_' + self.outcome + '_' + str(t_points[0])], weights=df[self._weights + '_' + str(t_points[0])])
def add_covariate_model(self, label, covariate, model, restriction=None, recode=None, var_type='binary',
                        print_results=True):
    """Fit a regression model for one time-varying confounder and store it on the estimator.

    Covariate models are optional, unlike the exposure and outcome models. Call this method once per
    time-varying covariate, giving each call its own label and regression equation. These models are only
    used by the Monte Carlo g-formula; the sequential regression estimator only requires the outcome model.

    Model selection depends on whether weights were supplied (``self._weights``):

    * unweighted, binary: binomial GLM
    * unweighted, continuous: ``smf.gls`` without a covariance structure
    * weighted, binary or continuous: GEE grouped by ``self.idvar`` with a binomial or Gaussian family

    Parameters
    ----------
    label : int
        Integer label for the covariate model. Covariate models are fit in ascending order within
        TimeVaryGFormula
    covariate : str
        Column label of the time-varying confounder to be predicted
    model : str
        Right-hand side of the patsy formula, built from columns of the input pandas DataFrame.
        For example) 'var1 + var2 + var3 + var4'
    restriction : str, optional
        Expression used to restrict the population the model is fit to. Useful for Intent-to-Treat model
        fitting. The expression is passed to ``eval`` and must refer to the DataFrame as 'g'.
        For example) "g['art']==1"
    recode : str, optional
        Code string stored alongside the model so functional forms can be recreated later on. The DataFrame
        must be referred to as 'g' and each statement should end with ';'.
        For example) "g['age_sq'] = g['age']**2;"
    var_type : str, optional
        Type of the covariate. Either 'binary' (default) or 'continuous'
    print_results : bool, optional
        Whether to print the fitted model summary to the terminal. Default is True

    Raises
    ------
    ValueError
        If ``label`` is not an integer, or ``var_type`` is neither 'binary' nor 'continuous'
    """
    # bool is a subclass of int, but True/False are not meaningful ordering labels
    if isinstance(label, bool) or not isinstance(label, int):
        raise ValueError('Label must be an integer')
    # Fail fast on an unsupported type, before any data is copied or any model is fit
    if var_type not in ('binary', 'continuous'):
        raise ValueError('Only binary or continuous covariates are currently supported')

    # Building predictive model. The local name must stay `g`: `restriction` refers to it
    g = self.gf.copy()
    if restriction is not None:
        # NOTE(review): eval() runs arbitrary code -- `restriction` must only come from trusted input
        g = g.loc[eval(restriction)].copy()

    formula = covariate + ' ~ ' + model
    if self._weights is None:  # Unweighted g-formula
        if var_type == 'binary':
            m = smf.glm(formula, g, family=sm.families.family.Binomial())
        else:
            m = smf.gls(formula, g)
    else:  # Weighted g-formula
        # Family defaults are used (logit for Binomial, identity for Gaussian); passing the link
        # *class* explicitly is deprecated in newer statsmodels releases
        if var_type == 'binary':
            linkdist = sm.families.family.Binomial()
        else:
            linkdist = sm.families.family.Gaussian()
        m = smf.gee(formula, self.idvar, g, weights=g[self._weights], family=linkdist)

    f = m.fit()
    if print_results:
        print(f.summary())

    # Adding to lists, it is used to predict variables later on for the time-varying...
    self._covariate_models.append(f)
    self._covariate_model_index.append(label)
    self._covariate.append(covariate)
    self._covariate_type.append(var_type)
    # Must be string for exec() to use later
    self._covariate_recode.append('None' if recode is None else recode)
# NOTE(review): indentation was lost in this fragment; the nesting below is reconstructed from the
# statements themselves. `renshu`, `key`, `data_temp`, `tmp` and `zu` are defined before this point --
# confirm the loop level against the enclosing code.
# Expand the data into long format: one row per subject (group `zu`) per time point.
for j in range(0, renshu):
    zu += 1
    for shij in range(0, 3):
        # 'N' at a time point is coded as 1, anything else as 0
        zhi = 1 if key[shij] == 'N' else 0
        temp = pd.DataFrame([{'周数': shij, '值': zhi, '组': zu}])
        temp['诊断严重程度'] = data_temp['诊断严重程度']
        temp['治疗'] = data_temp['治疗']
        # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
        tmp = pd.concat([tmp, temp])
# tmp.to_csv('D:/结果数据_抑郁症治疗.csv',encoding='gbk')

# Recode the categorical columns to 0/1 indicators
tmp['诊断严重程度'] = tmp['诊断严重程度'].replace({'轻微': 0, '严重': 1})
tmp['治疗'] = tmp['治疗'].replace({'标准': 0, '新药': 1})
tmp = tmp.reset_index()
del tmp['index']

va = sm.cov_struct.Autoregressive()
fam = sm.families.Binomial()
ind = sm.cov_struct.Independence()
# Matches the result reported in the book
mod = smf.gee("值 ~ 诊断严重程度 + 治疗 + 周数+治疗:周数", "组", tmp, cov_struct=ind, family=fam)
res = mod.fit()
res.summary()

# 2. Multinomial GEE -- presumably NominalGEE
# 3. Ordinal -- OrdinalGEE
def add_covariate_model(self, label, covariate, model, restriction=None, recode=None, var_type='binary',
                        print_results=True):
    """
    Build the model for the specified covariate. This is to deal with time-varying confounders. Does NOT
    have to be specified, unlike the exposure and outcome models. The order in which these models are fit
    is based on the provided integer labels

    Without weights (self._weights is None) a binomial GLM is fit for a binary covariate and smf.gls for a
    continuous one. With weights, a GEE grouped by self.idvar is fit with a binomial or Gaussian family.

    Input:

    label:
        -integer label for the covariate model. Covariate models are fit in ascending order within
         TimeVaryGFormula
    covariate:
        -variable to be predicted
    model:
        -variables to include in the model for predicting the covariate. Must be contained within the
         input pandas dataframe when initialized. Format is the same as the functional form,
         i.e. 'var1 + var2 + var3 + var4'
    restriction:
        -used to restrict the population to fit the regression model to. Useful for Intent-to-Treat model
         fitting. The expression is passed to eval() and the pandas dataframe must be referred to as 'g'
         Example) "g['art']==1"
    recode:
        -code string stored with the model so that functional forms can be recreated later on. For
         example, to get a cubic term for age, set recode="g['age_cu'] = g['age']**3;"
         'g' must be specified as the data frame object and each statement should end with ';'
    var_type:
        -type of variable that the covariate is. Current options include 'binary' or 'continuous'
    print_results:
        -whether to print the fitted regression results to the terminal. Default is True

    Raises ValueError if label is not an integer or var_type is not 'binary' or 'continuous'
    """
    # bool is a subclass of int, but True/False are not meaningful ordering labels
    if isinstance(label, bool) or not isinstance(label, int):
        raise ValueError('Label must be an integer')
    # Reject unsupported covariate types up front, before copying data or fitting anything
    if var_type not in ('binary', 'continuous'):
        raise ValueError('Only binary or continuous covariates are currently supported')

    # Building predictive model. The dataframe must stay bound to `g` because `restriction` refers to it
    g = self.gf.copy()
    if restriction is not None:
        # NOTE(review): eval() runs arbitrary code -- `restriction` must only come from trusted input
        g = g.loc[eval(restriction)].copy()

    equation = covariate + ' ~ ' + model
    # The family defaults already are logit (Binomial) and identity (Gaussian); handing over the
    # lowercase link classes explicitly is deprecated in newer statsmodels releases
    if self._weights is None:  # Unweighted g-formula
        if var_type == 'binary':
            m = smf.glm(equation, g, family=sm.families.family.Binomial())
        else:
            m = smf.gls(equation, g)
    else:  # Weighted g-formula
        if var_type == 'binary':
            linkdist = sm.families.family.Binomial()
        else:
            linkdist = sm.families.family.Gaussian()
        m = smf.gee(equation, self.idvar, g, weights=g[self._weights], family=linkdist)

    f = m.fit()
    if print_results:
        print(f.summary())

    # Adding to lists, it is used to predict variables later on for the time-varying...
    self._covariate_models.append(f)
    self._covariate_model_index.append(label)
    self._covariate.append(covariate)
    self._covariate_type.append(var_type)
    # Must be string for exec() to use later
    self._covariate_recode.append('None' if recode is None else recode)