def test_error_for_bad_nknots(self, spline_data):
    """Check that ``spline`` rejects every unsupported ``n_knots`` value.

    The values tried are a non-integer (1.5), zero, a negative count (-1)
    and 8; each call is expected to raise ``ValueError``.
    """
    for bad_n_knots in (1.5, 0, -1, 8):
        with pytest.raises(ValueError):
            spline_data['sp'] = spline(spline_data, 'v', n_knots=bad_n_knots)
def test_error_for_unequal_numbers(self, spline_data):
    """Check that ``spline`` raises ``ValueError`` when ``n_knots`` does not match ``len(knots)``.

    Two knot locations are supplied while ``n_knots`` is first too small (1)
    and then too large (3).
    """
    for mismatched_count in (1, 3):
        with pytest.raises(ValueError):
            # A fresh list is built for every call, exactly two knots each time.
            spline_data['sp'] = spline(spline_data, 'v', n_knots=mismatched_count, knots=[1, 3])
def mcf(self):
    """Return the sample data with spline terms added and ``dead`` removed.

    The frame comes from ``ze.load_sample_data(False)``. Two restricted
    spline columns (3 knots, ``term=2``) are appended for ``cd40`` and then
    for ``age0`` before the ``dead`` column is dropped.
    """
    data = ze.load_sample_data(False)
    # Keep this order: it determines the column layout of the returned frame.
    spline_plan = (
        (['cd4_rs1', 'cd4_rs2'], 'cd40'),
        (['age_rs1', 'age_rs2'], 'age0'),
    )
    for target_cols, source_col in spline_plan:
        data[target_cols] = ze.spline(data, source_col, n_knots=3, term=2, restricted=True)
    return data.drop(columns=['dead'])
def sdata():
    """Return the sample data with spline terms added and ``cd4_wk45`` removed.

    The frame comes from ``load_sample_data(False)``. Restricted spline
    columns (3 knots, ``term=2``) are added for ``cd40`` and ``age0``.
    """
    frame = load_sample_data(False)
    spline_kwargs = dict(n_knots=3, term=2, restricted=True)
    frame[['cd4_rs1', 'cd4_rs2']] = spline(frame, 'cd40', **spline_kwargs)
    frame[['age_rs1', 'age_rs2']] = spline(frame, 'age0', **spline_kwargs)
    return frame.drop(columns=['cd4_wk45'])
def causal_check():
    """Visual check of the IPTW and SurvivalGFormula plotting routines.

    Every figure is displayed with ``plt.show()``; nothing is returned.
    """
    # ---- IPTW plots ----
    data = load_sample_data(False)
    spline_opts = dict(n_knots=3, term=2, restricted=True)
    data[['cd4_rs1', 'cd4_rs2']] = spline(data, 'cd40', **spline_opts)
    data[['age_rs1', 'age_rs2']] = spline(data, 'age0', **spline_opts)

    ipt = IPTW(data, treatment='art', stabilized=True)
    ipt.regression_models('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
    ipt.fit()

    ipt.plot_love()
    plt.tight_layout()
    plt.show()
    # KDE first, then box plot; each one on the default scale and on the logit scale.
    for draw in (ipt.plot_kde, ipt.plot_boxplot):
        draw()
        plt.show()
        draw(measure='logit')
        plt.show()

    # ---- SurvivalGFormula plots ----
    surv = load_sample_data(False).drop(columns=['cd4_wk45'])
    surv['t'] = np.round(surv['t']).astype(int)
    # Repeat each row ``t`` times, then number the copies 1..t within each id.
    surv = pd.DataFrame(np.repeat(surv.values, surv['t'], axis=0), columns=surv.columns)
    surv['t'] = surv.groupby('id')['t'].cumcount() + 1
    # The event indicator is 1 only on the final row of an id that died, 0 elsewhere.
    died = surv['dead'] == 1
    last_row_of_id = surv['id'] != surv['id'].shift(-1)
    surv.loc[(died & last_row_of_id), 'd'] = 1
    surv['d'] = surv['d'].fillna(0)
    surv['t_sq'] = surv['t']**2
    surv['t_cu'] = surv['t']**3

    sgf = SurvivalGFormula(surv, idvar='id', exposure='art', outcome='d', time='t')
    sgf.outcome_model(model='art + male + age0 + cd40 + dvl0 + t + t_sq + t_cu')
    sgf.fit(treatment='all')
    sgf.plot()
    plt.show()
    sgf.plot(c='r', linewidth=3, alpha=0.8)
    plt.show()
def test_higher_order_spline(self, spline_data):
    """Unrestricted two-knot spline (knots 10 and 16) with a non-integer power of 3.7.

    The expected columns are the distances above each knot raised to that
    power, and 0.0 where the expected value is zero.
    """
    power = 3.7
    columns = ['sp1', 'sp2']
    spline_data[columns] = spline(spline_data, 'v', n_knots=2, knots=[10, 16],
                                  term=power, restricted=False)
    expected = pd.DataFrame({
        'sp1': [0.0, 0.0, 0.0, 5.0**power, 10.0**power],
        'sp2': [0.0, 0.0, 0.0, 0.0, 4.0**power],
    })
    pdt.assert_frame_equal(spline_data[columns], expected[columns])
def test_auto_knots1(self, spline_data):
    """Unrestricted linear spline with one automatically placed knot.

    Only the last two rows are expected to be non-zero (5.0 and 10.0).
    """
    spline_data['sp'] = spline(spline_data, 'v', n_knots=1, restricted=False)
    expected = pd.Series([0.0, 0.0, 0.0, 5.0, 10.0], name='sp')
    pdt.assert_series_equal(spline_data['sp'], expected)
def test_restricted_spline3(self, spline_data):
    """Restricted quadratic spline with user-supplied knots at 5 and 16.

    Each expected value is written as the squared distance above the first
    knot minus the squared distance above the last knot (0 when that second
    part does not apply).
    """
    spline_data['rsp'] = spline(spline_data, 'v', n_knots=2, knots=[5, 16], term=2, restricted=True)
    expected_values = [
        0.0,
        0.0,
        (10.0 - 5.0)**2 - 0,
        (15.0 - 5.0)**2 - 0,
        (20.0 - 5.0)**2 - (20.0 - 16.0)**2,
    ]
    expected = pd.DataFrame({'rsp': expected_values})
    pdt.assert_series_equal(spline_data['rsp'], expected['rsp'])
def test_restricted_spline1(self, spline_data):
    """Restricted spline with knots at 10 and 16 and the default ``term``.

    The expected column is 0.0 for the first three rows, then 5.0 and 6.0.
    """
    spline_data['rsp'] = spline(spline_data, 'v', n_knots=2, knots=[10, 16], restricted=True)
    expected = pd.Series([0.0, 0.0, 0.0, 5.0, 6.0], name='rsp')
    pdt.assert_series_equal(spline_data['rsp'], expected)
def test_cubic_spline1(self, spline_data):
    """Unrestricted cubic spline with a single knot at 16.

    Only the final row is expected to be non-zero, with value ``4.0**3``.
    """
    spline_data['sp'] = spline(spline_data, 'v', n_knots=1, knots=[16], term=3, restricted=False)
    expected = pd.DataFrame({'sp': [0.0, 0.0, 0.0, 0.0, 4.0**3]})
    pdt.assert_series_equal(spline_data['sp'], expected['sp'])
def test_auto_knots2(self, spline_data):
    """Unrestricted linear spline with two automatically placed knots.

    The expected values are written relative to knots at 20/3 and 40/3.
    """
    columns = ['sp1', 'sp2']
    spline_data[columns] = spline(spline_data, 'v', n_knots=2, restricted=False)

    # Knot positions that the expected values below are measured from.
    lower_knot = 20/3
    upper_knot = 40/3
    expected = pd.DataFrame({
        'sp1': [0.0, 0.0, 10 - lower_knot, 15 - lower_knot, 20 - lower_knot],
        'sp2': [0.0, 0.0, 0.0, 15 - upper_knot, 20 - upper_knot],
    })
    pdt.assert_frame_equal(spline_data[columns], expected[columns])
def causal_check():
    """Visual check (step 9) of the IPTW diagnostic plots.

    Fits stabilized IPTW for ``art`` on the sample data and shows the love
    plot, the KDE plots and the box plots one figure at a time.
    """
    data = load_sample_data(False)
    data[['cd4_rs1', 'cd4_rs2']] = spline(data, 'cd40', n_knots=3, term=2, restricted=True)
    data[['age_rs1', 'age_rs2']] = spline(data, 'age0', n_knots=3, term=2, restricted=True)

    weights = IPTW(data, treatment='art', stabilized=True)
    weights.regression_models('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
    weights.fit()

    # The love plot is the only figure that gets tight_layout() before showing.
    weights.plot_love()
    plt.tight_layout()
    plt.show()

    remaining_plots = [
        (weights.plot_kde, {}),
        (weights.plot_kde, {'measure': 'logit'}),
        (weights.plot_boxplot, {}),
        (weights.plot_boxplot, {'measure': 'logit'}),
    ]
    for draw, options in remaining_plots:
        draw(**options)
        plt.show()
def mc_gformula_check():
    """Run the time-varying g-formula on the sample data and plot the result.

    Builds lagged, spline and polynomial covariates, fits a
    ``TimeVaryGFormula`` with exposure, outcome and two covariate models, and
    then plots ``1 - survival`` from Kaplan-Meier fits of the simulated data
    (labelled 'Natural') and of the input data (labelled 'True').
    """
    df = load_sample_data(timevary=True)

    # Lagged covariates: previous row's value, except on each id's first row,
    # where the listed baseline (0 for art, the baseline column otherwise) is used.
    is_first_row = df.groupby('id').cumcount() == 0
    lag_plan = (
        ('lag_art', 'art', None),
        ('lag_cd4', 'cd4', 'cd40'),
        ('lag_dvl', 'dvl', 'dvl0'),
    )
    for lag_col, current_col, baseline_col in lag_plan:
        first_value = 0 if baseline_col is None else df[baseline_col]
        df[lag_col] = np.where(is_first_row, first_value, df[current_col].shift(1))

    # Age spline
    df[['age_rs0', 'age_rs1', 'age_rs2']] = spline(df, 'age0', n_knots=4, term=2, restricted=True)

    # Squared and cubed terms for baseline cd4, current cd4 and entry time.
    polynomial_plan = (
        ('cd40', 'cd40_sq', 'cd40_cu'),
        ('cd4', 'cd4_sq', 'cd4_cu'),
        ('enter', 'enter_sq', 'enter_cu'),
    )
    for base_col, squared_col, cubed_col in polynomial_plan:
        df[squared_col] = df[base_col] ** 2
        df[cubed_col] = df[base_col] ** 3

    g = TimeVaryGFormula(df, idvar='id', exposure='art', outcome='dead', time_in='enter', time_out='out')

    exp_m = ('male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + '
             'cd4 + cd4_sq + cd4_cu + dvl + enter + enter_sq + enter_cu')
    g.exposure_model(exp_m, restriction="g['lag_art']==0")

    out_m = ('art + male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + '
             'cd4 + cd4_sq + cd4_cu + dvl + enter + enter_sq + enter_cu')
    g.outcome_model(out_m, restriction="g['drop']==0")

    dvl_m = ('male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + '
             'lag_cd4 + lag_dvl + lag_art + enter + enter_sq + enter_cu')
    g.add_covariate_model(label=1, covariate='dvl', model=dvl_m, var_type='binary')

    cd4_m = ('male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + '
             'lag_cd4 + lag_dvl + lag_art + enter + enter_sq + enter_cu')
    # Recode statements are handed over as one ';'-separated string.
    cd4_recode_scheme = ";".join([
        "g['cd4'] = np.maximum(g['cd4'],1)",
        "g['cd4_sq'] = g['cd4']**2",
        "g['cd4_cu'] = g['cd4']**3",
    ])
    g.add_covariate_model(label=2, covariate='cd4', model=cd4_m, recode=cd4_recode_scheme,
                          var_type='continuous')

    time_recode = ";".join([
        "g['enter_sq'] = g['enter']**2",
        "g['enter_cu'] = g['enter']**3",
    ])
    g.fit(treatment="((g['art']==1) | (g['lag_art']==1))",
          lags={'art': 'lag_art', 'cd4': 'lag_cd4', 'dvl': 'lag_dvl'},
          sample=10000,
          t_max=None,
          in_recode=time_recode)

    # Keep only the last row of every simulated unit.
    simulated = g.predicted_outcomes
    unit_id = simulated['uid_g_zepid']
    simulated_last = simulated.loc[unit_id != unit_id.shift(-1)].copy()

    km_natural = KaplanMeierFitter()
    km_natural.fit(durations=simulated_last['out'], event_observed=simulated_last['dead'])
    km_observed = KaplanMeierFitter()
    km_observed.fit(durations=df['out'], event_observed=df['dead'], entry=df['enter'])

    for fitter, colour, label in ((km_natural, 'g', 'Natural'), (km_observed, 'k', 'True')):
        plt.step(fitter.event_table.index, 1 - fitter.survival_function_, c=colour, where='post', label=label)
    plt.legend()
    plt.show()
def test_complete_mc_procedure_completes(self):
    """End-to-end run of ``MonteCarloGFormula`` on the time-varying sample data.

    Specifies exposure, outcome, two covariate models and a censoring model,
    fits with 5000 samples, and only asserts that ``predicted_outcomes`` is a
    pandas ``DataFrame``.
    """
    df = load_sample_data(timevary=True)

    # Lagged covariates: previous row's value, except on each id's first row,
    # where the listed baseline (0 for art, the baseline column otherwise) is used.
    starts_new_id = df.groupby('id').cumcount() == 0
    for lagged, source, baseline in (('lag_art', 'art', None),
                                     ('lag_cd4', 'cd4', 'cd40'),
                                     ('lag_dvl', 'dvl', 'dvl0')):
        initial = 0 if baseline is None else df[baseline]
        df[lagged] = np.where(starts_new_id, initial, df[source].shift(1))

    # Age spline
    df[['age_rs0', 'age_rs1', 'age_rs2']] = spline(df, 'age0', n_knots=4, term=2, restricted=True)

    # Squared and cubed terms, in the same column order for every variable.
    for variable, squared, cubed in (('cd40', 'cd40_sq', 'cd40_cu'),
                                     ('cd4', 'cd4_sq', 'cd4_cu'),
                                     ('enter', 'enter_sq', 'enter_cu')):
        df[squared] = df[variable] ** 2
        df[cubed] = df[variable] ** 3

    g = MonteCarloGFormula(df, idvar='id', exposure='art', outcome='dead', time_in='enter', time_out='out')

    exp_m = ('male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + '
             'cd4 + cd4_sq + cd4_cu + dvl + enter + enter_sq + enter_cu')
    g.exposure_model(exp_m, restriction="g['lag_art']==0")

    out_m = ('art + male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + '
             'cd4 + cd4_sq + cd4_cu + dvl + enter + enter_sq + enter_cu')
    g.outcome_model(out_m, restriction="g['drop']==0")

    dvl_m = ('male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + '
             'lag_cd4 + lag_dvl + lag_art + enter + enter_sq + enter_cu')
    g.add_covariate_model(label=1, covariate='dvl', model=dvl_m, var_type='binary')

    cd4_m = ('male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + '
             'lag_cd4 + lag_dvl + lag_art + enter + enter_sq + enter_cu')
    # Recode statements are handed over as one ';'-separated string.
    cd4_recode_scheme = ";".join([
        "g['cd4'] = np.maximum(g['cd4'],1)",
        "g['cd4_sq'] = g['cd4']**2",
        "g['cd4_cu'] = g['cd4']**3",
    ])
    g.add_covariate_model(label=2, covariate='cd4', model=cd4_m, recode=cd4_recode_scheme,
                          var_type='continuous')

    # The line break before the final term is part of this formula's text.
    cens_m = ('male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + '
              'lag_cd4 + lag_dvl + lag_art + enter + enter_sq + \n'
              'enter_cu')
    g.censoring_model(cens_m)

    time_recode = ";".join([
        "g['enter_sq'] = g['enter']**2",
        "g['enter_cu'] = g['enter']**3",
    ])
    g.fit(treatment="((g['art']==1) | (g['lag_art']==1))",
          lags={'art': 'lag_art', 'cd4': 'lag_cd4', 'dvl': 'lag_dvl'},
          sample=5000,
          t_max=None,
          in_recode=time_recode)

    assert isinstance(g.predicted_outcomes, pd.DataFrame)
######################################### # Causal Survival Analysis from zepid import load_sample_data, spline from zepid.causal.gformula import SurvivalGFormula df = load_sample_data(False).drop(columns=['cd4_wk45']) df['t'] = np.round(df['t']).astype(int) df = pd.DataFrame(np.repeat(df.values, df['t'], axis=0), columns=df.columns) df['t'] = df.groupby('id')['t'].cumcount() + 1 df.loc[((df['dead'] == 1) & (df['id'] != df['id'].shift(-1))), 'd'] = 1 df['d'] = df['d'].fillna(0) # Spline terms df[['t_rs1', 't_rs2', 't_rs3']] = spline(df, 't', n_knots=4, term=2, restricted=True) df[['cd4_rs1', 'cd4_rs2']] = spline(df, 'cd40', n_knots=3, term=2, restricted=True) df[['age_rs1', 'age_rs2']] = spline(df, 'age0', n_knots=3, term=2, restricted=True) sgf = SurvivalGFormula(df.drop(columns=['dead']), idvar='id',
def causal_check():
    """Visual check of the diagnostics and plots of the causal estimators.

    Runs, in order, TimeFixedGFormula diagnostics, the IPTW plots and
    diagnostics, the AIPTW and TMLE diagnostics and plots, and finally the
    SurvivalGFormula plots. Figures are displayed with ``plt.show()``;
    nothing is returned.
    """
    exposure_formula = 'male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'
    outcome_formula = 'art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0'

    data = load_sample_data(False).drop(columns=['cd4_wk45'])
    spline_opts = dict(n_knots=3, term=2, restricted=True)
    data[['cd4_rs1', 'cd4_rs2']] = spline(data, 'cd40', **spline_opts)
    data[['age_rs1', 'age_rs2']] = spline(data, 'age0', **spline_opts)

    # ---- TimeFixedGFormula diagnostics ----
    g = TimeFixedGFormula(data, exposure='art', outcome='dead')
    g.outcome_model(model=outcome_formula)
    g.run_diagnostics(decimal=3)

    # ---- IPTW plots ----
    ipt = IPTW(data, treatment='art', outcome='dead')
    ipt.treatment_model(exposure_formula, stabilized=True)
    ipt.marginal_structural_model('art')
    ipt.fit()
    ipt.plot_love()
    plt.tight_layout()
    plt.show()
    # KDE first, then box plot; each one on the default scale and on the logit scale.
    for draw in (ipt.plot_kde, ipt.plot_boxplot):
        draw()
        plt.show()
        draw(measure='logit')
        plt.show()
    ipt.run_diagnostics()

    # ---- AIPTW, then TMLE: the same sequence of calls for both ----
    for estimator_class in (AIPTW, TMLE):
        estimator = estimator_class(data, exposure='art', outcome='dead')
        estimator.exposure_model(exposure_formula)
        estimator.outcome_model(outcome_formula)
        estimator.fit()
        estimator.run_diagnostics()
        for model_to_plot in ('exposure', 'outcome'):
            estimator.plot_kde(to_plot=model_to_plot)
            plt.show()
        estimator.plot_love()
        plt.show()

    # ---- SurvivalGFormula plots ----
    surv = load_sample_data(False).drop(columns=['cd4_wk45'])
    surv['t'] = np.round(surv['t']).astype(int)
    # Repeat each row ``t`` times, then number the copies 1..t within each id.
    surv = pd.DataFrame(np.repeat(surv.values, surv['t'], axis=0), columns=surv.columns)
    surv['t'] = surv.groupby('id')['t'].cumcount() + 1
    # The event indicator is 1 only on the final row of an id that died, 0 elsewhere.
    died = surv['dead'] == 1
    last_row_of_id = surv['id'] != surv['id'].shift(-1)
    surv.loc[(died & last_row_of_id), 'd'] = 1
    surv['d'] = surv['d'].fillna(0)
    surv['t_sq'] = surv['t']**2
    surv['t_cu'] = surv['t']**3

    sgf = SurvivalGFormula(surv, idvar='id', exposure='art', outcome='d', time='t')
    sgf.outcome_model(model='art + male + age0 + cd40 + dvl0 + t + t_sq + t_cu')
    sgf.fit(treatment='all')
    sgf.plot()
    plt.show()
    sgf.plot(c='r', linewidth=3, alpha=0.8)
    plt.show()
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.families import family, links
import matplotlib.pyplot as plt

import zepid as ze
from zepid.causal.gformula import TimeFixedGFormula
from zepid.causal.doublyrobust import SimpleDoubleRobust

# Sample data plus restricted quadratic spline terms for age and CD4 count.
df = ze.load_sample_data(timevary=False)
df[['age_rs1', 'age_rs2']] = ze.spline(df, 'age0', term=2, restricted=True)
df[['cd4_rs1', 'cd4_rs2']] = ze.spline(df, 'cd40', term=2, restricted=True)

# Crude model: unadjusted risk ratio and risk difference for art on dead.
ze.RiskRatio(df, exposure='art', outcome='dead')
ze.RiskDiff(df, exposure='art', outcome='dead')

# Adjusted model: binomial GLM with identity link on the adjustment set.
model = 'art + male + age0 + cd40 + dvl0'
f = family.Binomial(links.identity)
linrisk = smf.glm('dead ~ {}'.format(model), df, family=f).fit()
linrisk.summary()

# Binomial GLM with log link; this one has ``art`` as its only term.
f = family.Binomial(links.log)
log = smf.glm('dead ~ art', df, family=f).fit()
log.summary()

# g-formula: outcome model with the spline terms, fitted under treat-all.
g = TimeFixedGFormula(df, exposure='art', outcome='dead')
g.outcome_model(model='art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
g.fit(treatment='all')
def test_error_for_bad_order(self, spline_data):
    """Check that ``spline`` raises ``ValueError`` for knots given out of order ([3, 1, 2])."""
    unordered_knots = [3, 1, 2]
    with pytest.raises(ValueError):
        spline_data['sp'] = spline(spline_data, 'v', n_knots=3, knots=unordered_knots)
import warnings import numpy as np import pandas as pd import statsmodels.api as sm from zepid import load_sample_data, spline ####################################################################################################################### # Binary Outcome ####################################################################################################################### df = load_sample_data(timevary=False) df = df.drop(columns=['cd4_wk45']) df[['cd4_rs1', 'cd4_rs2']] = spline(df, 'cd40', n_knots=3, term=2, restricted=True) df[['age_rs1', 'age_rs2']] = spline(df, 'age0', n_knots=3, term=2, restricted=True) ############################# # Naive Risk Difference from zepid import RiskDifference rd = RiskDifference() rd.fit(df, exposure='art', outcome='dead') rd.summary() ############################# # G-formula from zepid.causal.gformula import TimeFixedGFormula g = TimeFixedGFormula(df, exposure='art', outcome='dead') g.outcome_model(model='art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0',