def test_long_to_wide_conversion(self, longdata):
    """Check that ``_long_to_wide`` reshapes ``longdata`` into the wide layout.

    The expected result is a single-row frame indexed by ``id`` whose columns
    are ``A``, ``Y``, ``W`` and ``L`` suffixed with the time points 0, 1 and 2.
    Index and column names are not compared (``check_names=False``).
    """
    estimator = TimeVaryGFormula(longdata, idvar='id', exposure='A', outcome='Y', time_out='t',
                                 method='SequentialRegression')
    wide = estimator._long_to_wide(longdata, id='id', t='t')

    # Column order the converted frame is compared against.
    column_order = [
        'A_0', 'A_1', 'A_2',
        'Y_0', 'Y_1', 'Y_2',
        'W_0', 'W_1', 'W_2',
        'L_0', 'L_1', 'L_2',
    ]
    expected_row = {
        'A_0': 0, 'A_1': 1, 'A_2': 1,
        'Y_0': 0, 'Y_1': 0, 'Y_2': 1,
        'W_0': 5, 'W_1': 5, 'W_2': 5,
        'L_0': 25, 'L_1': 20, 'L_2': 31,
        'id': 1,
    }
    expected = pd.DataFrame.from_records([expected_row]).set_index('id')

    pdt.assert_frame_equal(wide, expected[column_order], check_names=False)
def test_error_covariate_label(self, sim_t_fixed_data):
    """``add_covariate_model`` must raise ``ValueError`` for ``label='first'``."""
    estimator = TimeVaryGFormula(
        sim_t_fixed_data,
        idvar='id',
        exposure='A',
        outcome='Y',
        time_out='t',
        time_in='t0',
    )
    # A string label is passed here; the call is expected to be rejected.
    with pytest.raises(ValueError):
        estimator.add_covariate_model(label='first', covariate='W1', model='W2')
def test_error_monte_carlo1(self, sim_t_fixed_data):
    """Constructing with the default method and no ``time_in`` must raise.

    NOTE(review): the outcome passed is ``'W1'``; which validation step is
    expected to trigger the ``ValueError`` is not visible here - confirm.
    """
    constructor_kwargs = dict(idvar='id', exposure='A', outcome='W1', time_out='t')
    with pytest.raises(ValueError):
        TimeVaryGFormula(sim_t_fixed_data, **constructor_kwargs)
def test_error_estimation_method(self, sim_t_fixed_data):
    """Constructing with ``method='Fail'`` must raise ``ValueError``."""
    constructor_kwargs = dict(idvar='id', exposure='A', outcome='W1', time_out='t', method='Fail')
    with pytest.raises(ValueError):
        TimeVaryGFormula(sim_t_fixed_data, **constructor_kwargs)
def test_error_continuous_outcome(self, sim_t_fixed_data):
    """Constructing with ``outcome='W1'`` and ``time_in='t0'`` must raise.

    NOTE(review): per the test name, ``W1`` is presumably a continuous
    column in the fixture - confirm against the fixture definition.
    """
    constructor_kwargs = dict(idvar='id', exposure='A', outcome='W1', time_out='t', time_in='t0')
    with pytest.raises(ValueError):
        TimeVaryGFormula(sim_t_fixed_data, **constructor_kwargs)
def test_error_sr_other_models(self, sim_t_fixed_data):
    """``fit`` must raise when an exposure model is given to SequentialRegression.

    Both an outcome model and an exposure model are specified before calling
    ``fit(treatment='all')``, which is expected to raise ``ValueError``.
    """
    estimator = TimeVaryGFormula(
        sim_t_fixed_data,
        idvar='id',
        exposure='A',
        outcome='Y',
        time_out='t',
        method='SequentialRegression',
    )
    estimator.outcome_model('A + W1_sq + W2 + W3', print_results=False)
    estimator.exposure_model('W1_sq', print_results=False)

    with pytest.raises(ValueError):
        estimator.fit(treatment='all')
def test_sr_custom_time_point(self, data):
    """Sequential regression with ``t_max=2`` reproduces the reference values.

    Fits under ``treatment="all"`` and then ``treatment="none"`` and compares
    ``predicted_outcomes`` with the reference numbers to ``rtol=1e-5``.
    """
    estimator = TimeVaryGFormula(data, idvar='id', exposure='A', outcome='Y', time_out='t',
                                 method='SequentialRegression')
    estimator.outcome_model('A + L', print_results=False)

    # values come from R's ltmle package
    reference = (
        ("all", 0.33492),
        ("none", 0.51228),
    )
    for plan, expected in reference:
        estimator.fit(treatment=plan, t_max=2)
        npt.assert_allclose(estimator.predicted_outcomes, expected, rtol=1e-5)
def test_sr_gap_time(self, data):
    """Sequential regression using ``time_out='t2'`` matches the expected values.

    Fits under ``treatment="all"`` and then ``treatment="none"`` and compares
    ``predicted_outcomes`` with the expected numbers to ``rtol=1e-5``.
    """
    estimator = TimeVaryGFormula(data, idvar='id', exposure='A', outcome='Y', time_out='t2',
                                 method='SequentialRegression')
    outcome_spec = 'A + L'
    estimator.outcome_model(outcome_spec, print_results=False)

    expectations = (
        ("all", 0.4051569),
        ("none", 0.661226),
    )
    for plan, expected in expectations:
        estimator.fit(treatment=plan)
        npt.assert_allclose(estimator.predicted_outcomes, expected, rtol=1e-5)
def test_error_treatment_type(self, sim_t_fixed_data):
    """``fit(treatment=1)`` must raise ``ValueError`` for both configurations.

    Checked first for an estimator built with ``time_in='t0'`` (default
    method) and then for one built with ``method='SequentialRegression'``.
    """
    default_method_estimator = TimeVaryGFormula(
        sim_t_fixed_data, idvar='id', exposure='A', outcome='Y', time_out='t', time_in='t0'
    )
    with pytest.raises(ValueError):
        default_method_estimator.fit(treatment=1)

    sequential_estimator = TimeVaryGFormula(
        sim_t_fixed_data, idvar='id', exposure='A', outcome='Y', time_out='t',
        method='SequentialRegression'
    )
    with pytest.raises(ValueError):
        sequential_estimator.fit(treatment=1)
def test_sr_warning_outside_time_point(self, data):
    """``fit`` with ``t_max=6`` must emit a ``UserWarning`` and still estimate.

    After the warning, ``predicted_outcomes`` is compared with 0.33492 to
    ``rtol=1e-5``.
    """
    estimator = TimeVaryGFormula(
        data,
        idvar='id',
        exposure='A',
        outcome='Y',
        time_out='t2',
        method='SequentialRegression',
    )
    estimator.outcome_model('A + L', print_results=False)

    # Only the call expected to warn lives inside the context manager.
    with pytest.warns(UserWarning):
        estimator.fit(treatment="all", t_max=6)

    npt.assert_allclose(estimator.predicted_outcomes, 0.33492, rtol=1e-5)
def test_sr_custom_treatment(self, data):
    """Sequential regression accepts a treatment given as an expression string.

    The treatment ``"g['t'] != 2"`` is passed to ``fit`` and the resulting
    ``predicted_outcomes`` is compared with 0.48543 to ``rtol=1e-5``.
    """
    estimator = TimeVaryGFormula(
        data,
        idvar='id',
        exposure='A',
        outcome='Y',
        time_out='t2',
        method='SequentialRegression',
    )
    outcome_spec = 'A + L'
    estimator.outcome_model(outcome_spec, print_results=False)

    custom_plan = "g['t'] != 2"
    estimator.fit(treatment=custom_plan)

    npt.assert_allclose(estimator.predicted_outcomes, 0.48543, rtol=1e-5)
def test_monte_carlo_for_single_t(self, sim_t_fixed_data):
    """Monte Carlo estimate agrees with ``TimeFixedGFormula`` on this fixture.

    The mean of the simulated ``Y`` column is compared with the time-fixed
    marginal outcome to ``rtol=1e-3``.
    """
    outcome_spec = 'A + W1_sq + W2 + W3'

    # Estimating monte carlo for single t
    time_varying = TimeVaryGFormula(
        sim_t_fixed_data, idvar='id', exposure='A', outcome='Y', time_out='t', time_in='t0'
    )
    time_varying.outcome_model(outcome_spec, print_results=False)
    time_varying.exposure_model('W1_sq', print_results=False)
    # Keep this a high number to reduce simulation errors
    time_varying.fit(treatment="all", sample=1000000)

    # Estimating with TimeFixedGFormula
    time_fixed = TimeFixedGFormula(sim_t_fixed_data, exposure='A', outcome='Y')
    time_fixed.outcome_model(model=outcome_spec, print_results=False)
    time_fixed.fit(treatment='all')

    # Expected behavior; same results between the estimation methods
    simulated_mean = np.mean(time_varying.predicted_outcomes['Y'])
    npt.assert_allclose(time_fixed.marginal_outcome, simulated_mean, rtol=1e-3)
def test_error_sr_recurrent_outcomes(self):
    """``fit`` must raise ``ValueError`` when one id has ``Y == 1`` at two times.

    A three-row frame for a single id is built in which the outcome is 1 at
    both ``t=1`` and ``t=2``; fitting with SequentialRegression is expected to
    be rejected.
    """
    records = pd.DataFrame({
        'id': [1, 1, 1],
        'Y': [0, 1, 1],
        'A': [0, 1, 1],
        't': [0, 1, 2],
    })
    estimator = TimeVaryGFormula(
        records, idvar='id', exposure='A', outcome='Y', time_out='t', method='SequentialRegression'
    )
    estimator.outcome_model('A', print_results=False)

    with pytest.raises(ValueError):
        estimator.fit(treatment='all')
def test_sequential_regression_for_single_t(self, sim_t_fixed_data):
    """Sequential regression agrees with ``TimeFixedGFormula`` on this fixture.

    ``predicted_outcomes`` from the time-varying estimator is compared with
    the time-fixed marginal outcome using ``assert_allclose`` defaults.
    """
    outcome_spec = 'A + W1_sq + W2 + W3'

    # Estimating sequential regression for single t
    time_varying = TimeVaryGFormula(
        sim_t_fixed_data, idvar='id', exposure='A', outcome='Y', time_out='t',
        method='SequentialRegression'
    )
    time_varying.outcome_model(outcome_spec, print_results=False)
    time_varying.fit(treatment="all")

    # Estimating with TimeFixedGFormula
    time_fixed = TimeFixedGFormula(sim_t_fixed_data, exposure='A', outcome='Y')
    time_fixed.outcome_model(model=outcome_spec, print_results=False)
    time_fixed.fit(treatment='all')

    # Expected behavior; same results between the estimation methods
    npt.assert_allclose(time_fixed.marginal_outcome, time_varying.predicted_outcomes)
def mc_gformula_check():
    """Manual visual check of the Monte Carlo g-formula on the sample data.

    Loads the time-varying sample data, derives lagged, spline and polynomial
    covariates, specifies the exposure, outcome and covariate models, runs
    ``fit`` with ``sample=10000``, and plots one minus the Kaplan-Meier
    survival function of the simulated data against that of the observed
    data. Nothing is asserted or returned; the figure is displayed with
    ``plt.show()``.
    """
    df = load_sample_data(timevary=True)

    # Lagged covariates: the previous row's value, except on each id's first
    # row, where a starting value is substituted.
    first_record = df.groupby('id').cumcount() == 0
    lag_specs = (
        ('lag_art', 'art', 0),
        ('lag_cd4', 'cd4', df['cd40']),
        ('lag_dvl', 'dvl', df['dvl0']),
    )
    for lag_name, source, start in lag_specs:
        df[lag_name] = np.where(first_record, start, df[source].shift(1))

    # age spline
    df[['age_rs0', 'age_rs1', 'age_rs2']] = spline(df, 'age0', n_knots=4, term=2, restricted=True)

    # Squared and cubed terms for baseline cd4, current cd4 and entry time.
    for base in ('cd40', 'cd4', 'enter'):
        df[base + '_sq'] = df[base] ** 2
        df[base + '_cu'] = df[base] ** 3

    g = TimeVaryGFormula(df, idvar='id', exposure='art', outcome='dead', time_in='enter', time_out='out')

    # Model formulas are assembled from shared fragments.
    baseline_terms = 'male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0'
    current_terms = ' + cd4 + cd4_sq + cd4_cu + dvl + enter + enter_sq + enter_cu'
    lagged_terms = ' + lag_cd4 + lag_dvl + lag_art + enter + enter_sq + enter_cu'
    exp_m = baseline_terms + current_terms
    out_m = 'art + ' + exp_m
    dvl_m = baseline_terms + lagged_terms
    cd4_m = baseline_terms + lagged_terms

    g.exposure_model(exp_m, restriction="g['lag_art']==0")
    g.outcome_model(out_m, restriction="g['drop']==0")
    g.add_covariate_model(label=1, covariate='dvl', model=dvl_m, var_type='binary')

    # Recode applied with the cd4 covariate model: floor cd4 at 1 and
    # recompute its squared and cubed terms.
    cd4_recode_scheme = ("g['cd4'] = np.maximum(g['cd4'],1);"
                         "g['cd4_sq'] = g['cd4']**2;"
                         "g['cd4_cu'] = g['cd4']**3")
    g.add_covariate_model(label=2, covariate='cd4', model=cd4_m, recode=cd4_recode_scheme,
                          var_type='continuous')

    lag_mapping = {'art': 'lag_art', 'cd4': 'lag_cd4', 'dvl': 'lag_dvl'}
    entry_recode = ("g['enter_sq'] = g['enter']**2;"
                    "g['enter_cu'] = g['enter']**3")
    g.fit(treatment="((g['art']==1) | (g['lag_art']==1))",
          lags=lag_mapping,
          sample=10000,
          t_max=None,
          in_recode=entry_recode)

    simulated = g.predicted_outcomes
    # Keep the rows whose uid_g_zepid differs from the next row's, i.e. the
    # last row of each run of identical uid_g_zepid values.
    is_last_row = simulated.uid_g_zepid != simulated.uid_g_zepid.shift(-1)
    final_rows = simulated.loc[is_last_row].copy()

    km_natural = KaplanMeierFitter()
    km_natural.fit(durations=final_rows['out'], event_observed=final_rows['dead'])
    km_observed = KaplanMeierFitter()
    km_observed.fit(durations=df['out'], event_observed=df['dead'], entry=df['enter'])

    plt.step(km_natural.event_table.index, 1 - km_natural.survival_function_,
             c='g', where='post', label='Natural')
    plt.step(km_observed.event_table.index, 1 - km_observed.survival_function_,
             c='k', where='post', label='True')
    plt.legend()
    plt.show()
def test_complete_mc_procedure_completes(self):
    """Run the full Monte Carlo workflow on the sample data end to end.

    Derives lagged, spline and polynomial covariates, specifies the exposure,
    outcome and two covariate models, calls ``fit`` with ``sample=10000``, and
    asserts only that ``predicted_outcomes`` is a ``pandas.DataFrame``.
    """
    df = load_sample_data(timevary=True)

    # Lagged covariates: the previous row's value, except on each id's first
    # row, where a starting value is substituted.
    is_first_row = df.groupby('id').cumcount() == 0
    lag_definitions = (
        ('lag_art', 'art', 0),
        ('lag_cd4', 'cd4', df['cd40']),
        ('lag_dvl', 'dvl', df['dvl0']),
    )
    for lag_column, source_column, initial_value in lag_definitions:
        df[lag_column] = np.where(is_first_row, initial_value, df[source_column].shift(1))

    # age spline
    df[['age_rs0', 'age_rs1', 'age_rs2']] = spline(df, 'age0', n_knots=4, term=2, restricted=True)

    # Squared and cubed terms for baseline cd4, current cd4 and entry time.
    for column in ('cd40', 'cd4', 'enter'):
        df[column + '_sq'] = df[column]**2
        df[column + '_cu'] = df[column]**3

    g = TimeVaryGFormula(df, idvar='id', exposure='art', outcome='dead', time_in='enter', time_out='out')

    # Model formulas are assembled from shared fragments.
    fixed_part = 'male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0'
    current_part = ' + cd4 + cd4_sq + cd4_cu + dvl + enter + enter_sq + enter_cu'
    lagged_part = ' + lag_cd4 + lag_dvl + lag_art + enter + enter_sq + enter_cu'
    exp_m = fixed_part + current_part
    out_m = 'art + ' + exp_m
    dvl_m = fixed_part + lagged_part
    cd4_m = fixed_part + lagged_part

    g.exposure_model(exp_m, restriction="g['lag_art']==0")
    g.outcome_model(out_m, restriction="g['drop']==0")
    g.add_covariate_model(label=1, covariate='dvl', model=dvl_m, var_type='binary')

    # Recode applied with the cd4 covariate model: floor cd4 at 1 and
    # recompute its squared and cubed terms.
    cd4_recode_scheme = ("g['cd4'] = np.maximum(g['cd4'],1);"
                         "g['cd4_sq'] = g['cd4']**2;"
                         "g['cd4_cu'] = g['cd4']**3")
    g.add_covariate_model(label=2, covariate='cd4', model=cd4_m, recode=cd4_recode_scheme,
                          var_type='continuous')

    lag_lookup = {'art': 'lag_art', 'cd4': 'lag_cd4', 'dvl': 'lag_dvl'}
    entry_recode = ("g['enter_sq'] = g['enter']**2;"
                    "g['enter_cu'] = g['enter']**3")
    g.fit(treatment="((g['art']==1) | (g['lag_art']==1))",
          lags=lag_lookup,
          sample=10000,
          t_max=None,
          in_recode=entry_recode)

    assert isinstance(g.predicted_outcomes, pd.DataFrame)