def test_complete_mc_procedure_completes(self):
    """Run the full Monte Carlo g-formula workflow and check the output type.

    Prepares lagged and polynomial columns on the time-varying sample data,
    registers the exposure, outcome, two time-varying covariate models
    (``dvl`` then ``cd4``) and a censoring model, calls ``fit`` and asserts
    that ``predicted_outcomes`` is a ``pandas.DataFrame``.
    """
    data = load_sample_data(timevary=True)

    # Lagged copies of the time-varying columns. The plain shift crosses
    # subject boundaries, so the first row of every ``id`` is overwritten
    # with a starting value (0 for art, the baseline column otherwise).
    first_row = data.groupby('id').cumcount() == 0
    lag_sources = (
        ('art', 0),
        ('cd4', data['cd40']),
        ('dvl', data['dvl0']),
    )
    for column, start_value in lag_sources:
        lag_name = 'lag_' + column
        data[lag_name] = data[column].shift(1)
        data[lag_name] = np.where(first_row, start_value, data[lag_name])

    # Restricted spline terms for baseline age
    data[['age_rs0', 'age_rs1', 'age_rs2']] = spline(
        data, 'age0', n_knots=4, term=2, restricted=True)

    # Squared and cubed terms for baseline CD4, current CD4 and entry time
    for base in ('cd40', 'cd4', 'enter'):
        data[base + '_sq'] = data[base] ** 2
        data[base + '_cu'] = data[base] ** 3

    g = MonteCarloGFormula(data, idvar='id', exposure='art', outcome='dead',
                           time_in='enter', time_out='out')

    # Formula pieces shared between the models
    baseline_terms = ('male + age0 + age_rs0 + age_rs1 + age_rs2 + '
                      'cd40 + cd40_sq + cd40_cu + dvl0')
    current_terms = ' + cd4 + cd4_sq + cd4_cu + dvl'
    lagged_terms = ' + lag_cd4 + lag_dvl + lag_art'
    entry_terms = ' + enter + enter_sq + enter_cu'

    exp_m = baseline_terms + current_terms + entry_terms
    g.exposure_model(exp_m, restriction="g['lag_art']==0")

    out_m = 'art + ' + baseline_terms + current_terms + entry_terms
    g.outcome_model(out_m, restriction="g['drop']==0")

    dvl_m = baseline_terms + lagged_terms + entry_terms
    g.add_covariate_model(label=1, covariate='dvl', model=dvl_m, var_type='binary')

    cd4_m = baseline_terms + lagged_terms + entry_terms
    cd4_recode_scheme = ";".join([
        "g['cd4'] = np.maximum(g['cd4'],1)",
        "g['cd4_sq'] = g['cd4']**2",
        "g['cd4_cu'] = g['cd4']**3",
    ])
    g.add_covariate_model(label=2, covariate='cd4', model=cd4_m,
                          recode=cd4_recode_scheme, var_type='continuous')

    # The censoring formula text carries a line break before its last term.
    cens_m = baseline_terms + lagged_terms + ' + enter + enter_sq + \nenter_cu'
    g.censoring_model(cens_m)

    entry_recode = ";".join([
        "g['enter_sq'] = g['enter']**2",
        "g['enter_cu'] = g['enter']**3",
    ])
    g.fit(
        treatment="((g['art']==1) | (g['lag_art']==1))",
        lags={'art': 'lag_art', 'cd4': 'lag_cd4', 'dvl': 'lag_dvl'},
        sample=5000,
        t_max=None,
        in_recode=entry_recode,
    )

    assert isinstance(g.predicted_outcomes, pd.DataFrame)
def mc_gformula_check():
    """Plot Monte Carlo g-formula predictions against the observed data.

    The g-formula is fitted twice on the time-varying sample data: first
    without a censoring model ('Natural'), then again on the same object
    after a censoring model has been added ('Censor'). A Kaplan-Meier fit
    of each set of predicted outcomes, and of the observed data with
    delayed entry ('True'), is drawn as 1 - survival on one figure, which
    is then shown with ``plt.show()``.

    Takes no arguments and returns nothing; the only output is the plot.
    """
    df = load_sample_data(timevary=True)

    # Lagged columns: shift by one row, then reset the first row of each id
    # to a starting value (0 for art, the baseline measurement otherwise).
    df['lag_art'] = df['art'].shift(1)
    df['lag_art'] = np.where(df.groupby('id').cumcount() == 0, 0, df['lag_art'])
    df['lag_cd4'] = df['cd4'].shift(1)
    df['lag_cd4'] = np.where(df.groupby('id').cumcount() == 0, df['cd40'], df['lag_cd4'])
    df['lag_dvl'] = df['dvl'].shift(1)
    df['lag_dvl'] = np.where(df.groupby('id').cumcount() == 0, df['dvl0'], df['lag_dvl'])

    df[['age_rs0', 'age_rs1', 'age_rs2']] = spline(df, 'age0', n_knots=4, term=2, restricted=True)  # age spline
    df['cd40_sq'] = df['cd40'] ** 2  # cd4 baseline cubic
    df['cd40_cu'] = df['cd40'] ** 3
    df['cd4_sq'] = df['cd4'] ** 2  # cd4 current cubic
    df['cd4_cu'] = df['cd4'] ** 3
    df['enter_sq'] = df['enter'] ** 2  # entry time cubic
    df['enter_cu'] = df['enter'] ** 3

    g = MonteCarloGFormula(df, idvar='id', exposure='art', outcome='dead', time_in='enter', time_out='out')

    exp_m = '''male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + cd4 + cd4_sq + cd4_cu + dvl + enter + enter_sq + enter_cu'''
    g.exposure_model(exp_m, restriction="g['lag_art']==0")

    out_m = '''art + male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + cd4 + cd4_sq + cd4_cu + dvl + enter + enter_sq + enter_cu'''
    g.outcome_model(out_m, restriction="g['drop']==0")

    dvl_m = '''male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 + lag_dvl + lag_art + enter + enter_sq + enter_cu'''
    g.add_covariate_model(label=1, covariate='dvl', model=dvl_m, var_type='binary')

    cd4_m = '''male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 + lag_dvl + lag_art + enter + enter_sq + enter_cu'''
    cd4_recode_scheme = ("g['cd4'] = np.maximum(g['cd4'],1);"
                         "g['cd4_sq'] = g['cd4']**2;"
                         "g['cd4_cu'] = g['cd4']**3")
    g.add_covariate_model(label=2, covariate='cd4', model=cd4_m, recode=cd4_recode_scheme, var_type='continuous')

    # Both fits use exactly the same arguments; only the presence of the
    # censoring model differs between them.
    fit_kwargs = dict(
        treatment="((g['art']==1) | (g['lag_art']==1))",
        lags={'art': 'lag_art', 'cd4': 'lag_cd4', 'dvl': 'lag_dvl'},
        sample=10000,
        t_max=None,
        in_recode=("g['enter_sq'] = g['enter']**2;"
                   "g['enter_cu'] = g['enter']**3"),
    )

    # Fit 1: no censoring model
    g.fit(**fit_kwargs)
    gf = g.predicted_outcomes
    kmn = KaplanMeierFitter()
    kmn.fit(durations=gf['out'], event_observed=gf['dead'])

    # Observed data, accounting for delayed entry
    kmo = KaplanMeierFitter()
    kmo.fit(durations=df['out'], event_observed=df['dead'], entry=df['enter'])

    # Fit 2: same models plus a censoring model
    cens_m = """male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 + lag_dvl + lag_art + enter + enter_sq + enter_cu"""
    g.censoring_model(cens_m)
    g.fit(**fit_kwargs)
    gf = g.predicted_outcomes
    kmc = KaplanMeierFitter()
    kmc.fit(durations=gf['out'], event_observed=gf['dead'])

    # Each curve is drawn against its own fitter's timeline, so the x and y
    # values always have matching lengths and positions.
    plt.step(kmn.event_table.index, 1 - kmn.survival_function_, c='g', where='post', label='Natural')
    plt.step(kmc.event_table.index, 1 - kmc.survival_function_, c='orange', where='post', label='Censor')
    plt.step(kmo.event_table.index, 1 - kmo.survival_function_, c='k', where='post', label='True')
    plt.legend()
    plt.show()
# Semicolon-separated recode statements for CD4: floor the value at 1, then
# rebuild the squared and cubed terms from the floored value.
cd4_recode_scheme = ("g['cd4'] = np.maximum(g['cd4'], 1);"
                     "g['cd4_sq'] = g['cd4']**2;"
                     "g['cd4_cu'] = g['cd4']**3")
mcgf.add_covariate_model(
    label=2,  # Order to fit time-varying models in
    covariate='cd4',  # Time-varying confounder
    model=cd4_m,
    print_results=False,
    recode=cd4_recode_scheme,  # Recoding process to use for each iteration of MCMC
    var_type='continuous')  # Variable type

# Pooled Logistic Model: Censoring
cens_m = (
    "male + age0 + age_rs0 + age_rs1 + age_rs2 + cd40 + cd40_sq + cd40_cu + dvl0 + lag_cd4 + "
    + "lag_dvl + lag_art + enter + enter_sq + enter_cu")
mcgf.censoring_model(cens_m, print_results=False)

mcgf.fit(
    treatment="((g['art']==1) | (g['lag_art']==1))",  # Treatment plan
    lags={
        'art': 'lag_art',  # Lagged variables to create each loop
        'cd4': 'lag_cd4',
        'dvl': 'lag_dvl'
    },
    in_recode=(
        "g['enter_sq'] = g['enter']**2;"  # Recode statement to execute at the start
        "g['enter_cu'] = g['enter']**3"),
    sample=20000
)  # Number of resamples from population (should be large number)

# Accessing predicted outcome values