def dml_plr_fixture(generate_data1, learner, score, dml_procedure):
    """Fit a DoubleMLPLR model and a manual PLR implementation on identical
    sample splits; collect coefficients, SEs and bootstrap results for
    comparison in the tests."""
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 502

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)
    dml_plr_obj.fit()

    # re-seed so the manually drawn splits match the package-internal ones
    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    n_obs = len(y)
    all_smpls = draw_smpls(n_obs, n_folds)

    res_manual = fit_plr(y, x, d, clone(learner), clone(learner),
                         all_smpls, dml_procedure, score)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual['theta'],
                'se': dml_plr_obj.se,
                'se_manual': res_manual['se'],
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(y, d, res_manual['thetas'], res_manual['ses'],
                                           res_manual['all_g_hat'], res_manual['all_m_hat'],
                                           all_smpls, score, bootstrap, n_rep_boot)

        # same seed for the package bootstrap so both draws coincide
        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_cluster_with_index(generate_data1, learner, dml_procedure):
    """Compare a plain PLR fit with a one-way-clustered fit in which every
    observation forms its own cluster; the two estimates must coincide."""
    # in the one-way cluster case with exactly one observation per cluster,
    # we get the same result w & w/o clustering
    n_folds = 2

    raw_data = generate_data1
    covariate_cols = raw_data.columns[raw_data.columns.str.startswith('X')].tolist()

    # nuisance learners for m & g
    learner_g = clone(learner)
    learner_m = clone(learner)

    plain_data = dml.DoubleMLData(raw_data, 'y', ['d'], covariate_cols)
    np.random.seed(3141)
    dml_plr_obj = dml.DoubleMLPLR(plain_data, learner_g, learner_m,
                                  n_folds,
                                  dml_procedure=dml_procedure)
    dml_plr_obj.fit()

    # one cluster per observation: the row index serves as cluster variable
    indexed = raw_data.reset_index()
    dml_cluster_data = dml.DoubleMLClusterData(indexed,
                                               y_col='y',
                                               d_cols='d',
                                               x_cols=covariate_cols,
                                               cluster_cols='index')
    # identical seed -> identical sample splits for the clustered fit
    np.random.seed(3141)
    dml_plr_cluster_obj = dml.DoubleMLPLR(dml_cluster_data, learner_g, learner_m,
                                          n_folds,
                                          dml_procedure=dml_procedure)
    dml_plr_cluster_obj.fit()

    return {'coef': dml_plr_obj.coef,
            'coef_manual': dml_plr_cluster_obj.coef,
            'se': dml_plr_obj.se,
            'se_manual': dml_plr_cluster_obj.se}
def dml_plr_reestimate_fixture(generate_data1, learner, score, dml_procedure, n_rep):
    """Check that wiping the stored estimates and re-running the
    causal-parameter aggregation step reproduces a fresh fit."""
    n_folds = 3

    df = generate_data1
    feat_cols = df.columns[df.columns.str.startswith('X')].tolist()

    # nuisance learners for m & g
    learner_g = clone(learner)
    learner_m = clone(learner)

    np.random.seed(3141)
    dml_data = dml.DoubleMLData(df, 'y', ['d'], feat_cols)
    dml_plr_obj = dml.DoubleMLPLR(dml_data, learner_g, learner_m,
                                  n_folds, n_rep, score, dml_procedure)
    dml_plr_obj.fit()

    # identical seed -> identical splits & nuisance fits for the second object
    np.random.seed(3141)
    dml_plr_obj2 = dml.DoubleMLPLR(dml_data, learner_g, learner_m,
                                   n_folds, n_rep, score, dml_procedure)
    dml_plr_obj2.fit()
    # invalidate the stored estimates, then re-aggregate from the score parts
    dml_plr_obj2._coef[0] = np.nan
    dml_plr_obj2._se[0] = np.nan
    dml_plr_obj2._est_causal_pars_and_se()

    return {'coef': dml_plr_obj.coef,
            'coef2': dml_plr_obj2.coef,
            'se': dml_plr_obj.se,
            'se2': dml_plr_obj2.se}
def dml_plr_smpls_fixture(generate_data1, learner, score, dml_procedure, n_rep):
    """Verify that feeding the sample splits of a fitted model into a second
    model via ``set_sample_splitting`` reproduces coefficient and SE."""
    n_folds = 3

    df = generate_data1
    feat_cols = df.columns[df.columns.str.startswith('X')].tolist()

    # nuisance learners for m & g
    learner_g = clone(learner)
    learner_m = clone(learner)

    np.random.seed(3141)
    dml_data = dml.DoubleMLData(df, 'y', ['d'], feat_cols)
    dml_plr_obj = dml.DoubleMLPLR(dml_data, learner_g, learner_m,
                                  n_folds, n_rep, score, dml_procedure)
    dml_plr_obj.fit()
    drawn_smpls = dml_plr_obj.smpls

    # second model: suppress internal split drawing and reuse the splits above
    dml_plr_obj2 = dml.DoubleMLPLR(dml_data, learner_g, learner_m,
                                   score=score,
                                   dml_procedure=dml_procedure,
                                   draw_sample_splitting=False)
    dml_plr_obj2.set_sample_splitting(drawn_smpls)
    dml_plr_obj2.fit()

    return {'coef': dml_plr_obj.coef,
            'coef2': dml_plr_obj2.coef,
            'se': dml_plr_obj.se,
            'se2': dml_plr_obj2.se}
def dml_plr_binary_classifier_fixture(learner, score, dml_procedure):
    """Fit DoubleMLPLR on the module-level ``bonus_data`` with the given
    learner for the treatment nuisance m and compare against the manual
    ``fit_plr`` implementation on identical splits."""
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 502

    # Set machine learning methods for m & g
    ml_g = Lasso()
    ml_m = clone(learner)

    np.random.seed(3141)
    dml_plr_obj = dml.DoubleMLPLR(bonus_data, ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)
    dml_plr_obj.fit()

    # re-seed so the manually drawn splits match the package-internal ones
    np.random.seed(3141)
    y = bonus_data.y
    x = bonus_data.x
    d = bonus_data.d
    n_obs = len(y)
    all_smpls = draw_smpls(n_obs, n_folds)

    res_manual = fit_plr(y, x, d, clone(ml_g), clone(ml_m),
                         all_smpls, dml_procedure, score)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual['theta'],
                'se': dml_plr_obj.se,
                'se_manual': res_manual['se'],
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(y, d, res_manual['thetas'], res_manual['ses'],
                                           res_manual['all_g_hat'], res_manual['all_m_hat'],
                                           all_smpls, score, bootstrap, n_rep_boot)

        # same seed for the package bootstrap so both draws coincide
        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_pyvsr_fixture(generate_data1, idx, score, dml_procedure):
    """Fit the Python DoubleMLPLR model and the corresponding R implementation
    on identical data and sample splits; return both sets of estimates.

    Cleanup: removed the unused local ``n_rep_boot`` (no bootstrap is run in
    this fixture) and a dead commented-out seed call.
    """
    n_folds = 2

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g (plain OLS on both sides)
    learner = LinearRegression()
    ml_g = clone(learner)
    ml_m = clone(learner)

    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)
    dml_plr_obj.fit()

    # fit the DML model in R on the exact same sample splits
    all_train, all_test = export_smpl_split_to_r(dml_plr_obj.smpls[0])
    r_dataframe = pandas2ri.py2rpy(data)
    res_r = r_MLPLR(r_dataframe, score, dml_procedure, all_train, all_test)

    res_dict = {'coef_py': dml_plr_obj.coef,
                'coef_r': res_r[0],
                'se_py': dml_plr_obj.se,
                'se_r': res_r[1]}

    return res_dict
def dml_plr_rep_no_cross_fit_fixture(generate_data1, idx, learner, score, n_rep):
    """Repeated sample splitting without cross-fitting (dml1): compare the
    package estimate (median aggregation over repetitions) against a manual
    implementation that uses only the first fold of each repetition."""
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 498
    dml_procedure = 'dml1'

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m,
                                  n_folds, n_rep, score, dml_procedure,
                                  apply_cross_fitting=False)
    dml_plr_obj.fit()

    # re-seed so the manual KFold draws match the package-internal ones
    np.random.seed(3141)
    y = data['y'].values
    X = data.loc[:, X_cols].values
    d = data['d'].values

    all_smpls = []
    for i_rep in range(n_rep):
        resampling = KFold(n_splits=n_folds, shuffle=True)
        smpls = [(train, test) for train, test in resampling.split(X)]
        all_smpls.append(smpls)
    # adapt to do no-cross-fitting in each repetition (keep the first fold only)
    all_smpls = [[xx[0]] for xx in all_smpls]

    thetas = np.zeros(n_rep)
    ses = np.zeros(n_rep)
    all_g_hat = list()
    all_m_hat = list()
    for i_rep in range(n_rep):
        smpls = all_smpls[i_rep]
        g_hat, m_hat = fit_nuisance_plr(y, X, d, clone(learner), clone(learner), smpls)
        all_g_hat.append(g_hat)
        all_m_hat.append(m_hat)
        thetas[i_rep], ses[i_rep] = plr_dml1(y, X, d,
                                             all_g_hat[i_rep], all_m_hat[i_rep],
                                             smpls, score)

    # median aggregation over repetitions; the SE combines the per-repetition
    # variances with the dispersion of the estimates around the median
    # (len(smpls[0][1]) is the size of the single test set of the last rep)
    res_manual = np.median(thetas)
    se_manual = np.sqrt(
        np.median(
            np.power(ses, 2) * len(smpls[0][1]) + np.power(thetas - res_manual, 2)) / len(smpls[0][1]))

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        all_boot_theta = list()
        all_boot_t_stat = list()
        for i_rep in range(n_rep):
            smpls = all_smpls[i_rep]
            boot_theta, boot_t_stat = boot_plr(thetas[i_rep], y, d,
                                               all_g_hat[i_rep], all_m_hat[i_rep],
                                               smpls, score,
                                               ses[i_rep],
                                               bootstrap, n_rep_boot,
                                               dml_procedure,
                                               apply_cross_fitting=False)
            all_boot_theta.append(boot_theta)
            all_boot_t_stat.append(boot_t_stat)
        boot_theta = np.hstack(all_boot_theta)
        boot_t_stat = np.hstack(all_boot_t_stat)

        # same seed for the package bootstrap so both draws coincide
        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_fixture(generate_data1, learner, score, dml_procedure, n_rep):
    """Repeated sample splitting with cross-fitting: compare DoubleMLPLR
    (``n_rep`` repetitions) against a manual implementation with median
    aggregation over the repetitions."""
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 498

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m,
                                  n_folds, n_rep, score, dml_procedure)
    dml_plr_obj.fit()

    # re-seed so the manual KFold draws match the package-internal ones
    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    n_obs = len(y)

    all_smpls = []
    for i_rep in range(n_rep):
        resampling = KFold(n_splits=n_folds, shuffle=True)
        smpls = [(train, test) for train, test in resampling.split(x)]
        all_smpls.append(smpls)

    thetas = np.zeros(n_rep)
    ses = np.zeros(n_rep)
    all_g_hat = list()
    all_m_hat = list()
    for i_rep in range(n_rep):
        smpls = all_smpls[i_rep]
        g_hat, m_hat = fit_nuisance_plr(y, x, d, clone(learner), clone(learner), smpls)
        all_g_hat.append(g_hat)
        all_m_hat.append(m_hat)
        if dml_procedure == 'dml1':
            thetas[i_rep], ses[i_rep] = plr_dml1(y, x, d,
                                                 all_g_hat[i_rep], all_m_hat[i_rep],
                                                 smpls, score)
        else:
            assert dml_procedure == 'dml2'
            thetas[i_rep], ses[i_rep] = plr_dml2(y, x, d,
                                                 all_g_hat[i_rep], all_m_hat[i_rep],
                                                 smpls, score)

    # median aggregation; the SE combines per-repetition variances with the
    # dispersion of the estimates around the aggregated estimate
    res_manual = np.median(thetas)
    se_manual = np.sqrt(np.median(np.power(ses, 2)*n_obs + np.power(thetas - res_manual, 2))/n_obs)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        all_boot_theta = list()
        all_boot_t_stat = list()
        for i_rep in range(n_rep):
            smpls = all_smpls[i_rep]
            boot_theta, boot_t_stat = boot_plr(thetas[i_rep], y, d,
                                               all_g_hat[i_rep], all_m_hat[i_rep],
                                               smpls, score,
                                               ses[i_rep],
                                               bootstrap, n_rep_boot,
                                               dml_procedure)
            all_boot_theta.append(boot_theta)
            all_boot_t_stat.append(boot_t_stat)
        boot_theta = np.hstack(all_boot_theta)
        boot_t_stat = np.hstack(all_boot_t_stat)

        # same seed for the package bootstrap so both draws coincide
        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
# Load the Pennsylvania bonus demo data set shipped with DoubleML
dml_data = dml.datasets.fetch_bonus()
dml_data.data.head()

# %%
# Specify learner and estimate causal parameter: PLR model with random forest as learner
# --------------------------------------------------------------------------------------

# Set machine learning methods for m & g
ml_g = RandomForestRegressor()
ml_m = RandomForestRegressor()
n_folds = 2
n_rep = 10

np.random.seed(3141)
dml_plr_rf = dml.DoubleMLPLR(dml_data, ml_g, ml_m,
                             n_folds, n_rep,
                             'partialling out', 'dml2')

# set some hyperparameters for the learners
# ('tg' is the treatment variable of the bonus data this applies to)
pars = {'n_estimators': 500, 'max_features': 'sqrt', 'max_depth': 5}
dml_plr_rf.set_ml_nuisance_params('ml_g', 'tg', pars)
dml_plr_rf.set_ml_nuisance_params('ml_m', 'tg', pars)

# %%
#

dml_plr_rf.fit()
dml_plr_rf.summary

# %%
#
def dml_plr_no_cross_fit_tune_fixture(generate_data1, learner, score, tune_on_folds):
    """Hyperparameter tuning combined with no-cross-fitting (dml1): compare
    the tuned DoubleMLPLR fit against a manual tune + fit on the first fold."""
    par_grid = {'ml_g': {'alpha': np.linspace(0.05, .95, 7)},
                'ml_m': {'alpha': np.linspace(0.05, .95, 7)}}
    n_folds_tune = 3
    boot_methods = ['normal']
    n_rep_boot = 502
    dml_procedure = 'dml1'

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = Lasso()
    ml_m = Lasso()

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m,
                                  n_folds=2,
                                  score=score,
                                  dml_procedure=dml_procedure,
                                  apply_cross_fitting=False)

    # tune hyperparameters
    _ = dml_plr_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune)

    # fit with tuned parameters
    dml_plr_obj.fit()

    # re-seed so the manual splits match the package-internal ones
    np.random.seed(3141)
    y = obj_dml_data.y
    x = obj_dml_data.x
    d = obj_dml_data.d
    n_obs = len(y)
    all_smpls = draw_smpls(n_obs, 2)
    smpls = all_smpls[0]
    # no cross-fitting: keep only the first fold
    smpls = [smpls[0]]

    if tune_on_folds:
        g_params, m_params = tune_nuisance_plr(y, x, d,
                                               clone(ml_g), clone(ml_m), smpls, n_folds_tune,
                                               par_grid['ml_g'], par_grid['ml_m'])
    else:
        # tune on the full sample (empty test set)
        xx = [(np.arange(len(y)), np.array([]))]
        g_params, m_params = tune_nuisance_plr(y, x, d,
                                               clone(ml_g), clone(ml_m), xx, n_folds_tune,
                                               par_grid['ml_g'], par_grid['ml_m'])

    # note: ml_g and ml_m are identical default Lasso instances, so the
    # clone(ml_m), clone(ml_g) argument order is immaterial here
    res_manual = fit_plr(y, x, d, clone(ml_m), clone(ml_g), [smpls],
                         dml_procedure, score,
                         g_params=g_params, m_params=m_params)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual['theta'],
                'se': dml_plr_obj.se,
                'se_manual': res_manual['se'],
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(y, d, res_manual['thetas'], res_manual['ses'],
                                           res_manual['all_g_hat'], res_manual['all_m_hat'],
                                           [smpls], score, bootstrap, n_rep_boot,
                                           apply_cross_fitting=False)

        # same seed for the package bootstrap so both draws coincide
        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_fixture(generate_data2, learner_g, learner_m, score, dml_procedure, tune_on_folds):
    """Hyperparameter tuning with cross-fitting: compare the tuned
    DoubleMLPLR fit against a manual tune + nuisance fit + dml1/dml2 step."""
    par_grid = {'ml_g': get_par_grid(learner_g),
                'ml_m': get_par_grid(learner_m)}
    n_folds_tune = 4

    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 502

    # collect data
    obj_dml_data = generate_data2

    # Set machine learning methods for m & g
    ml_g = clone(learner_g)
    ml_m = clone(learner_m)

    np.random.seed(3141)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    # tune hyperparameters
    _ = dml_plr_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune)

    # fit with tuned parameters
    dml_plr_obj.fit()

    # re-seed so the manual KFold draw matches the package-internal one
    np.random.seed(3141)
    y = obj_dml_data.y
    x = obj_dml_data.x
    d = obj_dml_data.d
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(x)]

    # NOTE(review): the learners are passed m-first (clone(learner_m),
    # clone(learner_g)) while a sibling fixture passes them g-first --
    # confirm against the signatures of tune_nuisance_plr / fit_nuisance_plr
    if tune_on_folds:
        g_params, m_params = tune_nuisance_plr(y, x, d,
                                               clone(learner_m), clone(learner_g), smpls, n_folds_tune,
                                               par_grid['ml_g'], par_grid['ml_m'])
        g_hat, m_hat = fit_nuisance_plr(y, x, d,
                                        clone(learner_m), clone(learner_g), smpls,
                                        g_params, m_params)
    else:
        # tune on the full sample (empty test set), then replicate the tuned
        # parameters for every fold
        xx = [(np.arange(len(y)), np.array([]))]
        g_params, m_params = tune_nuisance_plr(y, x, d,
                                               clone(learner_m), clone(learner_g), xx, n_folds_tune,
                                               par_grid['ml_g'], par_grid['ml_m'])
        g_hat, m_hat = fit_nuisance_plr(y, x, d,
                                        clone(learner_m), clone(learner_g), smpls,
                                        g_params * n_folds, m_params * n_folds)

    if dml_procedure == 'dml1':
        res_manual, se_manual = plr_dml1(y, x, d, g_hat, m_hat, smpls, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = plr_dml2(y, x, d, g_hat, m_hat, smpls, score)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(res_manual, y, d, g_hat, m_hat,
                                           smpls, score, se_manual,
                                           bootstrap, n_rep_boot,
                                           dml_procedure)

        # same seed for the package bootstrap so both draws coincide
        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_ols_manual_fixture(generate_data1, score, dml_procedure):
    """Fully manual benchmark: OLS via ``scipy.linalg.lstsq`` (with an
    explicit intercept column) as nuisance learner on a deterministic 50/50
    split, compared against DoubleMLPLR with LinearRegression on the same
    split."""
    learner = LinearRegression()
    boot_methods = ['Bayes', 'normal', 'wild']
    n_folds = 2
    n_rep_boot = 501

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    # deterministic split: fold 1 trains on the second half and tests on the
    # first half; fold 2 the other way around
    n = data.shape[0]
    this_smpl = list()
    xx = int(n / 2)
    this_smpl.append((np.arange(xx, n), np.arange(0, xx)))
    this_smpl.append((np.arange(0, xx), np.arange(xx, n)))
    smpls = [this_smpl]
    dml_plr_obj.set_sample_splitting(smpls)

    dml_plr_obj.fit()

    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values

    # add column of ones for intercept
    o = np.ones((n, 1))
    x = np.append(x, o, axis=1)

    smpls = dml_plr_obj.smpls[0]

    # out-of-sample OLS predictions for the outcome nuisance g
    g_hat = []
    for (train_index, test_index) in smpls:
        ols_est = scipy.linalg.lstsq(x[train_index], y[train_index])[0]
        g_hat.append(np.dot(x[test_index], ols_est))

    # out-of-sample OLS predictions for the treatment nuisance m
    m_hat = []
    for (train_index, test_index) in smpls:
        ols_est = scipy.linalg.lstsq(x[train_index], d[train_index])[0]
        m_hat.append(np.dot(x[test_index], ols_est))

    if dml_procedure == 'dml1':
        res_manual, se_manual = plr_dml1(y, x, d, g_hat, m_hat, smpls, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = plr_dml2(y, x, d, g_hat, m_hat, smpls, score)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(y, d, [res_manual], [se_manual],
                                           [g_hat], [m_hat],
                                           [smpls], score, bootstrap, n_rep_boot)

        # same seed for the package bootstrap so both draws coincide
        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_fixture(generate_data1, learner, score, dml_procedure):
    """Single-repetition comparison of DoubleMLPLR against a manual PLR fit
    (nuisance fit + dml1/dml2 step + bootstrap) on identical KFold splits."""
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 502

    df = generate_data1
    feat_cols = df.columns[df.columns.str.startswith('X')].tolist()

    # nuisance learners for m & g
    learner_g = clone(learner)
    learner_m = clone(learner)

    np.random.seed(3141)
    dml_data = dml.DoubleMLData(df, 'y', ['d'], feat_cols)
    dml_plr_obj = dml.DoubleMLPLR(dml_data, learner_g, learner_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)
    dml_plr_obj.fit()

    # identical seed -> the manual KFold draw matches the internal one
    np.random.seed(3141)
    y = df['y'].values
    x = df.loc[:, feat_cols].values
    d = df['d'].values
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(x)]

    g_hat, m_hat = fit_nuisance_plr(y, x, d, clone(learner), clone(learner), smpls)

    if dml_procedure == 'dml1':
        res_manual, se_manual = plr_dml1(y, x, d, g_hat, m_hat, smpls, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = plr_dml2(y, x, d, g_hat, m_hat, smpls, score)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(res_manual, y, d, g_hat, m_hat,
                                           smpls, score, se_manual,
                                           bootstrap, n_rep_boot)

        # same seed for the package bootstrap so both draws coincide
        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_fixture(generate_data2, learner_g, learner_m, score, dml_procedure, tune_on_folds):
    """Hyperparameter tuning with cross-fitting via the ``fit_plr`` helper:
    compare the tuned DoubleMLPLR fit against the manual implementation."""
    par_grid = {'ml_g': get_par_grid(learner_g),
                'ml_m': get_par_grid(learner_m)}
    n_folds_tune = 4

    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 502

    # collect data
    obj_dml_data = generate_data2

    # Set machine learning methods for m & g
    ml_g = clone(learner_g)
    ml_m = clone(learner_m)

    np.random.seed(3141)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    # tune hyperparameters
    _ = dml_plr_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune)

    # fit with tuned parameters
    dml_plr_obj.fit()

    # re-seed so the manual splits match the package-internal ones
    np.random.seed(3141)
    y = obj_dml_data.y
    x = obj_dml_data.x
    d = obj_dml_data.d
    n_obs = len(y)
    all_smpls = draw_smpls(n_obs, n_folds)
    smpls = all_smpls[0]

    if tune_on_folds:
        g_params, m_params = tune_nuisance_plr(y, x, d,
                                               clone(learner_g), clone(learner_m), smpls, n_folds_tune,
                                               par_grid['ml_g'], par_grid['ml_m'])
    else:
        # tune on the full sample (empty test set) and replicate the tuned
        # parameters for every fold
        xx = [(np.arange(len(y)), np.array([]))]
        g_params, m_params = tune_nuisance_plr(y, x, d,
                                               clone(learner_g), clone(learner_m), xx, n_folds_tune,
                                               par_grid['ml_g'], par_grid['ml_m'])
        g_params = g_params * n_folds
        m_params = m_params * n_folds

    res_manual = fit_plr(y, x, d, clone(learner_g), clone(learner_m),
                         all_smpls, dml_procedure, score,
                         g_params=g_params, m_params=m_params)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual['theta'],
                'se': dml_plr_obj.se,
                'se_manual': res_manual['se'],
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(y, d, res_manual['thetas'], res_manual['ses'],
                                           res_manual['all_g_hat'], res_manual['all_m_hat'],
                                           all_smpls, score, bootstrap, n_rep_boot)

        # same seed for the package bootstrap so both draws coincide
        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_no_cross_fit_tune_fixture(generate_data1, idx, learner, score, tune_on_folds):
    """Indexed-data variant of the tuning + no-cross-fitting comparison
    (dml1) using a manual KFold split restricted to the first fold."""
    par_grid = {'ml_g': {'alpha': np.linspace(0.05, .95, 7)},
                'ml_m': {'alpha': np.linspace(0.05, .95, 7)}}
    n_folds_tune = 3
    boot_methods = ['normal']
    n_rep_boot = 502
    dml_procedure = 'dml1'

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = Lasso()
    ml_m = Lasso()

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m,
                                  n_folds=2,
                                  score=score,
                                  dml_procedure=dml_procedure,
                                  apply_cross_fitting=False)

    # tune hyperparameters (tuning result itself is not asserted here)
    res_tuning = dml_plr_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune)

    # fit with tuned parameters
    dml_plr_obj.fit()

    # re-seed so the manual KFold draw matches the package-internal one
    np.random.seed(3141)
    y = obj_dml_data.y
    X = obj_dml_data.x
    d = obj_dml_data.d
    resampling = KFold(n_splits=2, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(X)]
    # no cross-fitting: keep only the first fold
    smpls = [smpls[0]]

    # note: ml_g and ml_m are identical default Lasso instances, so passing
    # clone(ml_m), clone(ml_g) is equivalent to the g-first order used elsewhere
    if tune_on_folds:
        g_params, m_params = tune_nuisance_plr(y, X, d,
                                               clone(ml_m), clone(ml_g), smpls, n_folds_tune,
                                               par_grid['ml_g'], par_grid['ml_m'])
        g_hat, m_hat = fit_nuisance_plr(y, X, d,
                                        clone(ml_m), clone(ml_g), smpls,
                                        g_params, m_params)
    else:
        # tune on the full sample (empty test set)
        xx = [(np.arange(len(y)), np.array([]))]
        g_params, m_params = tune_nuisance_plr(y, X, d,
                                               clone(ml_m), clone(ml_g), xx, n_folds_tune,
                                               par_grid['ml_g'], par_grid['ml_m'])
        g_hat, m_hat = fit_nuisance_plr(y, X, d,
                                        clone(ml_m), clone(ml_g), smpls,
                                        g_params, m_params)

    assert dml_procedure == 'dml1'
    res_manual, se_manual = plr_dml1(y, X, d, g_hat, m_hat, smpls, score)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(res_manual, y, d, g_hat, m_hat,
                                           smpls, score, se_manual,
                                           bootstrap, n_rep_boot,
                                           dml_procedure,
                                           apply_cross_fitting=False)

        # same seed for the package bootstrap so both draws coincide
        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_no_cross_fit_fixture(generate_data1, idx, learner, score, n_folds):
    """PLR without cross-fitting (dml1): compare package estimates against a
    manual fit on a single train/test split, or on the full sample when
    ``n_folds == 1``."""
    boot_methods = ['normal']
    n_rep_boot = 502
    dml_procedure = 'dml1'

    df = generate_data1[idx]
    feat_cols = df.columns[df.columns.str.startswith('X')].tolist()

    # nuisance learners for m & g
    learner_g = clone(learner)
    learner_m = clone(learner)

    np.random.seed(3141)
    dml_data = dml.DoubleMLData(df, 'y', ['d'], feat_cols)
    dml_plr_obj = dml.DoubleMLPLR(dml_data, learner_g, learner_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure,
                                  apply_cross_fitting=False)
    dml_plr_obj.fit()

    # identical seed -> the manual split matches the package-internal one
    np.random.seed(3141)
    y = df['y'].values
    X = df.loc[:, feat_cols].values
    d = df['d'].values
    if n_folds == 1:
        # no sample splitting at all: train and evaluate on the full sample
        smpls = [(np.arange(len(y)), np.arange(len(y)))]
    else:
        resampling = KFold(n_splits=n_folds, shuffle=True)
        smpls = [(train, test) for train, test in resampling.split(X)]
        # no cross-fitting: keep only the first fold
        smpls = [smpls[0]]

    g_hat, m_hat = fit_nuisance_plr(y, X, d, clone(learner), clone(learner), smpls)

    assert dml_procedure == 'dml1'
    res_manual, se_manual = plr_dml1(y, X, d, g_hat, m_hat, smpls, score)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(res_manual, y, d, g_hat, m_hat,
                                           smpls, score, se_manual,
                                           bootstrap, n_rep_boot,
                                           dml_procedure,
                                           apply_cross_fitting=False)

        # same seed for the package bootstrap so both draws coincide
        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_multitreat_fixture(generate_data_bivariate, generate_data_toeplitz, idx, learner, score, dml_procedure):
    """Multiple-treatment PLR: for each treatment column the remaining
    treatment columns are added to the controls; compare the manual
    per-treatment estimates against DoubleMLPLR."""
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 483

    # collect data: the first n_datasets indices refer to the bivariate DGP,
    # the remaining ones to the Toeplitz DGP
    if idx < n_datasets:
        data = generate_data_bivariate[idx]
    else:
        data = generate_data_toeplitz[idx-n_datasets]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()
    d_cols = data.columns[data.columns.str.startswith('d')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', d_cols, X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)
    dml_plr_obj.fit()

    # re-seed so the manual KFold draw matches the package-internal one
    np.random.seed(3141)
    y = data['y'].values
    X = data.loc[:, X_cols].values
    d = data.loc[:, d_cols].values
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(X)]

    n_d = d.shape[1]
    coef_manual = np.full(n_d, np.nan)
    se_manual = np.full(n_d, np.nan)
    all_g_hat = []
    all_m_hat = []
    for i_d in range(n_d):
        # treat column i_d as the treatment; all other treatment columns
        # become additional controls
        Xd = np.hstack((X, np.delete(d, i_d, axis=1)))

        g_hat, m_hat = fit_nuisance_plr(y, Xd, d[:, i_d],
                                        clone(learner), clone(learner), smpls)
        all_g_hat.append(g_hat)
        all_m_hat.append(m_hat)

        if dml_procedure == 'dml1':
            coef_manual[i_d], se_manual[i_d] = plr_dml1(y, Xd, d[:, i_d],
                                                        g_hat, m_hat, smpls, score)
        elif dml_procedure == 'dml2':
            coef_manual[i_d], se_manual[i_d] = plr_dml2(y, Xd, d[:, i_d],
                                                        g_hat, m_hat, smpls, score)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': coef_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(coef_manual, y, d,
                                           all_g_hat, all_m_hat,
                                           smpls, score,
                                           se_manual,
                                           bootstrap, n_rep_boot,
                                           dml_procedure)

        # same seed for the package bootstrap so both draws coincide
        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_fixture(generate_data1, idx, score, dml_procedure):
    """Check that setting Lasso's ``alpha`` externally via
    ``set_ml_nuisance_params`` reproduces a fit with learners constructed
    with that ``alpha`` directly (coef, se and bootstrap results).

    Cleanup: removed the unused local ``X_cols`` — it was computed but never
    passed to ``DoubleMLData``, which is constructed without explicit
    ``x_cols`` here.
    """
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 502

    # collect data
    data = generate_data1[idx]

    alpha = 0.05
    learner = Lasso(alpha=alpha)

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    # x_cols omitted: DoubleMLData falls back to its default covariate selection
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'])
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)
    dml_plr_obj.fit()

    np.random.seed(3141)
    # default-parameter learners; alpha is injected afterwards per treatment 'd'
    learner = Lasso()
    ml_g = clone(learner)
    ml_m = clone(learner)
    dml_plr_obj_ext_set_par = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m,
                                              n_folds,
                                              score=score,
                                              dml_procedure=dml_procedure)
    dml_plr_obj_ext_set_par.set_ml_nuisance_params('ml_g', 'd', {'alpha': alpha})
    dml_plr_obj_ext_set_par.set_ml_nuisance_params('ml_m', 'd', {'alpha': alpha})
    dml_plr_obj_ext_set_par.fit()

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': dml_plr_obj_ext_set_par.coef,
                'se': dml_plr_obj.se,
                'se_manual': dml_plr_obj_ext_set_par.se,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(314122)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat

        # same seed so both bootstrap draws coincide
        np.random.seed(314122)
        dml_plr_obj_ext_set_par.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap + '_manual'] = dml_plr_obj_ext_set_par.boot_coef
        res_dict['boot_t_stat' + bootstrap + '_manual'] = dml_plr_obj_ext_set_par.boot_t_stat

    return res_dict
def dml_plr_binary_classifier_fixture(learner, score, dml_procedure):
    """Single-repetition PLR on the module-level ``bonus_data`` where the
    treatment learner may be a classifier; compare against a manual fit on
    identical KFold splits."""
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 502

    # Set machine learning methods for m & g
    ml_g = Lasso()
    ml_m = clone(learner)

    np.random.seed(3141)
    dml_plr_obj = dml.DoubleMLPLR(bonus_data, ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)
    dml_plr_obj.fit()

    # re-seed so the manual KFold draw matches the package-internal one
    np.random.seed(3141)
    y = bonus_data.y
    x = bonus_data.x
    d = bonus_data.d
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(x)]

    # classifier treatment learners get a dedicated nuisance-fitting helper
    if is_classifier(ml_m):
        g_hat, m_hat = fit_nuisance_plr_classifier(y, x, d, clone(ml_m), clone(ml_g), smpls)
    else:
        g_hat, m_hat = fit_nuisance_plr(y, x, d, clone(ml_m), clone(ml_g), smpls)

    if dml_procedure == 'dml1':
        res_manual, se_manual = plr_dml1(y, x, d, g_hat, m_hat, smpls, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = plr_dml2(y, x, d, g_hat, m_hat, smpls, score)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(res_manual, y, d, g_hat, m_hat,
                                           smpls, score, se_manual,
                                           bootstrap, n_rep_boot,
                                           dml_procedure)

        # same seed for the package bootstrap so both draws coincide
        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict