Example #1
def dml_plr_fixture(generate_data1, learner, score, dml_procedure):
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 502

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    dml_plr_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    n_obs = len(y)
    all_smpls = draw_smpls(n_obs, n_folds)

    res_manual = fit_plr(y, x, d, clone(learner), clone(learner), all_smpls,
                         dml_procedure, score)

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': res_manual['theta'],
        'se': dml_plr_obj.se,
        'se_manual': res_manual['se'],
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(y, d, res_manual['thetas'],
                                           res_manual['ses'],
                                           res_manual['all_g_hat'],
                                           res_manual['all_m_hat'], all_smpls,
                                           score, bootstrap, n_rep_boot)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
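
The fixture above only assembles results; the assertions typically live in separate test functions. Below is a minimal sketch of how such a test could consume the returned dictionary, assuming the function is registered as a pytest fixture (the @pytest.fixture decorators are not shown in this listing); the test names and tolerances are purely illustrative:

import numpy as np


def test_dml_plr_coef(dml_plr_fixture):
    # package estimate vs. the manual cross-fitting implementation
    assert np.allclose(dml_plr_fixture['coef'],
                       dml_plr_fixture['coef_manual'],
                       rtol=1e-9, atol=1e-4)


def test_dml_plr_boot(dml_plr_fixture):
    # bootstrapped coefficients, for every bootstrap method stored in the fixture
    for bootstrap in dml_plr_fixture['boot_methods']:
        assert np.allclose(dml_plr_fixture['boot_coef' + bootstrap],
                           dml_plr_fixture['boot_coef' + bootstrap + '_manual'],
                           rtol=1e-9, atol=1e-4)
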
def dml_pliv_multiway_cluster_old_vs_new_fixture(generate_data_iv, learner):
    n_folds = 3
    dml_procedure = 'dml1'  # same results are only obtained for dml1

    np.random.seed(3141)
    smpl_sizes = [N, M]
    obj_dml_multiway_resampling = DoubleMLMultiwayResampling(
        n_folds, smpl_sizes)
    _, smpls_lin_ind = obj_dml_multiway_resampling.split_samples()

    # Set machine learning methods for g, m & r
    ml_g = clone(learner)
    ml_m = clone(learner)
    ml_r = clone(learner)

    df = obj_dml_cluster_data.data.set_index(
        ['cluster_var_i', 'cluster_var_j'])
    obj_dml_data = dml.DoubleMLData(df,
                                    y_col=obj_dml_cluster_data.y_col,
                                    d_cols=obj_dml_cluster_data.d_cols,
                                    x_cols=obj_dml_cluster_data.x_cols,
                                    z_cols=obj_dml_cluster_data.z_cols)

    dml_pliv_obj = dml.DoubleMLPLIV(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    dml_procedure=dml_procedure,
                                    draw_sample_splitting=False)
    dml_pliv_obj.set_sample_splitting(smpls_lin_ind)

    dml_pliv_obj.fit()

    np.random.seed(3141)
    dml_pliv_obj_cluster = dml.DoubleMLPLIV(obj_dml_cluster_data,
                                            ml_g,
                                            ml_m,
                                            ml_r,
                                            n_folds,
                                            dml_procedure=dml_procedure)
    dml_pliv_obj_cluster.fit()

    res_dict = {
        'coef': dml_pliv_obj.coef,
        'coef_manual': dml_pliv_obj_cluster.coef
    }

    return res_dict
def dml_plr_cluster_with_index(generate_data1, learner, dml_procedure):
    # in the one-way cluster case with exactly one observation per cluster, we get the same result w & w/o clustering
    n_folds = 2

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    np.random.seed(3141)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  dml_procedure=dml_procedure)
    dml_plr_obj.fit()

    df = data.reset_index()
    dml_cluster_data = dml.DoubleMLClusterData(df,
                                               y_col='y',
                                               d_cols='d',
                                               x_cols=x_cols,
                                               cluster_cols='index')
    np.random.seed(3141)
    dml_plr_cluster_obj = dml.DoubleMLPLR(dml_cluster_data,
                                          ml_g,
                                          ml_m,
                                          n_folds,
                                          dml_procedure=dml_procedure)
    dml_plr_cluster_obj.fit()

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': dml_plr_cluster_obj.coef,
        'se': dml_plr_obj.se,
        'se_manual': dml_plr_cluster_obj.se
    }

    return res_dict
Example #4
def dml_irm_pyvsr_fixture(generate_data_irm, idx, score, dml_procedure):
    n_folds = 2

    # collect data
    (X, y, d) = generate_data_irm[idx]
    x_cols = [f'X{i + 1}' for i in np.arange(X.shape[1])]
    data = pd.DataFrame(np.column_stack((X, y, d)),
                        columns=x_cols + ['y', 'd'])

    # Set machine learning methods for m & g
    learner_classif = LogisticRegression(penalty='none', solver='newton-cg')
    learner_reg = LinearRegression()
    ml_g = clone(learner_reg)
    ml_m = clone(learner_classif)

    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_irm_obj = dml.DoubleMLIRM(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    np.random.seed(3141)
    dml_irm_obj.fit()

    # fit the DML model in R
    all_train, all_test = export_smpl_split_to_r(dml_irm_obj.smpls[0])

    r_dataframe = pandas2ri.py2rpy(data)
    res_r = r_IRM(r_dataframe, score, dml_procedure, all_train, all_test)

    res_dict = {
        'coef_py': dml_irm_obj.coef,
        'coef_r': res_r[0],
        'se_py': dml_irm_obj.se,
        'se_r': res_r[1]
    }

    return res_dict
Example #5
def dml_iivm_pyvsr_fixture(generate_data_iivm, idx, score, dml_procedure):
    boot_methods = ['normal']
    n_folds = 2

    # collect data
    data = generate_data_iivm[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for g, m & r
    learner_classif = LogisticRegression(penalty='none', solver='newton-cg')
    learner_reg = LinearRegression()
    ml_g = clone(learner_reg)
    ml_m = clone(learner_classif)
    ml_r = clone(learner_classif)

    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols, 'z')
    dml_iivm_obj = dml.DoubleMLIIVM(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    dml_procedure=dml_procedure)

    np.random.seed(3141)
    dml_iivm_obj.fit()

    # fit the DML model in R
    all_train, all_test = export_smpl_split_to_r(dml_iivm_obj.smpls[0])

    r_dataframe = pandas2ri.py2rpy(data)
    res_r = r_IIVM(r_dataframe, score, dml_procedure, all_train, all_test)

    res_dict = {
        'coef_py': dml_iivm_obj.coef,
        'coef_r': res_r[0],
        'se_py': dml_iivm_obj.se,
        'se_r': res_r[1]
    }

    return res_dict
def dml_plr_reestimate_fixture(generate_data1, learner, score, dml_procedure, n_rep):
    n_folds = 3

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g, ml_m,
                                  n_folds,
                                  n_rep,
                                  score,
                                  dml_procedure)
    dml_plr_obj.fit()

    np.random.seed(3141)
    dml_plr_obj2 = dml.DoubleMLPLR(obj_dml_data,
                                   ml_g, ml_m,
                                   n_folds,
                                   n_rep,
                                   score,
                                   dml_procedure)
    dml_plr_obj2.fit()
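    # overwrite the stored estimates with NaN and re-run the estimation step;
    # the comparison below checks that this reproduces the original fit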
    dml_plr_obj2._coef[0] = np.nan
    dml_plr_obj2._se[0] = np.nan
    dml_plr_obj2._est_causal_pars_and_se()

    res_dict = {'coef': dml_plr_obj.coef,
                'coef2': dml_plr_obj2.coef,
                'se': dml_plr_obj.se,
                'se2': dml_plr_obj2.se}

    return res_dict
Example #7
def dml_plr_pyvsr_fixture(generate_data1, idx, score, dml_procedure):
    n_folds = 2
    n_rep_boot = 483

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    learner = LinearRegression()
    ml_g = clone(learner)
    ml_m = clone(learner)

    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    #np.random.seed(3141)
    dml_plr_obj.fit()

    # fit the DML model in R
    all_train, all_test = export_smpl_split_to_r(dml_plr_obj.smpls[0])

    r_dataframe = pandas2ri.py2rpy(data)
    res_r = r_MLPLR(r_dataframe, score, dml_procedure, all_train, all_test)

    res_dict = {
        'coef_py': dml_plr_obj.coef,
        'coef_r': res_r[0],
        'se_py': dml_plr_obj.se,
        'se_r': res_r[1]
    }

    return res_dict
Example #8
def dml_plr_smpls_fixture(generate_data1, learner, score, dml_procedure,
                          n_rep):
    n_folds = 3

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m, n_folds, n_rep,
                                  score, dml_procedure)

    dml_plr_obj.fit()

    smpls = dml_plr_obj.smpls

    dml_plr_obj2 = dml.DoubleMLPLR(obj_dml_data,
                                   ml_g,
                                   ml_m,
                                   score=score,
                                   dml_procedure=dml_procedure,
                                   draw_sample_splitting=False)
    dml_plr_obj2.set_sample_splitting(smpls)
    dml_plr_obj2.fit()

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef2': dml_plr_obj2.coef,
        'se': dml_plr_obj.se,
        'se2': dml_plr_obj2.se
    }

    return res_dict
def dml_plr_no_cross_fit_fixture(generate_data1, idx, learner, score, n_folds):
    boot_methods = ['normal']
    n_rep_boot = 502
    dml_procedure = 'dml1'

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure,
                                  apply_cross_fitting=False)

    dml_plr_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    X = data.loc[:, X_cols].values
    d = data['d'].values
    if n_folds == 1:
        smpls = [(np.arange(len(y)), np.arange(len(y)))]
    else:
        resampling = KFold(n_splits=n_folds, shuffle=True)
        smpls = [(train, test) for train, test in resampling.split(X)]
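        # keep only the first (train, test) pair: a single split, i.e. no cross-fitting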
        smpls = [smpls[0]]

    g_hat, m_hat = fit_nuisance_plr(y, X, d, clone(learner), clone(learner),
                                    smpls)

    assert dml_procedure == 'dml1'
    res_manual, se_manual = plr_dml1(y, X, d, g_hat, m_hat, smpls, score)

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': res_manual,
        'se': dml_plr_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(res_manual,
                                           y,
                                           d,
                                           g_hat,
                                           m_hat,
                                           smpls,
                                           score,
                                           se_manual,
                                           bootstrap,
                                           n_rep_boot,
                                           dml_procedure,
                                           apply_cross_fitting=False)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_no_cross_fit_tune_fixture(generate_data1, idx, learner, score,
                                      tune_on_folds):
    par_grid = {
        'ml_g': {
            'alpha': np.linspace(0.05, .95, 7)
        },
        'ml_m': {
            'alpha': np.linspace(0.05, .95, 7)
        }
    }
    n_folds_tune = 3

    boot_methods = ['normal']
    n_rep_boot = 502
    dml_procedure = 'dml1'

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = Lasso()
    ml_m = Lasso()

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds=2,
                                  score=score,
                                  dml_procedure=dml_procedure,
                                  apply_cross_fitting=False)

    # tune hyperparameters
    _ = dml_plr_obj.tune(par_grid,
                         tune_on_folds=tune_on_folds,
                         n_folds_tune=n_folds_tune)

    # fit with tuned parameters
    dml_plr_obj.fit()

    np.random.seed(3141)
    y = obj_dml_data.y
    X = obj_dml_data.x
    d = obj_dml_data.d

    resampling = KFold(n_splits=2, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(X)]
    smpls = [smpls[0]]

    if tune_on_folds:
        g_params, m_params = tune_nuisance_plr(y, X, d, clone(ml_m),
                                               clone(ml_g), smpls,
                                               n_folds_tune, par_grid['ml_g'],
                                               par_grid['ml_m'])

        g_hat, m_hat = fit_nuisance_plr(y, X, d, clone(ml_m), clone(ml_g),
                                        smpls, g_params, m_params)
    else:
        xx = [(np.arange(len(y)), np.array([]))]
        g_params, m_params = tune_nuisance_plr(y, X, d, clone(ml_m),
                                               clone(ml_g), xx, n_folds_tune,
                                               par_grid['ml_g'],
                                               par_grid['ml_m'])

        g_hat, m_hat = fit_nuisance_plr(y, X, d, clone(ml_m), clone(ml_g),
                                        smpls, g_params, m_params)

    assert dml_procedure == 'dml1'
    res_manual, se_manual = plr_dml1(y, X, d, g_hat, m_hat, smpls, score)

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': res_manual,
        'se': dml_plr_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(res_manual,
                                           y,
                                           d,
                                           g_hat,
                                           m_hat,
                                           smpls,
                                           score,
                                           se_manual,
                                           bootstrap,
                                           n_rep_boot,
                                           dml_procedure,
                                           apply_cross_fitting=False)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_rep_no_cross_fit_fixture(generate_data1, idx, learner, score,
                                     n_rep):
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 498
    dml_procedure = 'dml1'

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  n_rep,
                                  score,
                                  dml_procedure,
                                  apply_cross_fitting=False)

    dml_plr_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    X = data.loc[:, X_cols].values
    d = data['d'].values
    all_smpls = []
    for i_rep in range(n_rep):
        resampling = KFold(n_splits=n_folds, shuffle=True)
        smpls = [(train, test) for train, test in resampling.split(X)]
        all_smpls.append(smpls)

    # adapt to do no-cross-fitting in each repetition
    all_smpls = [[xx[0]] for xx in all_smpls]

    thetas = np.zeros(n_rep)
    ses = np.zeros(n_rep)
    all_g_hat = list()
    all_m_hat = list()
    for i_rep in range(n_rep):
        smpls = all_smpls[i_rep]

        g_hat, m_hat = fit_nuisance_plr(y, X, d, clone(learner),
                                        clone(learner), smpls)

        all_g_hat.append(g_hat)
        all_m_hat.append(m_hat)

        thetas[i_rep], ses[i_rep] = plr_dml1(y, X, d, all_g_hat[i_rep],
                                             all_m_hat[i_rep], smpls, score)

    res_manual = np.median(thetas)
    se_manual = np.sqrt(
        np.median(
            np.power(ses, 2) * len(smpls[0][1]) +
            np.power(thetas - res_manual, 2)) / len(smpls[0][1]))

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': res_manual,
        'se': dml_plr_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        all_boot_theta = list()
        all_boot_t_stat = list()
        for i_rep in range(n_rep):
            smpls = all_smpls[i_rep]
            boot_theta, boot_t_stat = boot_plr(thetas[i_rep],
                                               y,
                                               d,
                                               all_g_hat[i_rep],
                                               all_m_hat[i_rep],
                                               smpls,
                                               score,
                                               ses[i_rep],
                                               bootstrap,
                                               n_rep_boot,
                                               dml_procedure,
                                               apply_cross_fitting=False)
            all_boot_theta.append(boot_theta)
            all_boot_t_stat.append(boot_t_stat)

        boot_theta = np.hstack(all_boot_theta)
        boot_t_stat = np.hstack(all_boot_t_stat)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
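
The median-based aggregation of the per-repetition estimates above (and again in the repeated cross-fitting fixture further below, where the full sample size is used instead of the test-set size) can be written as a small standalone helper; the function name is illustrative and not part of the test suite:

import numpy as np


def agg_median(thetas, ses, n_obs):
    # point estimate: median over the repetitions
    theta = np.median(thetas)
    # standard error: median of per-repetition variances (scaled by n_obs)
    # plus squared deviations from the aggregated point estimate
    se = np.sqrt(
        np.median(
            np.power(ses, 2) * n_obs +
            np.power(thetas - theta, 2)) / n_obs)
    return theta, se
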
Example #12
def dml_pliv_fixture(generate_data_iv, learner, score, dml_procedure):
    boot_methods = ['Bayes', 'normal', 'wild']
    n_folds = 2
    n_rep_boot = 503

    # collect data
    data = generate_data_iv
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for g, m & r
    ml_g = clone(learner)
    ml_m = clone(learner)
    ml_r = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols, 'Z1')
    dml_pliv_obj = dml.DoubleMLPLIV(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    dml_procedure=dml_procedure)

    dml_pliv_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    z = data['Z1'].values
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(x)]

    g_hat, m_hat, r_hat = fit_nuisance_pliv(y, x, d, z, clone(learner),
                                            clone(learner), clone(learner),
                                            smpls)

    if dml_procedure == 'dml1':
        res_manual, se_manual = pliv_dml1(y, x, d, z, g_hat, m_hat, r_hat,
                                          smpls, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = pliv_dml2(y, x, d, z, g_hat, m_hat, r_hat,
                                          smpls, score)

    res_dict = {
        'coef': dml_pliv_obj.coef,
        'coef_manual': res_manual,
        'se': dml_pliv_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_pliv(res_manual, y, d, z, g_hat, m_hat,
                                            r_hat, smpls, score, se_manual,
                                            bootstrap, n_rep_boot,
                                            dml_procedure)

        np.random.seed(3141)
        dml_pliv_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_pliv_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_pliv_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_fixture(generate_data1, learner, score, dml_procedure, n_rep):
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 498

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g, ml_m,
                                  n_folds,
                                  n_rep,
                                  score,
                                  dml_procedure)

    dml_plr_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    n_obs = len(y)
    all_smpls = []
    for i_rep in range(n_rep):
        resampling = KFold(n_splits=n_folds,
                           shuffle=True)
        smpls = [(train, test) for train, test in resampling.split(x)]
        all_smpls.append(smpls)

    thetas = np.zeros(n_rep)
    ses = np.zeros(n_rep)
    all_g_hat = list()
    all_m_hat = list()
    for i_rep in range(n_rep):
        smpls = all_smpls[i_rep]

        g_hat, m_hat = fit_nuisance_plr(y, x, d,
                                        clone(learner), clone(learner), smpls)

        all_g_hat.append(g_hat)
        all_m_hat.append(m_hat)

        if dml_procedure == 'dml1':
            thetas[i_rep], ses[i_rep] = plr_dml1(y, x, d,
                                                 all_g_hat[i_rep], all_m_hat[i_rep],
                                                 smpls, score)
        else:
            assert dml_procedure == 'dml2'
            thetas[i_rep], ses[i_rep] = plr_dml2(y, x, d,
                                                 all_g_hat[i_rep], all_m_hat[i_rep],
                                                 smpls, score)

    res_manual = np.median(thetas)
    se_manual = np.sqrt(
        np.median(
            np.power(ses, 2) * n_obs +
            np.power(thetas - res_manual, 2)) / n_obs)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods
                }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        all_boot_theta = list()
        all_boot_t_stat = list()
        for i_rep in range(n_rep):
            smpls = all_smpls[i_rep]
            boot_theta, boot_t_stat = boot_plr(thetas[i_rep],
                                               y, d,
                                               all_g_hat[i_rep], all_m_hat[i_rep],
                                               smpls, score,
                                               ses[i_rep],
                                               bootstrap, n_rep_boot,
                                               dml_procedure)
            all_boot_theta.append(boot_theta)
            all_boot_t_stat.append(boot_t_stat)

        boot_theta = np.hstack(all_boot_theta)
        boot_t_stat = np.hstack(all_boot_t_stat)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
Example #14
def dml_iivm_fixture(generate_data_iivm, learner_g, learner_m, learner_r,
                     score, dml_procedure, tune_on_folds):
    par_grid = {
        'ml_g': get_par_grid(learner_g),
        'ml_m': get_par_grid(learner_m),
        'ml_r': get_par_grid(learner_r)
    }
    n_folds_tune = 4

    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 491

    # collect data
    data = generate_data_iivm
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m, g & r
    ml_g = clone(learner_g)
    ml_m = clone(learner_m)
    ml_r = clone(learner_r)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols, 'z')
    dml_iivm_obj = dml.DoubleMLIIVM(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    dml_procedure=dml_procedure)
    # tune hyperparameters
    _ = dml_iivm_obj.tune(par_grid,
                          tune_on_folds=tune_on_folds,
                          n_folds_tune=n_folds_tune)

    dml_iivm_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    z = data['z'].values
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(x)]

    if tune_on_folds:
        g0_params, g1_params, m_params,  r0_params, r1_params = \
            tune_nuisance_iivm(y, x, d, z,
                               clone(learner_m), clone(learner_g), clone(learner_r), smpls,
                               n_folds_tune,
                               par_grid['ml_g'], par_grid['ml_m'], par_grid['ml_r'])

        g_hat0, g_hat1, m_hat, r_hat0, r_hat1 = \
            fit_nuisance_iivm(y, x, d, z,
                              clone(learner_m), clone(learner_g), clone(learner_r), smpls,
                              g0_params, g1_params, m_params,  r0_params, r1_params)
    else:
        xx = [(np.arange(data.shape[0]), np.array([]))]
        g0_params, g1_params, m_params,  r0_params, r1_params = \
            tune_nuisance_iivm(y, x, d, z,
                               clone(learner_m), clone(learner_g), clone(learner_r), xx,
                               n_folds_tune,
                               par_grid['ml_g'], par_grid['ml_m'], par_grid['ml_r'])

        g_hat0, g_hat1, m_hat, r_hat0, r_hat1 = \
            fit_nuisance_iivm(y, x, d, z,
                              clone(learner_m), clone(learner_g), clone(learner_r), smpls,
                              g0_params * n_folds, g1_params * n_folds, m_params * n_folds,
                              r0_params * n_folds, r1_params * n_folds)

    if dml_procedure == 'dml1':
        res_manual, se_manual = iivm_dml1(y, x, d, z, g_hat0, g_hat1, m_hat,
                                          r_hat0, r_hat1, smpls, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = iivm_dml2(y, x, d, z, g_hat0, g_hat1, m_hat,
                                          r_hat0, r_hat1, smpls, score)

    res_dict = {
        'coef': dml_iivm_obj.coef,
        'coef_manual': res_manual,
        'se': dml_iivm_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_iivm(res_manual, y, d, z, g_hat0,
                                            g_hat1, m_hat, r_hat0, r_hat1,
                                            smpls, score, se_manual, bootstrap,
                                            n_rep_boot, dml_procedure)

        np.random.seed(3141)
        dml_iivm_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_iivm_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_iivm_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
Example #15
def dml_iivm_fixture(generate_data_iivm, learner, score, dml_procedure,
                     trimming_threshold):
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 491

    # collect data
    data = generate_data_iivm
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for g, m & r
    ml_g = clone(learner[0])
    ml_m = clone(learner[1])
    ml_r = clone(learner[1])

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols, 'z')
    dml_iivm_obj = dml.DoubleMLIIVM(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    dml_procedure=dml_procedure,
                                    trimming_threshold=trimming_threshold)

    dml_iivm_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    z = data['z'].values
    n_obs = len(y)
    all_smpls = draw_smpls(n_obs, n_folds)

    res_manual = fit_iivm(y,
                          x,
                          d,
                          z,
                          clone(learner[0]),
                          clone(learner[1]),
                          clone(learner[1]),
                          all_smpls,
                          dml_procedure,
                          score,
                          trimming_threshold=trimming_threshold)

    res_dict = {
        'coef': dml_iivm_obj.coef,
        'coef_manual': res_manual['theta'],
        'se': dml_iivm_obj.se,
        'se_manual': res_manual['se'],
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_iivm(
            y, d, z, res_manual['thetas'], res_manual['ses'],
            res_manual['all_g_hat0'], res_manual['all_g_hat1'],
            res_manual['all_m_hat'], res_manual['all_r_hat0'],
            res_manual['all_r_hat1'], all_smpls, score, bootstrap, n_rep_boot)

        np.random.seed(3141)
        dml_iivm_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_iivm_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_iivm_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
Example #16
def dml_iivm_fixture(generate_data_iivm, learner, score, dml_procedure,
                     trimming_threshold):
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 491

    # collect data
    data = generate_data_iivm
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for g, m & r
    ml_g = clone(learner[1])
    ml_m = clone(learner[0])
    ml_r = clone(learner[0])

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols, 'z')
    dml_iivm_obj = dml.DoubleMLIIVM(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    dml_procedure=dml_procedure,
                                    trimming_threshold=trimming_threshold)

    dml_iivm_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    z = data['z'].values
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(x)]

    g_hat0, g_hat1, m_hat, r_hat0, r_hat1 = fit_nuisance_iivm(
        y,
        x,
        d,
        z,
        clone(learner[0]),
        clone(learner[1]),
        clone(learner[0]),
        smpls,
        trimming_threshold=trimming_threshold)

    if dml_procedure == 'dml1':
        res_manual, se_manual = iivm_dml1(y, x, d, z, g_hat0, g_hat1, m_hat,
                                          r_hat0, r_hat1, smpls, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = iivm_dml2(y, x, d, z, g_hat0, g_hat1, m_hat,
                                          r_hat0, r_hat1, smpls, score)

    res_dict = {
        'coef': dml_iivm_obj.coef,
        'coef_manual': res_manual,
        'se': dml_iivm_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_iivm(res_manual, y, d, z, g_hat0,
                                            g_hat1, m_hat, r_hat0, r_hat1,
                                            smpls, score, se_manual, bootstrap,
                                            n_rep_boot, dml_procedure)

        np.random.seed(3141)
        dml_iivm_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_iivm_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_iivm_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
Example #17
def dml_plr_ols_manual_fixture(generate_data1, score, dml_procedure):
    learner = LinearRegression()
    boot_methods = ['Bayes', 'normal', 'wild']
    n_folds = 2
    n_rep_boot = 501

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    n = data.shape[0]
    this_smpl = list()
    xx = int(n / 2)
    this_smpl.append((np.arange(xx, n), np.arange(0, xx)))
    this_smpl.append((np.arange(0, xx), np.arange(xx, n)))
    smpls = [this_smpl]
    dml_plr_obj.set_sample_splitting(smpls)

    dml_plr_obj.fit()

    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values

    # add column of ones for intercept
    o = np.ones((n, 1))
    x = np.append(x, o, axis=1)

    smpls = dml_plr_obj.smpls[0]

    g_hat = []
    for (train_index, test_index) in smpls:
        ols_est = scipy.linalg.lstsq(x[train_index], y[train_index])[0]
        g_hat.append(np.dot(x[test_index], ols_est))

    m_hat = []
    for (train_index, test_index) in smpls:
        ols_est = scipy.linalg.lstsq(x[train_index], d[train_index])[0]
        m_hat.append(np.dot(x[test_index], ols_est))

    if dml_procedure == 'dml1':
        res_manual, se_manual = plr_dml1(y, x, d, g_hat, m_hat, smpls, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = plr_dml2(y, x, d, g_hat, m_hat, smpls, score)

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': res_manual,
        'se': dml_plr_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(y, d, [res_manual], [se_manual],
                                           [g_hat], [m_hat], [smpls], score,
                                           bootstrap, n_rep_boot)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
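
The two loops above that compute out-of-fold OLS predictions for y and d differ only in the target variable. A small helper capturing the shared logic could look as follows (the function name is illustrative; x is assumed to already contain the intercept column added above):

import numpy as np
import scipy.linalg


def cross_fitted_ols_predictions(x, target, smpls):
    # fit OLS on each training fold via least squares and predict on the held-out fold
    preds = []
    for train_index, test_index in smpls:
        ols_est = scipy.linalg.lstsq(x[train_index], target[train_index])[0]
        preds.append(np.dot(x[test_index], ols_est))
    return preds

With this helper, g_hat = cross_fitted_ols_predictions(x, y, smpls) and m_hat = cross_fitted_ols_predictions(x, d, smpls) would reproduce the lists constructed above.
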
def dml_pliv_partial_z_fixture(generate_data_pliv_partialZ, learner_r, score,
                               dml_procedure, tune_on_folds):
    par_grid = {'ml_r': get_par_grid(learner_r)}
    n_folds_tune = 4

    boot_methods = ['Bayes', 'normal', 'wild']
    n_folds = 2
    n_rep_boot = 503

    # collect data
    data = generate_data_pliv_partialZ
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()
    z_cols = data.columns[data.columns.str.startswith('Z')].tolist()

    # Set machine learning methods for r
    ml_r = clone(learner_r)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols, z_cols)
    dml_pliv_obj = dml.DoubleMLPLIV._partialZ(obj_dml_data,
                                              ml_r,
                                              n_folds,
                                              dml_procedure=dml_procedure)

    # tune hyperparameters
    _ = dml_pliv_obj.tune(par_grid,
                          tune_on_folds=tune_on_folds,
                          n_folds_tune=n_folds_tune)

    dml_pliv_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    z = data.loc[:, z_cols].values
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(x)]

    if tune_on_folds:
        r_params = tune_nuisance_pliv_partial_z(y, x, d, z, clone(learner_r),
                                                smpls, n_folds_tune,
                                                par_grid['ml_r'])

        r_hat = fit_nuisance_pliv_partial_z(y, x, d, z, clone(learner_r),
                                            smpls, r_params)
    else:
        xx = [(np.arange(len(y)), np.array([]))]
        r_params = tune_nuisance_pliv_partial_z(y, x, d, z, clone(learner_r),
                                                xx, n_folds_tune,
                                                par_grid['ml_r'])

        r_hat = fit_nuisance_pliv_partial_z(y, x, d, z, clone(learner_r),
                                            smpls, r_params * n_folds)

    if dml_procedure == 'dml1':
        res_manual, se_manual = pliv_partial_z_dml1(y, x, d, z, r_hat, smpls,
                                                    score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = pliv_partial_z_dml2(y, x, d, z, r_hat, smpls,
                                                    score)

    res_dict = {
        'coef': dml_pliv_obj.coef,
        'coef_manual': res_manual,
        'se': dml_pliv_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_pliv_partial_z(res_manual, y, d, z,
                                                      r_hat, smpls, score,
                                                      se_manual, bootstrap,
                                                      n_rep_boot,
                                                      dml_procedure)

        np.random.seed(3141)
        dml_pliv_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_pliv_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_pliv_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
Example #19
def dml_pliv_no_cross_fit_fixture(generate_data_iv, learner, score, n_folds):
    boot_methods = ['normal']
    n_rep_boot = 503
    dml_procedure = 'dml1'

    # collect data
    data = generate_data_iv
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for g, m & r
    ml_g = clone(learner)
    ml_m = clone(learner)
    ml_r = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols, 'Z1')
    dml_pliv_obj = dml.DoubleMLPLIV(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    dml_procedure=dml_procedure,
                                    apply_cross_fitting=False)

    dml_pliv_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    z = data['Z1'].values
    if n_folds == 1:
        smpls = [(np.arange(len(y)), np.arange(len(y)))]
    else:
        n_obs = len(y)
        all_smpls = draw_smpls(n_obs, n_folds)
        smpls = all_smpls[0]
        smpls = [smpls[0]]

    res_manual = fit_pliv(y, x, d, z, clone(learner), clone(learner),
                          clone(learner), [smpls], dml_procedure, score)

    res_dict = {
        'coef': dml_pliv_obj.coef,
        'coef_manual': res_manual['theta'],
        'se': dml_pliv_obj.se,
        'se_manual': res_manual['se'],
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_pliv(y,
                                            d,
                                            z,
                                            res_manual['thetas'],
                                            res_manual['ses'],
                                            res_manual['all_g_hat'],
                                            res_manual['all_m_hat'],
                                            res_manual['all_r_hat'], [smpls],
                                            score,
                                            bootstrap,
                                            n_rep_boot,
                                            apply_cross_fitting=False)

        np.random.seed(3141)
        dml_pliv_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_pliv_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_pliv_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_fixture(generate_data1, idx, score, dml_procedure):
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 502

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    alpha = 0.05
    learner = Lasso(alpha=alpha)
    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'])
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    dml_plr_obj.fit()

    np.random.seed(3141)
    learner = Lasso()
    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    dml_plr_obj_ext_set_par = dml.DoubleMLPLR(obj_dml_data,
                                              ml_g,
                                              ml_m,
                                              n_folds,
                                              score=score,
                                              dml_procedure=dml_procedure)
    dml_plr_obj_ext_set_par.set_ml_nuisance_params('ml_g', 'd',
                                                   {'alpha': alpha})
    dml_plr_obj_ext_set_par.set_ml_nuisance_params('ml_m', 'd',
                                                   {'alpha': alpha})
    dml_plr_obj_ext_set_par.fit()

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': dml_plr_obj_ext_set_par.coef,
        'se': dml_plr_obj.se,
        'se_manual': dml_plr_obj_ext_set_par.se,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(314122)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat

        np.random.seed(314122)
        dml_plr_obj_ext_set_par.bootstrap(method=bootstrap,
                                          n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap +
                 '_manual'] = dml_plr_obj_ext_set_par.boot_coef
        res_dict['boot_t_stat' + bootstrap +
                 '_manual'] = dml_plr_obj_ext_set_par.boot_t_stat

    return res_dict
def dml_plr_multitreat_fixture(generate_data_bivariate, generate_data_toeplitz,
                               idx, learner, score, dml_procedure):
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 483

    # collect data
    if idx < n_datasets:
        data = generate_data_bivariate[idx]
    else:
        data = generate_data_toeplitz[idx-n_datasets]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()
    d_cols = data.columns[data.columns.str.startswith('d')].tolist()
    
    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', d_cols, X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    dml_plr_obj.fit()
    
    np.random.seed(3141)
    y = data['y'].values
    X = data.loc[:, X_cols].values
    d = data.loc[:, d_cols].values
    resampling = KFold(n_splits=n_folds,
                       shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(X)]
    
    n_d = d.shape[1]
    
    coef_manual = np.full(n_d, np.nan)
    se_manual = np.full(n_d, np.nan)
    
    all_g_hat = []
    all_m_hat = []
    
    for i_d in range(n_d):
        
        Xd = np.hstack((X, np.delete(d, i_d, axis=1)))
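        # the remaining treatment columns are used as additional controls for treatment i_d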
        
        g_hat, m_hat = fit_nuisance_plr(y, Xd, d[:, i_d],
                                        clone(learner), clone(learner), smpls)
        
        all_g_hat.append(g_hat)
        all_m_hat.append(m_hat)
        
        if dml_procedure == 'dml1':
            coef_manual[i_d], se_manual[i_d] = plr_dml1(y, Xd, d[:, i_d],
                                                        g_hat, m_hat,
                                                        smpls, score)
        elif dml_procedure == 'dml2':
            coef_manual[i_d], se_manual[i_d] = plr_dml2(y, Xd, d[:, i_d],
                                                        g_hat, m_hat,
                                                        smpls, score)
                   
    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': coef_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(coef_manual,
                                           y, d,
                                           all_g_hat, all_m_hat,
                                           smpls, score,
                                           se_manual,
                                           bootstrap, n_rep_boot,
                                           dml_procedure)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat
    
    return res_dict
Example #22
def dml_pliv_fixture(generate_data_iv, learner_g, learner_m, learner_r, score,
                     dml_procedure, tune_on_folds):
    par_grid = {
        'ml_g': get_par_grid(learner_g),
        'ml_m': get_par_grid(learner_m),
        'ml_r': get_par_grid(learner_r)
    }
    n_folds_tune = 4

    boot_methods = ['Bayes', 'normal', 'wild']
    n_folds = 2
    n_rep_boot = 503

    # collect data
    data = generate_data_iv
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for g, m & r
    ml_g = clone(learner_g)
    ml_m = clone(learner_m)
    ml_r = clone(learner_r)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols, 'Z1')
    dml_pliv_obj = dml.DoubleMLPLIV(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    dml_procedure=dml_procedure)

    # tune hyperparameters
    _ = dml_pliv_obj.tune(par_grid,
                          tune_on_folds=tune_on_folds,
                          n_folds_tune=n_folds_tune)

    dml_pliv_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    z = data['Z1'].values
    n_obs = len(y)
    all_smpls = draw_smpls(n_obs, n_folds)
    smpls = all_smpls[0]

    if tune_on_folds:
        g_params, m_params, r_params = tune_nuisance_pliv(
            y, x, d, z, clone(learner_g), clone(learner_m), clone(learner_r),
            smpls, n_folds_tune, par_grid['ml_g'], par_grid['ml_m'],
            par_grid['ml_r'])
    else:
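        # without per-fold tuning: tune once on the full sample (empty test set)
        # and reuse the selected parameters for every fold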
        xx = [(np.arange(len(y)), np.array([]))]
        g_params, m_params, r_params = tune_nuisance_pliv(
            y, x, d, z, clone(learner_g), clone(learner_m), clone(learner_r),
            xx, n_folds_tune, par_grid['ml_g'], par_grid['ml_m'],
            par_grid['ml_r'])
        g_params = g_params * n_folds
        m_params = m_params * n_folds
        r_params = r_params * n_folds

    res_manual = fit_pliv(y,
                          x,
                          d,
                          z,
                          clone(learner_g),
                          clone(learner_m),
                          clone(learner_r),
                          all_smpls,
                          dml_procedure,
                          score,
                          g_params=g_params,
                          m_params=m_params,
                          r_params=r_params)

    res_dict = {
        'coef': dml_pliv_obj.coef,
        'coef_manual': res_manual['theta'],
        'se': dml_pliv_obj.se,
        'se_manual': res_manual['se'],
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_pliv(y, d, z, res_manual['thetas'],
                                            res_manual['ses'],
                                            res_manual['all_g_hat'],
                                            res_manual['all_m_hat'],
                                            res_manual['all_r_hat'], all_smpls,
                                            score, bootstrap, n_rep_boot)

        np.random.seed(3141)
        dml_pliv_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_pliv_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_pliv_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
Example #23
def dml_plr_no_cross_fit_tune_fixture(generate_data1, learner, score,
                                      tune_on_folds):
    par_grid = {
        'ml_g': {
            'alpha': np.linspace(0.05, .95, 7)
        },
        'ml_m': {
            'alpha': np.linspace(0.05, .95, 7)
        }
    }
    n_folds_tune = 3

    boot_methods = ['normal']
    n_rep_boot = 502
    dml_procedure = 'dml1'

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = Lasso()
    ml_m = Lasso()

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds=2,
                                  score=score,
                                  dml_procedure=dml_procedure,
                                  apply_cross_fitting=False)

    # tune hyperparameters
    _ = dml_plr_obj.tune(par_grid,
                         tune_on_folds=tune_on_folds,
                         n_folds_tune=n_folds_tune)

    # fit with tuned parameters
    dml_plr_obj.fit()

    np.random.seed(3141)
    y = obj_dml_data.y
    x = obj_dml_data.x
    d = obj_dml_data.d
    n_obs = len(y)

    all_smpls = draw_smpls(n_obs, 2)
    smpls = all_smpls[0]
    smpls = [smpls[0]]

    if tune_on_folds:
        g_params, m_params = tune_nuisance_plr(y, x, d, clone(ml_g),
                                               clone(ml_m), smpls,
                                               n_folds_tune, par_grid['ml_g'],
                                               par_grid['ml_m'])
    else:
        xx = [(np.arange(len(y)), np.array([]))]
        g_params, m_params = tune_nuisance_plr(y, x, d, clone(ml_g),
                                               clone(ml_m), xx, n_folds_tune,
                                               par_grid['ml_g'],
                                               par_grid['ml_m'])

    res_manual = fit_plr(y,
                         x,
                         d,
                         clone(ml_m),
                         clone(ml_g), [smpls],
                         dml_procedure,
                         score,
                         g_params=g_params,
                         m_params=m_params)

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': res_manual['theta'],
        'se': dml_plr_obj.se,
        'se_manual': res_manual['se'],
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(y,
                                           d,
                                           res_manual['thetas'],
                                           res_manual['ses'],
                                           res_manual['all_g_hat'],
                                           res_manual['all_m_hat'], [smpls],
                                           score,
                                           bootstrap,
                                           n_rep_boot,
                                           apply_cross_fitting=False)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict