def dml_iivm_pyvsr_fixture(generate_data_iivm, idx, score, dml_procedure):
    """Fit the IIVM model in Python and in R on the same sample splits.

    Parameters
    ----------
    generate_data_iivm
        Indexable collection of data sets; the entry at ``idx`` is used. It
        must expose a pandas-style ``columns`` index. The variables are
        referenced by the names ``'y'``, ``'d'`` and ``'z'``; covariates are
        all columns whose name starts with ``'X'``.
    idx
        Position of the data set inside ``generate_data_iivm``.
    score
        Score handed to both the Python model and the R reference.
    dml_procedure
        DML procedure handed to both the Python model and the R reference.

    Returns
    -------
    dict
        ``coef_py`` / ``se_py`` read from the Python model and ``coef_r`` /
        ``se_r`` taken from the first two entries of the R result.
    """
    n_folds = 2

    # collect data
    data = generate_data_iivm[idx]
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for g, m & r
    # NOTE(review): newer scikit-learn releases spell the unpenalised fit
    # ``penalty=None`` -- confirm against the pinned scikit-learn version.
    learner_classif = LogisticRegression(penalty='none', solver='newton-cg')
    learner_reg = LinearRegression()
    ml_g = clone(learner_reg)
    ml_m = clone(learner_classif)
    ml_r = clone(learner_classif)

    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols, 'z')
    # Forward ``score`` so the Python model is built with the same score that
    # is sent to R below instead of silently using the library default.
    dml_iivm_obj = dml.DoubleMLIIVM(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    score=score,
                                    dml_procedure=dml_procedure)

    # NOTE(review): the seed is set after the model has been constructed; if
    # the constructor already draws the sample splitting, the folds are not
    # covered by this seed -- confirm.
    np.random.seed(3141)
    dml_iivm_obj.fit()

    # fit the DML model in R, re-using the sample splits of the Python model
    all_train, all_test = export_smpl_split_to_r(dml_iivm_obj.smpls[0])

    r_dataframe = pandas2ri.py2rpy(data)
    res_r = r_IIVM(r_dataframe, score, dml_procedure, all_train, all_test)

    return {
        'coef_py': dml_iivm_obj.coef,
        'coef_r': res_r[0],
        'se_py': dml_iivm_obj.se,
        'se_r': res_r[1],
    }
def dml_iivm_fixture(generate_data_iivm, learner_g, learner_m, learner_r,
                     score, dml_procedure, tune_on_folds):
    """Compare a tuned ``DoubleMLIIVM`` fit with a manual re-implementation.

    Parameters
    ----------
    generate_data_iivm
        Data set with a pandas-style ``columns`` index. The variables are
        referenced by the names ``'y'``, ``'d'`` and ``'z'``; covariates are
        all columns whose name starts with ``'X'``.
    learner_g, learner_m, learner_r
        Estimators that are cloned for the nuisance parts; each one is also
        passed to ``get_par_grid`` to obtain its tuning grid.
    score
        Score used by the model under test and by the manual computation.
    dml_procedure
        ``'dml1'`` or ``'dml2'``; any other value trips the assertion below.
    tune_on_folds
        If true, the manual tuning runs on the cross-fitting folds, otherwise
        once on a single split that uses all observations for training.

    Returns
    -------
    dict
        Coefficient, standard error and bootstrap results of the model
        (``coef``, ``se``, ``boot_coef<method>``, ``boot_t_stat<method>``),
        their ``*_manual`` counterparts and the list ``boot_methods``.
    """
    par_grid = {
        'ml_g': get_par_grid(learner_g),
        'ml_m': get_par_grid(learner_m),
        'ml_r': get_par_grid(learner_r)
    }
    n_folds_tune = 4

    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 491

    # collect data
    data = generate_data_iivm
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m, g & r
    ml_g = clone(learner_g)
    ml_m = clone(learner_m)
    ml_r = clone(learner_r)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols, 'z')
    # Forward ``score`` so the model under test uses the same score as the
    # manual computation instead of silently using the library default.
    dml_iivm_obj = dml.DoubleMLIIVM(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    score=score,
                                    dml_procedure=dml_procedure)
    # tune hyperparameters
    _ = dml_iivm_obj.tune(par_grid,
                          tune_on_folds=tune_on_folds,
                          n_folds_tune=n_folds_tune)

    dml_iivm_obj.fit()

    # Re-seed with the same value before drawing the manual folds;
    # presumably this makes them coincide with the folds of the model.
    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    z = data['z'].values
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(x)]

    if tune_on_folds:
        tune_smpls = smpls
    else:
        # single split: every observation is used for training, none for test
        tune_smpls = [(np.arange(data.shape[0]), np.array([]))]

    g0_params, g1_params, m_params, r0_params, r1_params = \
        tune_nuisance_iivm(y, x, d, z,
                           clone(learner_m), clone(learner_g), clone(learner_r),
                           tune_smpls,
                           n_folds_tune,
                           par_grid['ml_g'], par_grid['ml_m'], par_grid['ml_r'])

    if not tune_on_folds:
        # Repeat the tuned parameters n_folds times (presumably lists with
        # one entry each, so every fold receives the same parameter set).
        g0_params = g0_params * n_folds
        g1_params = g1_params * n_folds
        m_params = m_params * n_folds
        r0_params = r0_params * n_folds
        r1_params = r1_params * n_folds

    g_hat0, g_hat1, m_hat, r_hat0, r_hat1 = \
        fit_nuisance_iivm(y, x, d, z,
                          clone(learner_m), clone(learner_g), clone(learner_r),
                          smpls,
                          g0_params, g1_params, m_params, r0_params, r1_params)

    if dml_procedure == 'dml1':
        res_manual, se_manual = iivm_dml1(y, x, d, z, g_hat0, g_hat1, m_hat,
                                          r_hat0, r_hat1, smpls, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = iivm_dml2(y, x, d, z, g_hat0, g_hat1, m_hat,
                                          r_hat0, r_hat1, smpls, score)

    res_dict = {
        'coef': dml_iivm_obj.coef,
        'coef_manual': res_manual,
        'se': dml_iivm_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        # identical seed before the manual and the library bootstrap
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_iivm(res_manual, y, d, z, g_hat0,
                                            g_hat1, m_hat, r_hat0, r_hat1,
                                            smpls, score, se_manual, bootstrap,
                                            n_rep_boot, dml_procedure)

        np.random.seed(3141)
        dml_iivm_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_iivm_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_iivm_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
# Example #3
# 0
def dml_iivm_fixture(generate_data_iivm, learner, score, dml_procedure,
                     trimming_threshold):
    """Compare a ``DoubleMLIIVM`` fit with the manual ``fit_iivm`` helper.

    Parameters
    ----------
    generate_data_iivm
        Data set with a pandas-style ``columns`` index. The variables are
        referenced by the names ``'y'``, ``'d'`` and ``'z'``; covariates are
        all columns whose name starts with ``'X'``.
    learner
        Sequence of estimators: ``learner[0]`` is cloned for ``ml_g``,
        ``learner[1]`` for both ``ml_m`` and ``ml_r``.
    score
        Score used by the model under test and by the manual computation.
    dml_procedure
        DML procedure handed to the model and to ``fit_iivm``.
    trimming_threshold
        Trimming threshold handed to the model and to ``fit_iivm``.

    Returns
    -------
    dict
        Coefficient, standard error and bootstrap results of the model
        (``coef``, ``se``, ``boot_coef<method>``, ``boot_t_stat<method>``),
        their ``*_manual`` counterparts and the list ``boot_methods``.
    """
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 491

    # collect data
    data = generate_data_iivm
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for g, m & r
    ml_g = clone(learner[0])
    ml_m = clone(learner[1])
    ml_r = clone(learner[1])

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols, 'z')
    # Forward ``score`` so the model under test uses the same score as the
    # manual computation instead of silently using the library default.
    dml_iivm_obj = dml.DoubleMLIIVM(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    score=score,
                                    dml_procedure=dml_procedure,
                                    trimming_threshold=trimming_threshold)

    dml_iivm_obj.fit()

    # Re-seed with the same value before drawing the manual sample splits;
    # presumably this makes them coincide with the splits of the model.
    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    z = data['z'].values
    n_obs = len(y)
    all_smpls = draw_smpls(n_obs, n_folds)

    res_manual = fit_iivm(y, x, d, z,
                          clone(learner[0]),
                          clone(learner[1]),
                          clone(learner[1]),
                          all_smpls,
                          dml_procedure,
                          score,
                          trimming_threshold=trimming_threshold)

    res_dict = {
        'coef': dml_iivm_obj.coef,
        'coef_manual': res_manual['theta'],
        'se': dml_iivm_obj.se,
        'se_manual': res_manual['se'],
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        # identical seed before the manual and the library bootstrap
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_iivm(
            y, d, z, res_manual['thetas'], res_manual['ses'],
            res_manual['all_g_hat0'], res_manual['all_g_hat1'],
            res_manual['all_m_hat'], res_manual['all_r_hat0'],
            res_manual['all_r_hat1'], all_smpls, score, bootstrap, n_rep_boot)

        np.random.seed(3141)
        dml_iivm_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_iivm_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_iivm_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
# Example #4
# 0
def dml_iivm_fixture(generate_data_iivm, learner, score, dml_procedure,
                     trimming_threshold):
    """Compare a ``DoubleMLIIVM`` fit with manually computed IIVM estimates.

    Parameters
    ----------
    generate_data_iivm
        Data set with a pandas-style ``columns`` index. The variables are
        referenced by the names ``'y'``, ``'d'`` and ``'z'``; covariates are
        all columns whose name starts with ``'X'``.
    learner
        Sequence of estimators: ``learner[1]`` is cloned for ``ml_g``,
        ``learner[0]`` for both ``ml_m`` and ``ml_r``.
    score
        Score used by the model under test and by the manual computation.
    dml_procedure
        ``'dml1'`` or ``'dml2'``; any other value trips the assertion below.
    trimming_threshold
        Trimming threshold handed to the model and to ``fit_nuisance_iivm``.

    Returns
    -------
    dict
        Coefficient, standard error and bootstrap results of the model
        (``coef``, ``se``, ``boot_coef<method>``, ``boot_t_stat<method>``),
        their ``*_manual`` counterparts and the list ``boot_methods``.
    """
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 491

    # collect data
    data = generate_data_iivm
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for g, m & r
    ml_g = clone(learner[1])
    ml_m = clone(learner[0])
    ml_r = clone(learner[0])

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols, 'z')
    # Forward ``score`` so the model under test uses the same score as the
    # manual computation instead of silently using the library default.
    dml_iivm_obj = dml.DoubleMLIIVM(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    score=score,
                                    dml_procedure=dml_procedure,
                                    trimming_threshold=trimming_threshold)

    dml_iivm_obj.fit()

    # Re-seed with the same value before drawing the manual folds;
    # presumably this makes them coincide with the folds of the model.
    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    z = data['z'].values
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(x)]

    # NOTE(review): the learners are passed as learner[0], learner[1],
    # learner[0]; with the assignment above that corresponds to the order
    # m, g, r -- confirm against the signature of fit_nuisance_iivm.
    g_hat0, g_hat1, m_hat, r_hat0, r_hat1 = fit_nuisance_iivm(
        y, x, d, z,
        clone(learner[0]),
        clone(learner[1]),
        clone(learner[0]),
        smpls,
        trimming_threshold=trimming_threshold)

    if dml_procedure == 'dml1':
        res_manual, se_manual = iivm_dml1(y, x, d, z, g_hat0, g_hat1, m_hat,
                                          r_hat0, r_hat1, smpls, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = iivm_dml2(y, x, d, z, g_hat0, g_hat1, m_hat,
                                          r_hat0, r_hat1, smpls, score)

    res_dict = {
        'coef': dml_iivm_obj.coef,
        'coef_manual': res_manual,
        'se': dml_iivm_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        # identical seed before the manual and the library bootstrap
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_iivm(res_manual, y, d, z, g_hat0,
                                            g_hat1, m_hat, r_hat0, r_hat1,
                                            smpls, score, se_manual, bootstrap,
                                            n_rep_boot, dml_procedure)

        np.random.seed(3141)
        dml_iivm_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_iivm_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_iivm_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
# Example #5
# 0
def dml_iivm_classifier_fixture(generate_data_iivm_binary, learner, score,
                                dml_procedure, trimming_threshold):
    """Compare a ``DoubleMLIIVM`` fit on array data with ``fit_iivm``.

    Parameters
    ----------
    generate_data_iivm_binary
        Tuple that unpacks into ``(x, y, d, z)``; the four entries are passed
        to ``DoubleMLData.from_arrays`` and to the manual computation.
    learner
        Sequence of estimators: ``learner[0]`` is cloned for ``ml_g``,
        ``learner[1]`` for both ``ml_m`` and ``ml_r``.
    score
        Score used by the model under test and by the manual computation.
    dml_procedure
        DML procedure handed to the model and to ``fit_iivm``.
    trimming_threshold
        Trimming threshold handed to the model and to ``fit_iivm``.

    Returns
    -------
    dict
        Coefficient, standard error and bootstrap results of the model
        (``coef``, ``se``, ``boot_coef<method>``, ``boot_t_stat<method>``),
        their ``*_manual`` counterparts and the list ``boot_methods``.
    """
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 491

    # collect data
    (x, y, d, z) = generate_data_iivm_binary

    # Set machine learning methods for g, m & r
    ml_g = clone(learner[0])
    ml_m = clone(learner[1])
    ml_r = clone(learner[1])

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d, z)
    # Forward ``score`` so the model under test uses the same score as the
    # manual computation instead of silently using the library default.
    dml_iivm_obj = dml.DoubleMLIIVM(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    score=score,
                                    dml_procedure=dml_procedure,
                                    trimming_threshold=trimming_threshold)

    dml_iivm_obj.fit()

    # Re-seed with the same value before drawing the manual sample splits;
    # presumably this makes them coincide with the splits of the model.
    np.random.seed(3141)
    n_obs = len(y)
    all_smpls = draw_smpls(n_obs, n_folds)

    res_manual = fit_iivm(y, x, d, z,
                          clone(learner[0]),
                          clone(learner[1]),
                          clone(learner[1]),
                          all_smpls,
                          dml_procedure,
                          score,
                          trimming_threshold=trimming_threshold)

    res_dict = {
        'coef': dml_iivm_obj.coef,
        'coef_manual': res_manual['theta'],
        'se': dml_iivm_obj.se,
        'se_manual': res_manual['se'],
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        # identical seed before the manual and the library bootstrap
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_iivm(
            y, d, z, res_manual['thetas'], res_manual['ses'],
            res_manual['all_g_hat0'], res_manual['all_g_hat1'],
            res_manual['all_m_hat'], res_manual['all_r_hat0'],
            res_manual['all_r_hat1'], all_smpls, score, bootstrap, n_rep_boot)

        np.random.seed(3141)
        dml_iivm_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_iivm_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_iivm_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict