def dml_pliv_multiway_cluster_old_vs_new_fixture(generate_data_iv, learner):
    n_folds = 3
    dml_procedure = 'dml1'  # the old and the new implementation only yield identical results for dml1

    np.random.seed(3141)
    smpl_sizes = [N, M]
    obj_dml_multiway_resampling = DoubleMLMultiwayResampling(
        n_folds, smpl_sizes)
    _, smpls_lin_ind = obj_dml_multiway_resampling.split_samples()

    # Set machine learning methods for g, m & r
    ml_g = clone(learner)
    ml_m = clone(learner)
    ml_r = clone(learner)

    df = obj_dml_cluster_data.data.set_index(
        ['cluster_var_i', 'cluster_var_j'])
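    # with the cluster variables moved into the index, the plain DoubleMLData /
    # DoubleMLPLIV pipeline below follows the old, non-cluster-aware code path;
    # it is compared against the cluster-aware object further down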
    obj_dml_data = dml.DoubleMLData(df,
                                    y_col=obj_dml_cluster_data.y_col,
                                    d_cols=obj_dml_cluster_data.d_cols,
                                    x_cols=obj_dml_cluster_data.x_cols,
                                    z_cols=obj_dml_cluster_data.z_cols)

    dml_pliv_obj = dml.DoubleMLPLIV(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    dml_procedure=dml_procedure,
                                    draw_sample_splitting=False)
    dml_pliv_obj.set_sample_splitting(smpls_lin_ind)

    dml_pliv_obj.fit()

    np.random.seed(3141)
    dml_pliv_obj_cluster = dml.DoubleMLPLIV(obj_dml_cluster_data,
                                            ml_g,
                                            ml_m,
                                            ml_r,
                                            n_folds,
                                            dml_procedure=dml_procedure)
    dml_pliv_obj_cluster.fit()

    res_dict = {
        'coef': dml_pliv_obj.coef,
        'coef_manual': dml_pliv_obj_cluster.coef
    }

    return res_dict
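

# A minimal sketch (not part of the original suite) of how a fixture like the
# one above is typically consumed, assuming it is registered via
# @pytest.fixture; the test name and the tolerances are illustrative choices.
import numpy as np


def test_dml_pliv_multiway_cluster_old_vs_new_coef(
        dml_pliv_multiway_cluster_old_vs_new_fixture):
    res = dml_pliv_multiway_cluster_old_vs_new_fixture
    # the old resampling-based and the new cluster-aware estimates should agree
    assert np.allclose(res['coef'], res['coef_manual'], rtol=1e-9, atol=1e-4)

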
def dml_pliv_oneway_cluster_fixture(generate_data_iv, learner, dml_procedure):
    n_folds = 3
    score = 'partialling out'

    # Set machine learning methods for g, m & r
    ml_g = clone(learner)
    ml_m = clone(learner)
    ml_r = clone(learner)

    np.random.seed(3141)
    dml_pliv_obj = dml.DoubleMLPLIV(obj_dml_oneway_cluster_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    score=score,
                                    dml_procedure=dml_procedure)

    np.random.seed(3141)
    dml_pliv_obj.fit()

    np.random.seed(3141)
    y = obj_dml_oneway_cluster_data.y
    x = obj_dml_oneway_cluster_data.x
    d = obj_dml_oneway_cluster_data.d
    z = np.ravel(obj_dml_oneway_cluster_data.z)

    res_manual = fit_pliv(y, x, d, z, clone(learner), clone(learner),
                          clone(learner), dml_pliv_obj.smpls, dml_procedure,
                          score)
    g_hat = res_manual['all_g_hat'][0]
    m_hat = res_manual['all_m_hat'][0]
    r_hat = res_manual['all_r_hat'][0]
    smpls_one_split = dml_pliv_obj.smpls[0]
    u_hat, v_hat, w_hat = compute_pliv_residuals(y, d, z, g_hat, m_hat, r_hat,
                                                 smpls_one_split)
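    # u_hat, v_hat and w_hat are the cross-fitted residuals of the outcome (g),
    # instrument (m) and treatment (r) nuisance regressions; the partialling-out
    # score is linear in theta, psi = psi_a * theta + psi_b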

    psi_a = -np.multiply(v_hat, w_hat)
    if dml_procedure == 'dml2':
        psi_b = np.multiply(v_hat, u_hat)
        theta = est_one_way_cluster_dml2(
            psi_a, psi_b, obj_dml_oneway_cluster_data.cluster_vars[:, 0],
            smpls_one_split)
    else:
        theta = res_manual['theta']
    psi = np.multiply(u_hat - w_hat * theta, v_hat)
    var = var_one_way_cluster(psi, psi_a,
                              obj_dml_oneway_cluster_data.cluster_vars[:, 0],
                              smpls_one_split)
    se = np.sqrt(var)

    res_dict = {
        'coef': dml_pliv_obj.coef,
        'se': dml_pliv_obj.se,
        'coef_manual': theta,
        'se_manual': se
    }

    return res_dict

Example no. 3

def dml_pliv_multiway_cluster_fixture(generate_data_iv, learner, score,
                                      dml_procedure):
    n_folds = 3

    np.random.seed(1234)
    smpl_sizes = [N, M]
    obj_dml_multiway_resampling = DoubleMLMultiwayResampling(
        n_folds, smpl_sizes)
    _, smpls_lin_ind = obj_dml_multiway_resampling.split_samples()

    # Set machine learning methods for g, m & r
    ml_g = clone(learner)
    ml_m = clone(learner)
    ml_r = clone(learner)

    np.random.seed(3141)
    dml_pliv_obj = dml.DoubleMLPLIV(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    dml_procedure=dml_procedure,
                                    draw_sample_splitting=False)
    dml_pliv_obj.set_sample_splitting(smpls_lin_ind)

    dml_pliv_obj.fit()

    np.random.seed(3141)
    y = obj_dml_data.y
    x = obj_dml_data.x
    d = obj_dml_data.d
    z = np.ravel(obj_dml_data.z)

    g_hat, m_hat, r_hat = fit_nuisance_pliv(y, x, d, z, clone(learner),
                                            clone(learner), clone(learner),
                                            smpls_lin_ind)

    if dml_procedure == 'dml1':
        res_manual, _ = pliv_dml1(y, x, d, z, g_hat, m_hat, r_hat,
                                  smpls_lin_ind, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, _ = pliv_dml2(y, x, d, z, g_hat, m_hat, r_hat,
                                  smpls_lin_ind, score)

    res_dict = {'coef': dml_pliv_obj.coef, 'coef_manual': res_manual}

    return res_dict

Example no. 4

def dml_pliv_fixture(generate_data_iv, learner_g, learner_m, learner_r, score,
                     dml_procedure, tune_on_folds):
    par_grid = {
        'ml_g': get_par_grid(learner_g),
        'ml_m': get_par_grid(learner_m),
        'ml_r': get_par_grid(learner_r)
    }
    n_folds_tune = 4

    boot_methods = ['Bayes', 'normal', 'wild']
    n_folds = 2
    n_rep_boot = 503

    # collect data
    data = generate_data_iv
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for g, m & r
    ml_g = clone(learner_g)
    ml_m = clone(learner_m)
    ml_r = clone(learner_r)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols, 'Z1')
    dml_pliv_obj = dml.DoubleMLPLIV(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    dml_procedure=dml_procedure)

    # tune hyperparameters
    _ = dml_pliv_obj.tune(par_grid,
                          tune_on_folds=tune_on_folds,
                          n_folds_tune=n_folds_tune)

    dml_pliv_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    z = data['Z1'].values
    n_obs = len(y)
    all_smpls = draw_smpls(n_obs, n_folds)
    smpls = all_smpls[0]

    if tune_on_folds:
        g_params, m_params, r_params = tune_nuisance_pliv(
            y, x, d, z, clone(learner_g), clone(learner_m), clone(learner_r),
            smpls, n_folds_tune, par_grid['ml_g'], par_grid['ml_m'],
            par_grid['ml_r'])
    else:
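        # tune once on the full sample: a single "fold" whose training set
        # contains all observations; the tuned parameters are then reused for
        # every fold below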
        xx = [(np.arange(len(y)), np.array([]))]
        g_params, m_params, r_params = tune_nuisance_pliv(
            y, x, d, z, clone(learner_g), clone(learner_m), clone(learner_r),
            xx, n_folds_tune, par_grid['ml_g'], par_grid['ml_m'],
            par_grid['ml_r'])
        g_params = g_params * n_folds
        m_params = m_params * n_folds
        r_params = r_params * n_folds

    res_manual = fit_pliv(y,
                          x,
                          d,
                          z,
                          clone(learner_g),
                          clone(learner_m),
                          clone(learner_r),
                          all_smpls,
                          dml_procedure,
                          score,
                          g_params=g_params,
                          m_params=m_params,
                          r_params=r_params)

    res_dict = {
        'coef': dml_pliv_obj.coef,
        'coef_manual': res_manual['theta'],
        'se': dml_pliv_obj.se,
        'se_manual': res_manual['se'],
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_pliv(y, d, z, res_manual['thetas'],
                                            res_manual['ses'],
                                            res_manual['all_g_hat'],
                                            res_manual['all_m_hat'],
                                            res_manual['all_r_hat'], all_smpls,
                                            score, bootstrap, n_rep_boot)

        np.random.seed(3141)
        dml_pliv_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_pliv_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_pliv_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
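
# A minimal sketch (not part of the original suite) of a test consuming the
# fixture above, assuming it is registered via @pytest.fixture; the test name
# and the tolerances are illustrative choices.
import numpy as np


def test_dml_pliv_boot(dml_pliv_fixture):
    res = dml_pliv_fixture
    for bootstrap in res['boot_methods']:
        assert np.allclose(res['boot_coef' + bootstrap],
                           res['boot_coef' + bootstrap + '_manual'],
                           rtol=1e-9, atol=1e-4)
        assert np.allclose(res['boot_t_stat' + bootstrap],
                           res['boot_t_stat' + bootstrap + '_manual'],
                           rtol=1e-9, atol=1e-4)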

Example no. 5

def dml_pliv_fixture(generate_data_iv, learner, score, dml_procedure):
    boot_methods = ['Bayes', 'normal', 'wild']
    n_folds = 2
    n_rep_boot = 503

    # collect data
    data = generate_data_iv
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for g, m & r
    ml_g = clone(learner)
    ml_m = clone(learner)
    ml_r = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols, 'Z1')
    dml_pliv_obj = dml.DoubleMLPLIV(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    dml_procedure=dml_procedure)

    dml_pliv_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    z = data['Z1'].values
    n_obs = len(y)
    all_smpls = draw_smpls(n_obs, n_folds)

    res_manual = fit_pliv(y, x, d, z, clone(learner), clone(learner),
                          clone(learner), all_smpls, dml_procedure, score)

    res_dict = {
        'coef': dml_pliv_obj.coef,
        'coef_manual': res_manual['theta'],
        'se': dml_pliv_obj.se,
        'se_manual': res_manual['se'],
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_pliv(y, d, z, res_manual['thetas'],
                                            res_manual['ses'],
                                            res_manual['all_g_hat'],
                                            res_manual['all_m_hat'],
                                            res_manual['all_r_hat'], all_smpls,
                                            score, bootstrap, n_rep_boot)

        np.random.seed(3141)
        dml_pliv_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_pliv_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_pliv_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict

Example no. 6

def dml_pliv_fixture(generate_data_iv, learner, score, dml_procedure):
    boot_methods = ['Bayes', 'normal', 'wild']
    n_folds = 2
    n_rep_boot = 503

    # collect data
    data = generate_data_iv
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for g, m & r
    ml_g = clone(learner)
    ml_m = clone(learner)
    ml_r = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols, 'Z1')
    dml_pliv_obj = dml.DoubleMLPLIV(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    dml_procedure=dml_procedure)

    dml_pliv_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    z = data['Z1'].values
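    # reusing seed 3141 makes this manual KFold split match the sample splitting
    # drawn internally by DoubleMLPLIV above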
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(x)]

    g_hat, m_hat, r_hat = fit_nuisance_pliv(y, x, d, z, clone(learner),
                                            clone(learner), clone(learner),
                                            smpls)

    if dml_procedure == 'dml1':
        res_manual, se_manual = pliv_dml1(y, x, d, z, g_hat, m_hat, r_hat,
                                          smpls, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = pliv_dml2(y, x, d, z, g_hat, m_hat, r_hat,
                                          smpls, score)

    res_dict = {
        'coef': dml_pliv_obj.coef,
        'coef_manual': res_manual,
        'se': dml_pliv_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_pliv(res_manual, y, d, z, g_hat, m_hat,
                                            r_hat, smpls, score, se_manual,
                                            bootstrap, n_rep_boot,
                                            dml_procedure)

        np.random.seed(3141)
        dml_pliv_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_pliv_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_pliv_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict

Example no. 7

def dml_pliv_fixture(generate_data_iv, idx, learner_g, learner_m, learner_r,
                     score, dml_procedure, tune_on_folds):
    par_grid = {
        'ml_g': get_par_grid(learner_g),
        'ml_m': get_par_grid(learner_m),
        'ml_r': get_par_grid(learner_r)
    }
    n_folds_tune = 4

    boot_methods = ['Bayes', 'normal', 'wild']
    n_folds = 2
    n_rep_boot = 503

    # collect data
    data = generate_data_iv[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for g, m & r
    ml_g = clone(learner_g)
    ml_m = clone(learner_m)
    ml_r = clone(learner_r)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols, 'Z1')
    dml_pliv_obj = dml.DoubleMLPLIV(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    dml_procedure=dml_procedure)

    # tune hyperparameters
    res_tuning = dml_pliv_obj.tune(par_grid,
                                   tune_on_folds=tune_on_folds,
                                   n_folds_tune=n_folds_tune)

    dml_pliv_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    X = data.loc[:, X_cols].values
    d = data['d'].values
    z = data['Z1'].values
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(X)]

    if tune_on_folds:
        g_params, m_params, r_params = tune_nuisance_pliv(
            y, X, d, z, clone(learner_m), clone(learner_g), clone(learner_r),
            smpls, n_folds_tune, par_grid['ml_g'], par_grid['ml_m'],
            par_grid['ml_r'])

        g_hat, m_hat, r_hat = fit_nuisance_pliv(y, X, d, z, clone(learner_m),
                                                clone(learner_g),
                                                clone(learner_r), smpls,
                                                g_params, m_params, r_params)
    else:
        xx = [(np.arange(len(y)), np.array([]))]
        g_params, m_params, r_params = tune_nuisance_pliv(
            y, X, d, z, clone(learner_m), clone(learner_g), clone(learner_r),
            xx, n_folds_tune, par_grid['ml_g'], par_grid['ml_m'],
            par_grid['ml_r'])

        g_hat, m_hat, r_hat = fit_nuisance_pliv(y, X, d, z, clone(learner_m),
                                                clone(learner_g),
                                                clone(learner_r), smpls,
                                                g_params * n_folds,
                                                m_params * n_folds,
                                                r_params * n_folds)

    if dml_procedure == 'dml1':
        res_manual, se_manual = pliv_dml1(y, X, d, z, g_hat, m_hat, r_hat,
                                          smpls, score)
    elif dml_procedure == 'dml2':
        res_manual, se_manual = pliv_dml2(y, X, d, z, g_hat, m_hat, r_hat,
                                          smpls, score)

    res_dict = {
        'coef': dml_pliv_obj.coef,
        'coef_manual': res_manual,
        'se': dml_pliv_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_pliv(res_manual, y, d, z, g_hat, m_hat,
                                            r_hat, smpls, score, se_manual,
                                            bootstrap, n_rep_boot,
                                            dml_procedure)

        np.random.seed(3141)
        dml_pliv_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_pliv_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_pliv_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict


def dml_pliv_multiway_cluster_fixture(generate_data_iv, learner,
                                      dml_procedure):
    n_folds = 2
    n_rep = 2
    score = 'partialling out'

    # Set machine learning methods for g, m & r
    ml_g = clone(learner)
    ml_m = clone(learner)
    ml_r = clone(learner)

    np.random.seed(3141)
    dml_pliv_obj = dml.DoubleMLPLIV(obj_dml_cluster_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    n_rep=n_rep,
                                    score=score,
                                    dml_procedure=dml_procedure)

    np.random.seed(3141)
    dml_pliv_obj.fit()

    np.random.seed(3141)
    y = obj_dml_cluster_data.y
    x = obj_dml_cluster_data.x
    d = obj_dml_cluster_data.d
    z = np.ravel(obj_dml_cluster_data.z)

    res_manual = fit_pliv(y,
                          x,
                          d,
                          z,
                          clone(learner),
                          clone(learner),
                          clone(learner),
                          dml_pliv_obj.smpls,
                          dml_procedure,
                          score,
                          n_rep=n_rep)
    thetas = np.full(n_rep, np.nan)
    ses = np.full(n_rep, np.nan)
    for i_rep in range(n_rep):
        g_hat = res_manual['all_g_hat'][i_rep]
        m_hat = res_manual['all_m_hat'][i_rep]
        r_hat = res_manual['all_r_hat'][i_rep]
        smpls_one_split = dml_pliv_obj.smpls[i_rep]
        u_hat, v_hat, w_hat = compute_pliv_residuals(y, d, z, g_hat, m_hat,
                                                     r_hat, smpls_one_split)

        psi_a = -np.multiply(v_hat, w_hat)
        if dml_procedure == 'dml2':
            psi_b = np.multiply(v_hat, u_hat)
            theta = est_two_way_cluster_dml2(
                psi_a, psi_b, obj_dml_cluster_data.cluster_vars[:, 0],
                obj_dml_cluster_data.cluster_vars[:, 1], smpls_one_split)
        else:
            theta = res_manual['thetas'][i_rep]
        psi = np.multiply(u_hat - w_hat * theta, v_hat)
        var = var_two_way_cluster(psi, psi_a,
                                  obj_dml_cluster_data.cluster_vars[:, 0],
                                  obj_dml_cluster_data.cluster_vars[:, 1],
                                  smpls_one_split)
        se = np.sqrt(var)
        thetas[i_rep] = theta
        ses[i_rep] = se
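    # aggregate over the n_rep repetitions: the point estimate is the median of
    # the per-repetition estimates; for the variance, each repetition's variance
    # is augmented by the squared deviation from this median (divided by the
    # effective sample size, here the smaller of the two cluster counts) before
    # taking the median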

    theta = np.median(thetas)
    n_clusters1 = len(np.unique(obj_dml_cluster_data.cluster_vars[:, 0]))
    n_clusters2 = len(np.unique(obj_dml_cluster_data.cluster_vars[:, 1]))
    var_scaling_factor = min(n_clusters1, n_clusters2)
    se = np.sqrt(
        np.median(
            np.power(ses, 2) * var_scaling_factor +
            np.power(thetas - theta, 2)) / var_scaling_factor)

    res_dict = {
        'coef': dml_pliv_obj.coef,
        'se': dml_pliv_obj.se,
        'coef_manual': theta,
        'se_manual': se
    }

    return res_dict