예제 #1
0
def test_obj_vs_from_arrays():
    """Compare ``DoubleMLData.from_arrays`` output with existing data objects.

    Four data objects are created one after another. For each, its x / y / d
    (and, where used, z) column selections are passed positionally to
    ``DoubleMLData.from_arrays`` and the resulting ``data`` attribute is
    compared with the ``data`` of the source object.
    """
    def rebuild(source, with_z=False):
        # Positional order expected by the calls under test: x, y, d[, z].
        frame = source.data
        pieces = [frame[source.x_cols], frame[source.y_col], frame[source.d_cols]]
        if with_z:
            pieces.append(frame[source.z_cols])
        return DoubleMLData.from_arrays(*pieces)

    np.random.seed(3141)

    # Case 1: PLR data without instruments, compared via DataFrame.equals.
    plr_obj = make_plr_CCDDHNR2018(n_obs=100)
    assert rebuild(plr_obj).data.equals(plr_obj.data)

    # Case 2: PLIV data from the local helper, instruments included.
    pliv_obj = _make_pliv_data(n_obs=100)
    assert rebuild(pliv_obj, with_z=True).data.equals(pliv_obj.data)

    # Case 3: five instruments; only the values are compared because the
    # z column names differ between the two objects.
    chs_obj = make_pliv_CHS2015(n_obs=100, dim_z=5)
    assert np.array_equal(rebuild(chs_obj, with_z=True).data, chs_obj.data)

    # Case 4: first ten columns of a PLR frame relabelled as seven covariates,
    # the outcome and two treatment columns.
    x_names = [f'X{k}' for k in range(1, 8)]
    relabelled = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    relabelled.columns = x_names + ['y', 'd1', 'd2']
    multi_d_obj = DoubleMLData(relabelled, 'y', ['d1', 'd2'], x_names)
    assert np.array_equal(rebuild(multi_d_obj).data, multi_d_obj.data)
예제 #2
0
def test_obj_vs_from_arrays():
    """Compare ``from_arrays`` constructors with existing data objects.

    The first four cases round-trip ``DoubleMLData`` objects through
    ``DoubleMLData.from_arrays``. The last two cases do the same for
    ``DoubleMLClusterData.from_arrays``, once with both cluster columns and
    once with a single cluster column, and compare against the source frame
    after renaming its columns to the names the test expects.
    """
    def rebuild(source, with_z=False):
        # Positional order used by the calls under test: x, y, d[, z].
        frame = source.data
        pieces = [frame[source.x_cols], frame[source.y_col], frame[source.d_cols]]
        if with_z:
            pieces.append(frame[source.z_cols])
        return DoubleMLData.from_arrays(*pieces)

    np.random.seed(3141)

    # PLR data without instruments.
    plr_obj = make_plr_CCDDHNR2018(n_obs=100)
    assert rebuild(plr_obj).data.equals(plr_obj.data)

    # PLIV data from the local helper.
    pliv_obj = _make_pliv_data(n_obs=100)
    assert rebuild(pliv_obj, with_z=True).data.equals(pliv_obj.data)

    # Five instruments: values only, since the z column names differ.
    chs_obj = make_pliv_CHS2015(n_obs=100, dim_z=5)
    assert np.array_equal(rebuild(chs_obj, with_z=True).data, chs_obj.data)

    # PLR frame relabelled to seven covariates, an outcome and two treatments.
    x_names = [f'X{k}' for k in range(1, 8)]
    relabelled = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    relabelled.columns = x_names + ['y', 'd1', 'd2']
    multi_d_obj = DoubleMLData(relabelled, 'y', ['d1', 'd2'], x_names)
    assert np.array_equal(rebuild(multi_d_obj).data, multi_d_obj.data)

    # Cluster data: select each part once and reuse it for both cluster cases.
    cluster_obj = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)
    cluster_frame = cluster_obj.data
    x_part = cluster_frame[cluster_obj.x_cols]
    y_part = cluster_frame[cluster_obj.y_col]
    d_part = cluster_frame[cluster_obj.d_cols]
    z_part = cluster_frame[cluster_obj.z_cols]
    yd_z_names = {'Y': 'y', 'D': 'd', 'Z': 'z'}

    two_way = DoubleMLClusterData.from_arrays(
        x_part, y_part, d_part, cluster_frame[cluster_obj.cluster_cols], z_part)
    expected_two_way = cluster_obj.data.copy().rename(
        columns={'cluster_var_i': 'cluster_var1',
                 'cluster_var_j': 'cluster_var2',
                 **yd_z_names})
    assert two_way.data.equals(expected_two_way)

    # with a single cluster variable (the second cluster column only)
    one_way = DoubleMLClusterData.from_arrays(
        x_part, y_part, d_part, cluster_frame[cluster_obj.cluster_cols[1]], z_part)
    expected_one_way = (cluster_obj.data.copy()
                        .drop(columns='cluster_var_i')
                        .rename(columns={'cluster_var_j': 'cluster_var', **yd_z_names}))
    assert one_way.data.equals(expected_one_way)
예제 #3
0
def test_dml_data_w_missings(generate_data_irm_w_missings):
    """Exercise the ``force_all_x_finite`` option of ``DoubleMLData``.

    Covers construction through ``from_arrays`` and the plain constructor,
    the error messages expected for non-finite input and for invalid option
    values, and the behaviour of the ``force_all_x_finite`` property setter.
    The fixture's ``x`` is presumably the array carrying missing values
    (the test expects a NaN error whenever ``x`` must be fully finite).
    """
    x, y, d = generate_data_irm_w_missings

    nan_inf_msg = r"Input contains NaN, infinity or a value too large for dtype\('float64'\)."
    inf_msg = r"Input contains infinity or a value too large for dtype\('float64'\)."
    type_msg = "Invalid force_all_x_finite. force_all_x_finite must be True, False or 'allow-nan'."
    value_msg = "Invalid force_all_x_finite allownan. force_all_x_finite must be True, False or 'allow-nan'."

    # These two settings are expected to construct without raising.
    dml_data = DoubleMLData.from_arrays(x, y, d, force_all_x_finite=False)
    DoubleMLData.from_arrays(x, y, d, force_all_x_finite='allow-nan')

    # Each entry: positional arrays (x, y, d[, z]) and the option value that
    # must trigger the NaN/infinity error. The last three reuse the first
    # column of x as outcome, treatment and instrument respectively.
    first_x_col = x[:, 0]
    rejected_inputs = (
        ((x, y, d), True),
        ((x, first_x_col, d), False),
        ((x, y, first_x_col), False),
        ((x, y, d, first_x_col), False),
    )
    for arrays, setting in rejected_inputs:
        with pytest.raises(ValueError, match=nan_inf_msg):
            DoubleMLData.from_arrays(*arrays, force_all_x_finite=setting)

    # 'allow-nan' must still reject infinite entries in x.
    x_with_inf = np.copy(x)
    x_with_inf[0, 0] = np.inf
    with pytest.raises(ValueError, match=inf_msg):
        DoubleMLData.from_arrays(x_with_inf, y, d, force_all_x_finite='allow-nan')

    # Invalid option values: wrong type -> TypeError, unknown string -> ValueError,
    # checked for both ways of constructing the object.
    invalid_settings = (
        (1, TypeError, type_msg),
        ('allownan', ValueError, value_msg),
    )
    for bad_setting, expected_exc, expected_msg in invalid_settings:
        with pytest.raises(expected_exc, match=expected_msg):
            DoubleMLData.from_arrays(x_with_inf, y, d, force_all_x_finite=bad_setting)
        with pytest.raises(expected_exc, match=expected_msg):
            DoubleMLData(dml_data.data, y_col='y', d_cols='d',
                         force_all_x_finite=bad_setting)

    # The setter raises on this data, yet the property reads True afterwards.
    with pytest.raises(ValueError, match=nan_inf_msg):
        dml_data.force_all_x_finite = True
    assert dml_data.force_all_x_finite is True

    dml_data.force_all_x_finite = False
    assert dml_data.force_all_x_finite is False

    dml_data.force_all_x_finite = 'allow-nan'
    assert dml_data.force_all_x_finite == 'allow-nan'
예제 #4
0
def test_dml_data_no_instr():
    """Check that data objects built without instruments report none.

    Both a generated data object and one built via ``from_arrays`` from
    generated arrays must have ``z`` set to ``None`` and ``n_instr`` equal
    to zero.
    """
    def assert_no_instruments(obj):
        assert obj.z is None
        assert obj.n_instr == 0

    np.random.seed(3141)

    assert_no_instruments(make_plr_CCDDHNR2018(n_obs=100))

    covariates, outcome, treatment = make_plr_CCDDHNR2018(n_obs=100, return_type='array')
    assert_no_instruments(DoubleMLData.from_arrays(covariates, outcome, treatment))
예제 #5
0
def dml_data_fixture(generate_data1):
    """Build the same data set as a ``DoubleMLData`` object in two ways.

    Columns whose name starts with ``'X'`` are used as covariates, ``'y'`` as
    the outcome and ``'d'`` as the treatment.

    Returns
    -------
    dict
        ``'obj_from_np'`` is created with ``DoubleMLData.from_arrays`` from
        the underlying arrays; ``'obj_from_pd'`` is created directly from the
        data frame.
    """
    frame = generate_data1
    np.random.seed(3141)

    is_covariate = frame.columns.str.startswith('X')
    covariate_names = frame.columns[is_covariate].tolist()

    from_numpy = DoubleMLData.from_arrays(
        frame.loc[:, covariate_names].values,
        frame['y'].values,
        frame['d'].values,
    )
    from_pandas = DoubleMLData(frame, 'y', ['d'], covariate_names)

    return {'obj_from_np': from_numpy, 'obj_from_pd': from_pandas}
# Module-level objects read by the tests below. The seed is fixed first, so
# the order of the generator calls determines the generated data.
np.random.seed(3141)
dml_data = make_plr_CCDDHNR2018(n_obs=10)
# Lasso instances used as learners when constructing models.
ml_g = Lasso()
ml_m = Lasso()
ml_r = Lasso()
dml_plr = DoubleMLPLR(dml_data, ml_g, ml_m)

# Small data sets from the other data generators.
dml_data_irm = make_irm_data(n_obs=10)
dml_data_iivm = make_iivm_data(n_obs=10)
dml_data_pliv = make_pliv_CHS2015(n_obs=10, dim_z=1)
dml_cluster_data_pliv = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)
# Array-form IIVM data whose outcome is recoded to 0/1 below:
# positive values become 1, negative values become 0.
(x, y, d, z) = make_iivm_data(n_obs=30, return_type="array")
y[y > 0] = 1
y[y < 0] = 0
# Same recoded outcome, without and with the instrument z.
dml_data_irm_binary_outcome = DoubleMLData.from_arrays(x, y, d)
dml_data_iivm_binary_outcome = DoubleMLData.from_arrays(x, y, d, z)


@pytest.mark.ci
def test_doubleml_exception_data():
    msg = 'The data must be of DoubleMLData type.'
    with pytest.raises(TypeError, match=msg):
        _ = DoubleMLPLR(pd.DataFrame(), ml_g, ml_m)

    # PLR with IV
    msg = (r'Incompatible data. Z1 have been set as instrumental variable\(s\). '
           'To fit a partially linear IV regression model use DoubleMLPLIV instead of DoubleMLPLR.')
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLPLR(dml_data_pliv, ml_g, ml_m)