예제 #1
0
def test_obj_vs_from_arrays():
    """Compare objects rebuilt via ``DoubleMLData.from_arrays`` with their source.

    For several generated data sets, the covariate, outcome, treatment and
    (where present) instrument columns are handed to ``from_arrays`` and the
    ``data`` attribute of the result is compared with the original one.
    """
    np.random.seed(3141)

    # PLR data without instruments: the rebuilt frame must be equal.
    source = make_plr_CCDDHNR2018(n_obs=100)
    frame = source.data
    rebuilt = DoubleMLData.from_arrays(frame[source.x_cols],
                                       frame[source.y_col],
                                       frame[source.d_cols])
    assert rebuilt.data.equals(frame)

    # PLIV data: the instrument columns are passed as fourth argument.
    source = _make_pliv_data(n_obs=100)
    frame = source.data
    rebuilt = DoubleMLData.from_arrays(frame[source.x_cols],
                                       frame[source.y_col],
                                       frame[source.d_cols],
                                       frame[source.z_cols])
    assert rebuilt.data.equals(frame)

    # Five instruments: the z column names differ, so only values are compared.
    source = make_pliv_CHS2015(n_obs=100, dim_z=5)
    frame = source.data
    rebuilt = DoubleMLData.from_arrays(frame[source.x_cols],
                                       frame[source.y_col],
                                       frame[source.d_cols],
                                       frame[source.z_cols])
    assert np.array_equal(rebuilt.data, frame)

    # Two treatment columns on a relabelled ten-column slice of PLR data.
    x_names = [f'X{k}' for k in range(1, 8)]
    df = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    df.columns = x_names + ['y', 'd1', 'd2']
    source = DoubleMLData(df, 'y', ['d1', 'd2'], list(x_names))
    frame = source.data
    rebuilt = DoubleMLData.from_arrays(frame[source.x_cols],
                                       frame[source.y_col],
                                       frame[source.d_cols])
    assert np.array_equal(rebuilt.data, frame)
예제 #2
0
def test_dml_data_no_instr():
    """Check that data without instruments reports ``z is None`` and ``n_instr == 0``.

    Both construction paths are covered: the object returned by the data
    generator and an object built from arrays via ``from_arrays``.
    """
    np.random.seed(3141)

    # Object returned directly by the generator.
    generated = make_plr_CCDDHNR2018(n_obs=100)
    assert generated.z is None
    assert generated.n_instr == 0

    # Object assembled from the array return type (no z array is passed).
    covariates, outcome, treatment = make_plr_CCDDHNR2018(n_obs=100,
                                                          return_type='array')
    assembled = DoubleMLData.from_arrays(covariates, outcome, treatment)
    assert assembled.z is None
    assert assembled.n_instr == 0
예제 #3
0
def test_obj_vs_from_arrays():
    """Compare objects rebuilt via ``from_arrays`` with the objects they came from.

    Covers ``DoubleMLData`` (with and without instruments, one and two
    treatment columns) and ``DoubleMLClusterData`` (two cluster variables and
    a single cluster variable).
    """

    def _select(obj, *attrs):
        # Pick, for every column attribute name, the matching slice of obj.data.
        return [obj.data[getattr(obj, attr)] for attr in attrs]

    np.random.seed(3141)

    # PLR data without instruments.
    plr = make_plr_CCDDHNR2018(n_obs=100)
    rebuilt = DoubleMLData.from_arrays(*_select(plr, 'x_cols', 'y_col', 'd_cols'))
    assert rebuilt.data.equals(plr.data)

    # PLIV data with instruments.
    pliv = _make_pliv_data(n_obs=100)
    rebuilt = DoubleMLData.from_arrays(
        *_select(pliv, 'x_cols', 'y_col', 'd_cols', 'z_cols'))
    assert rebuilt.data.equals(pliv.data)

    # Five instruments: z column names differ, hence a value-only comparison.
    pliv_multi_z = make_pliv_CHS2015(n_obs=100, dim_z=5)
    rebuilt = DoubleMLData.from_arrays(
        *_select(pliv_multi_z, 'x_cols', 'y_col', 'd_cols', 'z_cols'))
    assert np.array_equal(rebuilt.data, pliv_multi_z.data)

    # Two treatment columns on a relabelled ten-column slice of PLR data.
    x_names = [f'X{k}' for k in range(1, 8)]
    df = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    df.columns = x_names + ['y', 'd1', 'd2']
    two_treat = DoubleMLData(df, 'y', ['d1', 'd2'], list(x_names))
    rebuilt = DoubleMLData.from_arrays(
        *_select(two_treat, 'x_cols', 'y_col', 'd_cols'))
    assert np.array_equal(rebuilt.data, two_treat.data)

    # Cluster data with both cluster variables; from_arrays takes the cluster
    # columns as fourth and the instruments as fifth positional argument.
    clustered = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)
    x_part, y_part, d_part, z_part = _select(
        clustered, 'x_cols', 'y_col', 'd_cols', 'z_cols')
    rebuilt = DoubleMLClusterData.from_arrays(
        x_part, y_part, d_part,
        clustered.data[clustered.cluster_cols],
        z_part)
    # The expected frame uses the column names produced by from_arrays.
    expected = clustered.data.copy().rename(
        columns={'cluster_var_i': 'cluster_var1',
                 'cluster_var_j': 'cluster_var2',
                 'Y': 'y', 'D': 'd', 'Z': 'z'})
    assert rebuilt.data.equals(expected)

    # with a single cluster variable
    x_part, y_part, d_part, z_part = _select(
        clustered, 'x_cols', 'y_col', 'd_cols', 'z_cols')
    rebuilt = DoubleMLClusterData.from_arrays(
        x_part, y_part, d_part,
        clustered.data[clustered.cluster_cols[1]],
        z_part)
    expected = clustered.data.copy().drop(columns='cluster_var_i').rename(
        columns={'cluster_var_j': 'cluster_var',
                 'Y': 'y', 'D': 'd', 'Z': 'z'})
    assert rebuilt.data.equals(expected)
예제 #4
0
def test_make_plr_CCDDHNR2018_return_types():
    """Check the ``return_type`` options of ``make_plr_CCDDHNR2018``.

    ``DoubleMLData`` and ``pd.DataFrame`` must yield an instance of the
    requested class, ``np.ndarray`` must yield three arrays, and the string
    ``'matrix'`` must raise a ``ValueError`` matching ``msg_inv_return_type``.
    """
    np.random.seed(3141)

    # Single-object return types: the result is an instance of the request.
    for requested in (DoubleMLData, pd.DataFrame):
        generated = make_plr_CCDDHNR2018(n_obs=100, return_type=requested)
        assert isinstance(generated, requested)

    # Array return type: exactly three arrays are unpacked.
    x_arr, y_arr, d_arr = make_plr_CCDDHNR2018(n_obs=100, return_type=np.ndarray)
    for arr in (x_arr, y_arr, d_arr):
        assert isinstance(arr, np.ndarray)

    # Unsupported return type.
    with pytest.raises(ValueError, match=msg_inv_return_type):
        make_plr_CCDDHNR2018(n_obs=100, return_type='matrix')
예제 #5
0
def test_d_cols_setter():
    """Exercise the ``d_cols`` setter of ``DoubleMLData``.

    Verifies that ``d`` follows a change of ``d_cols``, that unknown column
    names raise ``ValueError``, that a non-str/non-list value raises
    ``TypeError``, and that a single column name given as a string works.
    """
    np.random.seed(3141)
    x_names = [f'X{k}' for k in range(1, 8)]
    df = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    df.columns = x_names + ['y', 'd1', 'd2']
    dml_data = DoubleMLData(df, 'y', ['d1', 'd2'], list(x_names))

    # After reordering d_cols the d array must hold the new first treatment.
    expected_d = dml_data.data['d2'].values
    dml_data.d_cols = ['d2', 'd1']
    assert np.array_equal(dml_data.d, expected_d)

    # Names that are not columns of the data are rejected (list and str form).
    msg = r'Invalid treatment variable\(s\) d_cols. At least one treatment variable is no data column.'
    for unknown in (['d1', 'd13'], 'd13'):
        with pytest.raises(ValueError, match=msg):
            dml_data.d_cols = unknown

    # Values that are neither str nor list are rejected.
    msg = (r'The treatment variable\(s\) d_cols must be of str or list type. '
           "5 of type <class 'int'> was passed.")
    with pytest.raises(TypeError, match=msg):
        dml_data.d_cols = 5

    # A single treatment variable passed as a plain string.
    expected_d = dml_data.data['d2'].values
    dml_data.d_cols = 'd2'
    assert np.array_equal(dml_data.d, expected_d)
    assert dml_data.n_treat == 1
예제 #6
0
def test_x_cols_setter():
    """Exercise the ``x_cols`` setter of ``DoubleMLData``.

    Verifies that ``x`` follows a change of ``x_cols``, that invalid values
    raise the expected errors, that a single name given as a string works and
    that assigning ``None`` restores the original covariates.
    """
    np.random.seed(3141)
    dml_data = make_plr_CCDDHNR2018(n_obs=100)
    initial_x_cols = dml_data.x_cols

    # The x array must reflect a newly assigned list of covariates.
    chosen = ['X1', 'X11', 'X13']
    expected_x = dml_data.data[chosen].values
    dml_data.x_cols = list(chosen)
    assert np.array_equal(dml_data.x, expected_x)

    # A name that is not a column of the data is rejected.
    msg = 'Invalid covariates x_cols. At least one covariate is no data column.'
    with pytest.raises(ValueError, match=msg):
        dml_data.x_cols = ['X1', 'X11', 'A13']

    # Values that are neither str, list nor None are rejected.
    msg = (r'The covariates x_cols must be of str or list type \(or None\). '
           "5 of type <class 'int'> was passed.")
    with pytest.raises(TypeError, match=msg):
        dml_data.x_cols = 5

    # A single covariate passed as a plain string.
    expected_x = dml_data.data[['X13']].values
    dml_data.x_cols = 'X13'
    assert np.array_equal(dml_data.x, expected_x)

    # Assigning None brings back the covariates the object started with.
    expected_x = dml_data.data[initial_x_cols].values
    dml_data.x_cols = None
    assert np.array_equal(dml_data.x, expected_x)
예제 #7
0
def test_cluster_cols_setter():
    """Exercise the ``cluster_cols`` setter of ``DoubleMLClusterData``.

    Verifies the initial ``cluster_vars``, that they follow a change of
    ``cluster_cols``, that invalid values raise the expected errors and that a
    single cluster column given as a string works.
    """
    np.random.seed(3141)
    x_names = [f'X{k}' for k in range(1, 8)]
    df = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    df.columns = x_names + ['y', 'd1', 'd2']
    # X6 and X7 serve as cluster variables, X1..X5 as covariates.
    dml_data = DoubleMLClusterData(df, 'y', ['d1', 'd2'],
                                   cluster_cols=x_names[5:],
                                   x_cols=x_names[:5])

    expected = df[['X6', 'X7']].values
    assert np.array_equal(dml_data.cluster_vars, expected)
    assert dml_data.n_cluster_vars == 2

    # Reordering cluster_cols must reorder the cluster_vars array.
    expected = df[['X7', 'X6']].values
    dml_data.cluster_cols = ['X7', 'X6']
    assert np.array_equal(dml_data.cluster_vars, expected)

    # Names that are not columns of the data are rejected (list and str form).
    msg = r'Invalid cluster variable\(s\) cluster_cols. At least one cluster variable is no data column.'
    for unknown in (['X6', 'X13'], 'X13'):
        with pytest.raises(ValueError, match=msg):
            dml_data.cluster_cols = unknown

    # Values that are neither str nor list are rejected.
    msg = (r'The cluster variable\(s\) cluster_cols must be of str or list type. '
           "5 of type <class 'int'> was passed.")
    with pytest.raises(TypeError, match=msg):
        dml_data.cluster_cols = 5

    # A single cluster variable passed as a plain string.
    expected = df[['X7']].values
    dml_data.cluster_cols = 'X7'
    assert np.array_equal(dml_data.cluster_vars, expected)
    assert dml_data.n_cluster_vars == 1
예제 #8
0
def test_z_cols_setter():
    """Exercise the ``z_cols`` setter of ``DoubleMLData``.

    Verifies that ``z`` follows a change of ``z_cols``, that invalid values
    raise the expected errors, that a single name given as a string works and
    that assigning ``None`` removes the instruments.
    """
    np.random.seed(3141)
    x_names = [f'X{k}' for k in range(1, 5)]
    z_names = [f'z{k}' for k in range(1, 4)]
    df = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    df.columns = x_names + z_names + ['y', 'd1', 'd2']
    dml_data = DoubleMLData(df, 'y', ['d1', 'd2'], list(x_names), list(z_names))

    # The z array must reflect a newly assigned list of instruments.
    expected_z = dml_data.data[['z1', 'z2']].values
    dml_data.z_cols = ['z1', 'z2']
    assert np.array_equal(dml_data.z, expected_z)

    # Names that are not columns of the data are rejected (list and str form).
    msg = r'Invalid instrumental variable\(s\) z_cols. At least one instrumental variable is no data column.'
    for unknown in (['z1', 'a13'], 'a13'):
        with pytest.raises(ValueError, match=msg):
            dml_data.z_cols = unknown

    # Values that are neither str, list nor None are rejected.
    msg = (r'The instrumental variable\(s\) z_cols must be of str or list type \(or None\). '
           "5 of type <class 'int'> was passed.")
    with pytest.raises(TypeError, match=msg):
        dml_data.z_cols = 5

    # A single instrument passed as a plain string.
    expected_z = dml_data.data[['z2']].values
    dml_data.z_cols = 'z2'
    assert np.array_equal(dml_data.z, expected_z)

    # Assigning None leaves the object without instruments.
    dml_data.z_cols = None
    assert dml_data.n_instr == 0
    assert dml_data.z is None
예제 #9
0
def test_add_vars_in_df():
    """Check that unused extra columns in the data frame do not change the fit.

    A PLR model is fitted once on the full data frame and once on a frame
    restricted to the used columns, both with the same sample splitting; the
    coefficients and standard errors must agree within tolerance.
    """
    np.random.seed(3141)
    used_x = ['X1', 'X11', 'X13']
    df = make_plr_CCDDHNR2018(n_obs=100, return_type='DataFrame')

    data_full = DoubleMLData(df, 'y', 'd', list(used_x))
    data_reduced = DoubleMLData(df[used_x + ['y', 'd']], 'y', 'd', list(used_x))

    model_full = DoubleMLPLR(data_full, Lasso(), Lasso())
    # The second model reuses the splits of the first instead of drawing new ones.
    model_reduced = DoubleMLPLR(data_reduced, Lasso(), Lasso(),
                                draw_sample_splitting=False)
    model_reduced.set_sample_splitting(model_full.smpls)

    model_full.fit()
    model_reduced.fit()

    for attr in ('coef', 'se'):
        assert np.allclose(getattr(model_full, attr),
                           getattr(model_reduced, attr),
                           rtol=1e-9, atol=1e-4)
예제 #10
0
def test_use_other_treat_as_covariate():
    """Check ``set_x_d`` with and without ``use_other_treat_as_covariate``.

    With the flag set, the treatment that is not active must be appended to
    the covariates in ``x``; without it, ``x`` holds only the covariates.
    Invalid flag values and invalid ``set_x_d`` arguments must raise.
    """
    np.random.seed(3141)
    x_names = [f'X{k}' for k in range(1, 8)]
    df = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    df.columns = x_names + ['y', 'd1', 'd2']

    for flag in (True, False):
        dml_data = DoubleMLData(df,
                                'y', ['d1', 'd2'],
                                list(x_names),
                                use_other_treat_as_covariate=flag)
        for active, other in (('d1', 'd2'), ('d2', 'd1')):
            dml_data.set_x_d(active)
            # Only with the flag set does the other treatment join x.
            expected_cols = (x_names + [other]) if flag else list(x_names)
            assert np.array_equal(dml_data.d, df[active].values)
            assert np.array_equal(dml_data.x, df[expected_cols].values)

    # The flag has to be a real bool; the integer 1 is rejected.
    msg = 'use_other_treat_as_covariate must be True or False. Got 1.'
    with pytest.raises(TypeError, match=msg):
        DoubleMLData(df,
                     'y', ['d1', 'd2'],
                     list(x_names),
                     use_other_treat_as_covariate=1)

    # The remaining checks run on the object built last (flag False).
    msg = 'Invalid treatment_var. d3 is not in d_cols.'
    with pytest.raises(ValueError, match=msg):
        dml_data.set_x_d('d3')

    msg = r"treatment_var must be of str type. \['d1', 'd2'\] of type <class 'list'> was passed."
    with pytest.raises(TypeError, match=msg):
        dml_data.set_x_d(['d1', 'd2'])
예제 #11
0
def test_y_col_setter():
    """Exercise the ``y_col`` setter of ``DoubleMLData``.

    Verifies that ``y`` follows a change of ``y_col``, that an unknown column
    name raises ``ValueError`` and that a non-str value raises ``TypeError``.
    """
    np.random.seed(3141)
    x_names = [f'X{k}' for k in range(1, 8)]
    df = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    df.columns = x_names + ['y', 'y123', 'd']
    dml_data = DoubleMLData(df, 'y', 'd', list(x_names))

    # The y array must reflect the newly assigned outcome column.
    expected_y = dml_data.data['y123'].values
    dml_data.y_col = 'y123'
    assert np.array_equal(dml_data.y, expected_y)

    # A name that is not a column of the data is rejected.
    msg = r'Invalid outcome variable y_col. d13 is no data column.'
    with pytest.raises(ValueError, match=msg):
        dml_data.y_col = 'd13'

    # Values that are not str are rejected.
    msg = (r'The outcome variable y_col must be of str type. '
           "5 of type <class 'int'> was passed.")
    with pytest.raises(TypeError, match=msg):
        dml_data.y_col = 5
예제 #12
0
def test_duplicates():
    """Check that duplicate column specifications raise ``ValueError``.

    Each of ``d_cols``, ``x_cols``, ``z_cols`` and ``cluster_cols`` is tested
    through the constructor and through the setter; finally a data frame with
    duplicate column names is passed to both data classes.
    """

    def _frame_with_repeated_y():
        # 100 x 5 frame of zeros whose column name 'y' occurs twice.
        return pd.DataFrame(np.zeros((100, 5)),
                            columns=['y', 'd', 'X3', 'X2', 'y'])

    np.random.seed(3141)
    dml_data = make_plr_CCDDHNR2018(n_obs=100)
    dml_cluster_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)

    # Treatment columns.
    msg = r'Invalid treatment variable\(s\) d_cols: Contains duplicate values.'
    with pytest.raises(ValueError, match=msg):
        DoubleMLData(dml_data.data, y_col='y',
                     d_cols=['d', 'd', 'X1'], x_cols=['X3', 'X2'])
    with pytest.raises(ValueError, match=msg):
        dml_data.d_cols = ['d', 'd', 'X1']

    # Covariates.
    msg = 'Invalid covariates x_cols: Contains duplicate values.'
    with pytest.raises(ValueError, match=msg):
        DoubleMLData(dml_data.data, y_col='y',
                     d_cols=['d'], x_cols=['X3', 'X2', 'X3'])
    with pytest.raises(ValueError, match=msg):
        dml_data.x_cols = ['X3', 'X2', 'X3']

    # Instruments.
    msg = r'Invalid instrumental variable\(s\) z_cols: Contains duplicate values.'
    with pytest.raises(ValueError, match=msg):
        DoubleMLData(dml_data.data, y_col='y',
                     d_cols=['d'], x_cols=['X3', 'X2'],
                     z_cols=['X15', 'X12', 'X12', 'X15'])
    with pytest.raises(ValueError, match=msg):
        dml_data.z_cols = ['X15', 'X12', 'X12', 'X15']

    # Cluster variables.
    msg = r'Invalid cluster variable\(s\) cluster_cols: Contains duplicate values.'
    with pytest.raises(ValueError, match=msg):
        DoubleMLClusterData(dml_cluster_data.data, y_col='y',
                            d_cols=['d'], cluster_cols=['X3', 'X2', 'X3'])
    with pytest.raises(ValueError, match=msg):
        dml_cluster_data.cluster_cols = ['X3', 'X2', 'X3']

    # Duplicate column names in the data frame itself.
    msg = 'Invalid pd.DataFrame: Contains duplicate column names.'
    with pytest.raises(ValueError, match=msg):
        DoubleMLData(_frame_with_repeated_y(),
                     y_col='y', d_cols=['d'], x_cols=['X3', 'X2'])
    with pytest.raises(ValueError, match=msg):
        DoubleMLClusterData(_frame_with_repeated_y(),
                            y_col='y', d_cols=['d'], cluster_cols=['X2'])
import pytest
import pandas as pd
import numpy as np

from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV, DoubleMLData, DoubleMLClusterData
from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, make_pliv_CHS2015, make_iivm_data,\
    make_pliv_multiway_cluster_CKMS2021

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.base import BaseEstimator

# Module-level fixtures shared by the tests in this file.
# NOTE(review): the statement order presumably determines which random draws
# each generated data set receives after seeding — keep it unchanged.
np.random.seed(3141)
# Small PLR data set (10 observations) and the learners used with it.
dml_data = make_plr_CCDDHNR2018(n_obs=10)
ml_g = Lasso()
ml_m = Lasso()
ml_r = Lasso()
dml_plr = DoubleMLPLR(dml_data, ml_g, ml_m)

# One small data set per further model type (IRM, IIVM, PLIV, clustered PLIV).
dml_data_irm = make_irm_data(n_obs=10)
dml_data_iivm = make_iivm_data(n_obs=10)
dml_data_pliv = make_pliv_CHS2015(n_obs=10, dim_z=1)
dml_cluster_data_pliv = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)
# IIVM data as arrays, used to build variants with a 0/1 outcome.
(x, y, d, z) = make_iivm_data(n_obs=30, return_type="array")
# Recode the outcome in place: positive values become 1, negative values 0
# (entries equal to 0 are left untouched).
y[y > 0] = 1
y[y < 0] = 0
# Same recoded outcome, once without and once with the instrument array z.
dml_data_irm_binary_outcome = DoubleMLData.from_arrays(x, y, d)
dml_data_iivm_binary_outcome = DoubleMLData.from_arrays(x, y, d, z)


@pytest.mark.ci
def test_doubleml_exception_data():
예제 #14
0
import pytest
import pandas as pd
import numpy as np

from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV
from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, make_pliv_CHS2015, make_iivm_data

from sklearn.linear_model import Lasso, LogisticRegression

# Module-level fixtures: one generated data set and one model object per
# model class, used as parameters of the test below.
# NOTE(review): the statement order presumably determines which random draws
# each generated data set receives after seeding — keep it unchanged.
np.random.seed(3141)
dml_data_plr = make_plr_CCDDHNR2018(n_obs=100)
dml_data_pliv = make_pliv_CHS2015(n_obs=100, dim_z=1)
dml_data_irm = make_irm_data(n_obs=100)
dml_data_iivm = make_iivm_data(n_obs=100)

# PLR and PLIV use Lasso for every learner; IRM and IIVM combine a Lasso
# learner with LogisticRegression learner(s).
dml_plr = DoubleMLPLR(dml_data_plr, Lasso(), Lasso())
dml_pliv = DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso())
dml_irm = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression())
dml_iivm = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(),
                        LogisticRegression())


@pytest.mark.ci
@pytest.mark.parametrize(
    'dml_obj, cls',
    [
        (dml_plr, DoubleMLPLR),
        (dml_pliv, DoubleMLPLIV),
        (dml_irm, DoubleMLIRM),
        (dml_iivm, DoubleMLIIVM),
    ])
def test_plr_return_types(dml_obj, cls):
    """Check the types of the string representation and the summary.

    ``dml_obj`` is one of the module-level model objects; ``cls`` is its model
    class and is not used by the assertions below.
    """
    # ToDo: A second test case with multiple treatment variables would be helpful
    representation = dml_obj.__str__()
    assert isinstance(representation, str)

    summary_table = dml_obj.summary
    assert isinstance(summary_table, pd.DataFrame)