def test_doubleml_exception_scores():
    """Check that every model class rejects an unsupported ``score`` argument.

    For each of PLR, IRM, IIVM and PLIV, a score string the model does not
    accept is expected to raise ``ValueError`` and the integer ``0`` (neither
    a string nor a callable) is expected to raise ``TypeError``.
    """
    wrong_type_msg = 'score should be either a string or a callable. 0 was passed.'

    # Each entry: (builder taking the score, rejected score string, expected ValueError message).
    # The builders are lambdas so that fresh learner objects are created for every construction.
    scenarios = [
        (lambda score: DoubleMLPLR(dml_data, ml_g, ml_m, score=score),
         'IV',
         'Invalid score IV. Valid score IV-type or partialling out.'),
        (lambda score: DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression(), score=score),
         'IV',
         'Invalid score IV. Valid score ATE or ATTE.'),
        (lambda score: DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression(), score=score),
         'ATE',
         'Invalid score ATE. Valid score LATE.'),
        (lambda score: DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso(), score=score),
         'IV',
         'Invalid score IV. Valid score partialling out.'),
    ]

    for build, rejected_score, wrong_value_msg in scenarios:
        with pytest.raises(ValueError, match=wrong_value_msg):
            build(rejected_score)
        with pytest.raises(TypeError, match=wrong_type_msg):
            build(0)
def test_pliv_callable_not_implemented():
    """Check that fitting PLIV variants with a callable score raises ``NotImplementedError``.

    A data set with two instruments is generated and passed, together with a
    callable score, to the private ``_partialX``, ``_partialZ`` and
    ``_partialXZ`` constructors; calling ``fit()`` on each resulting object is
    expected to fail with the variant-specific message.
    """
    np.random.seed(3141)
    data_two_instruments = make_pliv_CHS2015(n_obs=100, dim_z=2)
    callable_score = dml_pliv._score_elements

    # Each entry: (constructor name, number of Lasso learners it receives, expected message).
    variants = [
        ('_partialX', 3,
         'Callable score not implemented for DoubleMLPLIV.partialX with several instruments.'),
        ('_partialZ', 1,
         'Callable score not implemented for DoubleMLPLIV.partialZ.'),
        ('_partialXZ', 3,
         'Callable score not implemented for DoubleMLPLIV.partialXZ.'),
    ]

    for constructor_name, n_learners, expected_msg in variants:
        constructor = getattr(DoubleMLPLIV, constructor_name)
        learners = [Lasso() for _ in range(n_learners)]
        model = constructor(data_two_instruments, *learners, score=callable_score)
        with pytest.raises(NotImplementedError, match=expected_msg):
            model.fit()
def test_doubleml_cluster_not_yet_implemented():
    """Check the ``NotImplementedError`` cases of DoubleMLPLIV with clustered data.

    Exercised in order: ``bootstrap()`` after fitting, externally setting the
    sample splitting, data with three cluster variables, and disabling
    cross-fitting (via ``n_folds=1`` or ``apply_cross_fitting=False``).
    """
    clustered_model = DoubleMLPLIV(dml_cluster_data_pliv, ml_g, ml_m, ml_r)
    clustered_model.fit()

    with pytest.raises(NotImplementedError,
                       match='bootstrap not yet implemented with clustering.'):
        clustered_model.bootstrap()

    # Sample splitting taken from the (non-clustered) PLR model object.
    external_smpls = dml_plr.smpls
    with pytest.raises(NotImplementedError,
                       match=('Externally setting the sample splitting for DoubleML is '
                              'not yet implemented with clustering.')):
        clustered_model.set_sample_splitting(external_smpls)

    # Derive a third cluster variable from the two existing ones to obtain three-way clustering.
    three_way_df = dml_cluster_data_pliv.data.copy()
    three_way_df['cluster_var_k'] = three_way_df['cluster_var_i'] + three_way_df['cluster_var_j'] - 2
    three_way_data = DoubleMLClusterData(
        three_way_df,
        y_col='Y',
        d_cols='D',
        x_cols=['X1', 'X5'],
        z_cols='Z',
        cluster_cols=['cluster_var_i', 'cluster_var_j', 'cluster_var_k'])
    assert three_way_data.n_cluster_vars == 3
    with pytest.raises(NotImplementedError,
                       match=r'Multi-way \(n_ways > 2\) clustering not yet implemented.'):
        DoubleMLPLIV(three_way_data, ml_g, ml_m, ml_r)

    no_cross_fitting_msg = (r'No cross-fitting \(`apply_cross_fitting = False`\) '
                            'is not yet implemented with clustering.')
    for resampling_kwargs in ({'n_folds': 1},
                              {'apply_cross_fitting': False, 'n_folds': 2}):
        with pytest.raises(NotImplementedError, match=no_cross_fitting_msg):
            DoubleMLPLIV(dml_cluster_data_pliv, ml_g, ml_m, ml_r,
                         **resampling_kwargs)
def test_doubleml_exception_smpls():
    """Check that reading the sample splitting before it exists raises ``ValueError``.

    Both model objects are created with ``draw_sample_splitting=False``:

    * for the PLR model, accessing ``smpls`` is expected to fail;
    * for the PLIV model on clustered data, accessing ``smpls_cluster`` and
      ``smpls`` is expected to fail.
    """
    # ``match`` is applied as a regular expression, so literal parentheses must be escaped.
    msg = ('Sample splitting not specified. '
           r'Either draw samples via .draw_sample splitting\(\) or set external samples via .set_sample_splitting\(\).')
    dml_plr_no_smpls = DoubleMLPLR(dml_data, ml_g, ml_m, draw_sample_splitting=False)
    with pytest.raises(ValueError, match=msg):
        _ = dml_plr_no_smpls.smpls

    # Escaped like the pattern above: an unescaped ``()`` is an empty regex group,
    # so the parentheses in the error message would not actually be checked.
    msg = r'Sample splitting not specified. Draw samples via .draw_sample splitting\(\).'
    dml_pliv_cluster_no_smpls = DoubleMLPLIV(dml_cluster_data_pliv, ml_g, ml_m, ml_r, draw_sample_splitting=False)
    with pytest.raises(ValueError, match=msg):
        _ = dml_pliv_cluster_no_smpls.smpls_cluster
    with pytest.raises(ValueError, match=msg):
        _ = dml_pliv_cluster_no_smpls.smpls
def test_pliv_callable_vs_str_score():
    """Check that a callable score reproduces the results of the reference PLIV model.

    A second DoubleMLPLIV object is built with ``dml_pliv._score_elements`` as
    its score, given the same sample splitting as ``dml_pliv`` and fitted. Its
    ``psi`` and ``coef`` must then agree with those of ``dml_pliv``.

    NOTE(review): relies on the module-level ``dml_pliv`` already being fitted
    with a string score -- confirm against the module setup.
    """
    score_callable = dml_pliv._score_elements
    model_with_callable = DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso(),
                                       score=score_callable,
                                       draw_sample_splitting=False)
    # Reuse the reference splitting so both fits see identical folds.
    model_with_callable.set_sample_splitting(dml_pliv.smpls)
    model_with_callable.fit()

    tolerances = {'rtol': 1e-9, 'atol': 1e-4}
    for attribute in ('psi', 'coef'):
        assert np.allclose(getattr(dml_pliv, attribute),
                           getattr(model_with_callable, attribute),
                           **tolerances)
def test_doubleml_exception_data():
    """Check the errors raised when a model class is given unsuitable data.

    Covered cases, in order:

    * a plain ``pandas.DataFrame`` instead of a DoubleMLData object (``TypeError``);
    * PLR with instruments, PLIV without instruments, IRM with instruments;
    * IRM and IIVM with a non-binary treatment or with several treatments;
    * IIVM without an instrument, with a non-binary instrument or with
      several instruments.

    All cases except the first are expected to raise ``ValueError``.
    """
    def build_irm(frame, d_cols):
        # Wrap the frame in DoubleMLData and hand it to DoubleMLIRM with fresh learners.
        return DoubleMLIRM(DoubleMLData(frame, 'y', d_cols),
                           Lasso(), LogisticRegression())

    def build_iivm(frame, d_cols, z_cols):
        # Same as build_irm, but for DoubleMLIIVM with explicit instrument columns.
        return DoubleMLIIVM(DoubleMLData(frame, 'y', d_cols, z_cols=z_cols),
                            Lasso(), LogisticRegression(), LogisticRegression())

    with pytest.raises(TypeError, match='The data must be of DoubleMLData type.'):
        DoubleMLPLR(pd.DataFrame(), ml_g, ml_m)

    # PLR with IV
    plr_with_iv_msg = (r'Incompatible data. Z1 have been set as instrumental variable\(s\). '
                       'To fit a partially linear IV regression model use DoubleMLPLIV instead of DoubleMLPLR.')
    with pytest.raises(ValueError, match=plr_with_iv_msg):
        DoubleMLPLR(dml_data_pliv, ml_g, ml_m)

    # PLIV without IV
    pliv_without_iv_msg = ('Incompatible data. '
                           'At least one variable must be set as instrumental variable. '
                           r'To fit a partially linear regression model without instrumental variable\(s\) '
                           'use DoubleMLPLR instead of DoubleMLPLIV.')
    with pytest.raises(ValueError, match=pliv_without_iv_msg):
        DoubleMLPLIV(dml_data, Lasso(), Lasso(), Lasso())

    # IRM with IV
    irm_with_iv_msg = (r'Incompatible data. z have been set as instrumental variable\(s\). '
                       'To fit an interactive IV regression model use DoubleMLIIVM instead of DoubleMLIRM.')
    with pytest.raises(ValueError, match=irm_with_iv_msg):
        DoubleMLIRM(dml_data_iivm, Lasso(), LogisticRegression())

    # IRM: the treatment has to be exactly one binary 0/1 variable
    irm_treatment_msg = ('Incompatible data. To fit an IRM model with DML exactly one binary variable with values 0 and 1 '
                         'needs to be specified as treatment variable.')
    irm_frame = dml_data_irm.data.copy()
    irm_frame['d'] = irm_frame['d'] * 2
    with pytest.raises(ValueError, match=irm_treatment_msg):
        # non-binary D for IRM
        build_irm(irm_frame, 'd')
    irm_frame = dml_data_irm.data.copy()
    with pytest.raises(ValueError, match=irm_treatment_msg):
        # multiple D for IRM
        build_irm(irm_frame, ['d', 'X1'])

    # IIVM: the treatment has to be exactly one binary 0/1 variable
    iivm_treatment_msg = ('Incompatible data. To fit an IIVM model with DML exactly one binary variable with values 0 and 1 '
                          'needs to be specified as treatment variable.')
    iivm_frame = dml_data_iivm.data.copy()
    iivm_frame['d'] = iivm_frame['d'] * 2
    with pytest.raises(ValueError, match=iivm_treatment_msg):
        # non-binary D for IIVM
        build_iivm(iivm_frame, 'd', 'z')
    iivm_frame = dml_data_iivm.data.copy()
    with pytest.raises(ValueError, match=iivm_treatment_msg):
        # multiple D for IIVM
        build_iivm(iivm_frame, ['d', 'X1'], 'z')

    # IIVM: the instrument has to be exactly one binary 0/1 variable
    iivm_instrument_msg = ('Incompatible data. To fit an IIVM model with DML exactly one binary variable with values 0 and 1 '
                           'needs to be specified as instrumental variable.')
    with pytest.raises(ValueError, match=iivm_instrument_msg):
        # IIVM without IV
        DoubleMLIIVM(dml_data_irm,
                     Lasso(), LogisticRegression(), LogisticRegression())
    iivm_frame = dml_data_iivm.data.copy()
    iivm_frame['z'] = iivm_frame['z'] * 2
    with pytest.raises(ValueError, match=iivm_instrument_msg):
        # non-binary Z for IIVM
        build_iivm(iivm_frame, 'd', 'z')
    iivm_frame = dml_data_iivm.data.copy()
    with pytest.raises(ValueError, match=iivm_instrument_msg):
        # multiple Z for IIVM
        build_iivm(iivm_frame, 'd', ['z', 'X1'])
# Example #7
# 0
import pandas as pd
import numpy as np

from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV
from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, make_pliv_CHS2015, make_iivm_data

from sklearn.linear_model import Lasso, LogisticRegression

# Fix the NumPy seed before the data sets below are generated.
np.random.seed(3141)
# One generated data set with 100 observations per model class.
dml_data_plr = make_plr_CCDDHNR2018(n_obs=100)
dml_data_pliv = make_pliv_CHS2015(n_obs=100, dim_z=1)
dml_data_irm = make_irm_data(n_obs=100)
dml_data_iivm = make_iivm_data(n_obs=100)

# Model objects built with default settings; they serve as the parametrized
# inputs of test_plr_return_types below.
dml_plr = DoubleMLPLR(dml_data_plr, Lasso(), Lasso())
dml_pliv = DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso())
dml_irm = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression())
dml_iivm = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(),
                        LogisticRegression())


@pytest.mark.ci
@pytest.mark.parametrize('dml_obj, cls', [
    (dml_plr, DoubleMLPLR),
    (dml_pliv, DoubleMLPLIV),
    (dml_irm, DoubleMLIRM),
    (dml_iivm, DoubleMLIIVM),
])
def test_plr_return_types(dml_obj, cls):
    """Check the return types of the basic accessors of a model object.

    Despite its name the test is parametrized over the PLR, PLIV, IRM and
    IIVM objects: ``__str__()`` must return a ``str``, ``summary`` must be a
    ``pandas.DataFrame`` and ``draw_sample_splitting()`` must return an
    instance of the model's own class.
    """
    # ToDo: A second test case with multiple treatment variables would be helpful
    rendered = dml_obj.__str__()
    assert isinstance(rendered, str)

    summary_table = dml_obj.summary
    assert isinstance(summary_table, pd.DataFrame)

    resampled_obj = dml_obj.draw_sample_splitting()
    assert isinstance(resampled_obj, cls)
import pytest
import numpy as np

from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV
from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, make_pliv_CHS2015, make_iivm_data

from sklearn.linear_model import Lasso, LogisticRegression

# Fix the NumPy seed before the data sets below are generated.
np.random.seed(3141)
# One generated data set with 100 observations per model class.
dml_data_plr = make_plr_CCDDHNR2018(n_obs=100)
dml_data_pliv = make_pliv_CHS2015(n_obs=100, dim_z=1)
dml_data_irm = make_irm_data(n_obs=100)
dml_data_iivm = make_iivm_data(n_obs=100)

# Model objects built without any resampling arguments; dml_plr is checked
# by test_plr_defaults below.
# NOTE(review): the other three objects are presumably used by tests outside
# this excerpt -- confirm.
dml_plr = DoubleMLPLR(dml_data_plr, Lasso(), Lasso())
dml_pliv = DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso())
dml_irm = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression())
dml_iivm = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(),
                        LogisticRegression())


def _assert_resampling_default_settings(dml_obj):
    """Assert that ``dml_obj`` carries the expected default resampling settings.

    ``n_folds`` must equal 5, ``n_rep`` must equal 1, and both
    ``draw_sample_splitting`` and ``apply_cross_fitting`` must be truthy.
    An ``AssertionError`` is raised at the first check that fails.
    """
    # Attributes compared against an exact value.
    for attribute, expected in (('n_folds', 5), ('n_rep', 1)):
        assert getattr(dml_obj, attribute) == expected
    # Attributes that only need to evaluate as true.
    for attribute in ('draw_sample_splitting', 'apply_cross_fitting'):
        assert getattr(dml_obj, attribute)


@pytest.mark.ci
def test_plr_defaults():
    """Check that the module-level PLR model passes the default resampling checks."""
    _assert_resampling_default_settings(dml_plr)
# Example #9
# 0
# %%
# Initialize the objects of class DoubleMLData and DoubleMLPLIV
# -------------------------------------------------------------
# NOTE(review): obj_dml_data, N and M are not defined in this excerpt; they
# are presumably created earlier in the script -- confirm.

# Set machine learning methods for g, m & r: three independent clones of the
# same random forest regressor
learner = RandomForestRegressor(max_depth=2, n_estimators=10)
ml_g = clone(learner)
ml_m = clone(learner)
ml_r = clone(learner)

# initialize the DoubleMLPLIV object; no sample splitting is drawn here
# (draw_sample_splitting=False) because it is set explicitly further below
dml_pliv_obj = DoubleMLPLIV(obj_dml_data,
                            ml_g,
                            ml_m,
                            ml_r,
                            score='partialling out',
                            dml_procedure='dml1',
                            draw_sample_splitting=False)

# %%
# Split samples and transfer the sample splitting to the object
# -------------------------------------------------------------

K = 3  # number of folds
# sizes passed to the multiway resampling
# NOTE(review): presumably the number of units along each of the two cluster dimensions -- confirm
smpl_sizes = [N, M]
obj_dml_multiway_resampling = DoubleMLMultiwayResampling(K, smpl_sizes)
# split_samples() returns two objects; only the second (smpls_lin_ind) is used below
smpls_multi_ind, smpls_lin_ind = obj_dml_multiway_resampling.split_samples()

# the splitting is handed over wrapped in a one-element list
# NOTE(review): presumably one list entry per repetition -- confirm
dml_pliv_obj.set_sample_splitting([smpls_lin_ind])