示例#1
0
def test_transform():
    """Round-trip ``transform``/``inverse_transform`` on the "adult" catalog.

    A catalog built with MinMax scaling and OneHot_drop_binary encoding is
    checked against a catalog built with Identity scaling and encoding.
    Frames are compared with ``check_dtype=False``.
    """
    dataset = "adult"
    encoded = OnlineCatalog(
        dataset, scaling_method="MinMax", encoding_method="OneHot_drop_binary"
    )
    plain = OnlineCatalog(
        dataset, scaling_method="Identity", encoding_method="Identity"
    )

    def assert_same_frame(left, right):
        # Column order may differ between the frames, so sort by column name.
        assert_frame_equal(
            left.sort_index(axis=1), right.sort_index(axis=1), check_dtype=False
        )

    # inverse_transform of the encoded frame gives back the untransformed frame
    assert_same_frame(encoded.inverse_transform(encoded.df), plain.df)

    # transform of the untransformed frame gives the encoded frame
    assert_same_frame(encoded.transform(plain.df), encoded.df)

    # inverse_transform followed by transform is the identity on the encoded frame
    restored = encoded.inverse_transform(encoded.df)
    assert_same_frame(encoded.transform(restored), encoded.df)

    # transform followed by inverse_transform is the identity on the raw frame
    re_encoded = encoded.transform(plain.df)
    assert_same_frame(encoded.inverse_transform(re_encoded), plain.df)
示例#2
0
def test_properties():
    """Check ``backend`` and ``feature_input_order`` of a tensorflow "ann" model."""
    catalog = OnlineCatalog("adult")
    ann = MLModelCatalog(catalog, "ann", backend="tensorflow")

    # Feature order the model is expected to report for the adult data set.
    continuous_features = [
        "age",
        "fnlwgt",
        "education-num",
        "capital-gain",
        "capital-loss",
        "hours-per-week",
    ]
    encoded_features = [
        "workclass_Private",
        "marital-status_Non-Married",
        "occupation_Other",
        "relationship_Non-Husband",
        "race_White",
        "sex_Male",
        "native-country_US",
    ]

    assert ann.backend == "tensorflow"
    assert ann.feature_input_order == continuous_features + encoded_features
示例#3
0
def test_predictions_with_pipeline(model_type, data_name):
    """Check prediction output shapes when ``use_pipeline`` is switched on.

    The model receives rows taken straight from ``data.df``: row 22 as a
    one-row frame, and rows 0-21 as a 22-row frame.
    """
    data = OnlineCatalog(data_name)

    model = MLModelCatalog(data, model_type, backend="tensorflow")
    model.use_pipeline = True

    one_row = data.df.iloc[22].to_frame().T
    batch = data.df.iloc[0:22]

    # predict -> one output column, predict_proba -> two output columns
    for predict_fn, n_outputs in ((model.predict, 1), (model.predict_proba, 2)):
        for frame, n_rows in ((one_row, 1), (batch, 22)):
            assert predict_fn(frame).shape == (n_rows, n_outputs)
示例#4
0
def test_predictions_pt(model_type, data_name):
    """Check shape and type of pytorch-backend predictions on data frames.

    Row 22 of ``data.df`` is used as a one-row frame and rows 0-21 as a
    22-row frame. Every output of ``predict`` and ``predict_proba`` must be a
    ``np.ndarray`` with the expected shape.

    Parameters
    ----------
    model_type :
        Model type forwarded to ``_train_model``.
    data_name :
        Data set name forwarded to ``OnlineCatalog``.
    """
    data = OnlineCatalog(data_name)
    model = _train_model(data, model_type, backend="pytorch")

    single_sample = data.df.iloc[[22]]
    samples = data.df.iloc[0:22]

    # (prediction callable, input frame, expected output shape)
    cases = (
        (model.predict, single_sample, (1, 1)),
        (model.predict, samples, (22, 1)),
        (model.predict_proba, single_sample, (1, 2)),
        (model.predict_proba, samples, (22, 2)),
    )
    for predict_fn, frame, expected_shape in cases:
        output = predict_fn(frame)
        assert output.shape == expected_shape
        # The type check runs on the output just produced. The earlier version
        # re-checked the single-sample probabilities for the bulk case.
        assert isinstance(output, np.ndarray)
示例#5
0
def test_forest(backend):
    """Check prediction output shapes of a "forest" model for ``backend``.

    Uses the "give_me_some_credit" data set. ``predict`` is expected to
    return a 1-D result and ``predict_proba`` a two-column result.
    """
    data = OnlineCatalog("give_me_some_credit")
    model = _train_model(data, "forest", backend=backend)

    one_row = data.df.iloc[[22]]
    batch = data.df.iloc[0:22]

    # Non-probabilistic predictions: single row, then bulk.
    assert model.predict(one_row).shape == (1,)
    assert model.predict(batch).shape == (22,)

    # Probabilistic predictions: single row, then bulk.
    assert model.predict_proba(one_row).shape == (1, 2)
    assert model.predict_proba(batch).shape == (22, 2)
示例#6
0
def test_forest_properties():
    """Smoke test: an sklearn "forest" model can be built for "adult"."""
    catalog = OnlineCatalog("adult")
    forest = MLModelCatalog(catalog, "forest", backend="sklearn")

    assert forest is not None
示例#7
0
def test_cs_vae():
    """Fit a CSVAE for one epoch, check the output shape, then load it.

    The first element of ``predict``'s result must have the same shape as the
    all-zero ``(1, 13)`` input. A second CSVAE with the same configuration is
    then asked to ``load(11)``.
    """
    # Build data and mlmodel
    dataset = "adult"
    data = OnlineCatalog(dataset)
    model = MLModelCatalog(data, "ann", backend="pytorch")

    zeros_input = torch.Tensor(np.zeros((1, 13)))
    class_input = torch.Tensor(np.array([[0, 0]]))

    layer_sizes = [11, 16, 8]
    csvae = CSVAE(dataset, layers=layer_sizes, mutable_mask=model.get_mutable_mask())

    # Training frame: model features followed by the target column.
    training_columns = model.feature_input_order + [data.target]
    csvae.fit(data=data.df[training_columns], epochs=1)

    reconstructed = csvae.predict(zeros_input, class_input)[0]
    assert reconstructed.shape == zeros_input.shape

    # test loading vae
    reloaded = CSVAE(
        dataset, layers=[11, 16, 8], mutable_mask=model.get_mutable_mask()
    )
    reloaded.load(11)
示例#8
0
def test_predictions_tf(model_type, data_name):
    """Check tensorflow prediction output shapes for plain array inputs.

    Inputs are the ``.values`` of ``data.df`` restricted to the model's
    ``feature_input_order``: row 22 reshaped to one row, and rows 0-21.
    """
    data = OnlineCatalog(data_name)
    model = MLModelCatalog(data, model_type, backend="tensorflow")

    one_row = data.df.iloc[22]
    one_row = one_row[model.feature_input_order].values.reshape((1, -1))
    batch = data.df.iloc[0:22]
    batch = batch[model.feature_input_order].values

    # (prediction callable, input array, expected output shape)
    checks = (
        (model.predict, one_row, (1, 1)),
        (model.predict, batch, (22, 1)),
        (model.predict_proba, one_row, (1, 2)),
        (model.predict_proba, batch, (22, 2)),
    )
    for predict_fn, array, wanted_shape in checks:
        assert predict_fn(array).shape == wanted_shape
示例#9
0
def test_variational_autoencoder():
    """Fit a VariationalAutoencoder, check the output shape, then load it.

    The first element returned by ``predict`` must have the same shape as the
    all-zero ``(1, 13)`` input. A second instance with the same configuration
    is then asked to ``load(11)``.
    """
    # Build data and mlmodel
    dataset = "adult"
    data = OnlineCatalog(dataset)
    model = MLModelCatalog(data, "ann", backend="pytorch")

    # Put the input on the GPU when one is available, otherwise on the CPU.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    zeros_input = torch.Tensor(np.zeros((1, 13))).to(device)

    vae = VariationalAutoencoder(
        dataset, layers=[11, 512, 256, 8], mutable_mask=model.get_mutable_mask()
    )
    vae.fit(xtrain=data.df[model.feature_input_order])

    reconstructed, _, _ = vae.predict(zeros_input)
    assert reconstructed.shape == zeros_input.shape

    # test loading vae
    reloaded = VariationalAutoencoder(
        dataset, layers=[11, 512, 256, 8], mutable_mask=model.get_mutable_mask()
    )
    reloaded.load(11)
示例#10
0
def test_data():
    """Check the class relationships of ``OnlineCatalog``, ``DataCatalog`` and ``Data``."""
    catalog = OnlineCatalog("adult")

    # Each class must be a subclass of the one paired with it.
    subclass_pairs = (
        (OnlineCatalog, DataCatalog),
        (DataCatalog, Data),
        (Data, ABC),
    )
    for child, parent in subclass_pairs:
        assert issubclass(child, parent)

    assert isinstance(catalog, Data)
示例#11
0
def test_mlmodel(model_type):
    """Check the class relationships of ``MLModelCatalog`` and ``MLModel``."""
    catalog = OnlineCatalog("adult")
    built_model = MLModelCatalog(catalog, model_type, backend="tensorflow")

    for child, parent in ((MLModelCatalog, MLModel), (MLModel, ABC)):
        assert issubclass(child, parent)

    assert isinstance(built_model, MLModel)
示例#12
0
def test_get_tree():
    """Check that ``_get_tree_from_booster`` returns a list whose first item is a str."""
    catalog = OnlineCatalog("adult")
    forest = MLModelCatalog(catalog, "forest", "xgboost")

    # Only the first entry of the model's tree iterator is inspected.
    first_booster = forest.tree_iterator[0]
    parsed = _get_tree_from_booster(first_booster)

    assert isinstance(parsed, list)
    assert isinstance(parsed[0], str)
示例#13
0
def test_predictions_pt(model_type, data_name):
    """Check shape and type of pytorch-backend predictions for array inputs.

    Each input is supplied twice: as a numpy array and as a ``torch.Tensor``.
    numpy inputs must produce ``np.ndarray`` outputs, and tensor inputs must
    produce tensor outputs. Row 22 of ``data.df`` is the single sample and
    rows 0-21 are the bulk sample, both restricted to the model's
    ``feature_input_order``.

    Parameters
    ----------
    model_type :
        Model type forwarded to ``MLModelCatalog``.
    data_name :
        Data set name forwarded to ``OnlineCatalog``.
    """
    data = OnlineCatalog(data_name)
    model = MLModelCatalog(data, model_type, backend="pytorch")

    single_sample = data.df.iloc[22]
    single_sample = single_sample[model.feature_input_order].values.reshape((1, -1))
    single_sample_torch = torch.Tensor(single_sample)

    samples = data.df.iloc[0:22]
    samples = samples[model.feature_input_order].values
    samples_torch = torch.Tensor(samples)

    # predict -> one output column, predict_proba -> two output columns
    for predict_fn, n_outputs in ((model.predict, 1), (model.predict_proba, 2)):
        for array_input, tensor_input, n_rows in (
            (single_sample, single_sample_torch, 1),
            (samples, samples_torch, 22),
        ):
            expected_shape = (n_rows, n_outputs)

            # numpy in -> numpy out. The type check runs on the output just
            # produced. The earlier version re-checked the single-sample
            # probabilities for the bulk case.
            array_output = predict_fn(array_input)
            assert array_output.shape == expected_shape
            assert isinstance(array_output, np.ndarray)

            # tensor in -> tensor out
            tensor_output = predict_fn(tensor_input)
            assert tensor_output.shape == expected_shape
            assert torch.is_tensor(tensor_output)
示例#14
0
def test_adult_col(data_name):
    """Check that the catalog's declared columns match the columns of ``df``.

    The declared columns are ``categorical + continuous + [target]``. They
    are compared to ``df.columns`` with order ignored.

    Parameters
    ----------
    data_name :
        Data set name forwarded to ``OnlineCatalog``.
    """
    data_catalog = OnlineCatalog(data_name)

    declared_columns = (
        data_catalog.categorical + data_catalog.continuous + [data_catalog.target]
    )
    frame_columns = data_catalog.df.columns

    # ``list.sort()`` and ``ndarray.sort()`` sort in place and return ``None``,
    # so comparing their return values always passes. ``sorted`` returns new
    # lists and leaves the catalog's own data untouched.
    assert sorted(declared_columns) == sorted(frame_columns)
示例#15
0
def test_cfmodel():
    """Check the class relationships of ``Dice`` and ``RecourseMethod``."""
    catalog = OnlineCatalog("adult")

    dice_params = {"num": 1, "desired_class": 1}
    ann = MLModelCatalog(catalog, "ann", backend="tensorflow")
    method = Dice(ann, dice_params)

    for child, parent in ((Dice, RecourseMethod), (RecourseMethod, ABC)):
        assert issubclass(child, parent)

    assert isinstance(method, RecourseMethod)
示例#16
0
def test_autoencoder():
    """Train autoencoders with several layer layouts and check the output shape.

    Three configurations are trained for 5 epochs without saving: two with
    the default loss and different layer counts, and one with a custom loss.
    Each fitted autoencoder must map the ``(1, 13)`` input to a ``(1, 13)``
    output.
    """
    # Build data and mlmodel
    dataset = "adult"
    data = OnlineCatalog(dataset)

    model = MLModelCatalog(data, "ann", backend="tensorflow")
    zeros_input = tf.Variable(np.zeros((1, 13)), dtype=tf.float32)

    def custom_loss(y_true, y_pred):
        return K.max(y_true - y_pred)

    # (layer sizes after the input layer, extra constructor keyword arguments)
    configurations = (
        ([20, 10, 5], {}),
        # test with different lengths
        ([5], {}),
        # test with different loss function
        ([20, 15, 10, 8, 5], {"loss": custom_loss}),
    )

    for inner_layers, extra_kwargs in configurations:
        layer_sizes = [len(model.feature_input_order)] + inner_layers
        ae = Autoencoder(dataset, layer_sizes, **extra_kwargs)
        fitted_ae = train_autoencoder(
            ae,
            data,
            model.feature_input_order,
            epochs=5,
            save=False,
        )

        assert fitted_ae(zeros_input).shape == (1, 13)
示例#17
0
def test_parse_booster():
    """Check that ``parse_booster`` returns five non-empty sequences."""
    catalog = OnlineCatalog("adult")
    forest = MLModelCatalog(catalog, "forest", "xgboost")
    first_tree = forest.tree_iterator[0]

    # Unpacking also verifies that exactly five values come back.
    lefts, rights, cut_points, feature_ids, leaf_scores = parse_booster(first_tree)

    for part in (lefts, rights, cut_points, feature_ids, leaf_scores):
        assert len(part) > 0
示例#18
0
def make_benchmark(data_name="adult", model_name="ann"):
    """Build a ``Benchmark`` that uses ``Dice`` on a tensorflow model.

    Parameters
    ----------
    data_name :
        Data set name forwarded to ``OnlineCatalog``.
    model_name :
        Model type forwarded to ``MLModelCatalog``.

    Returns
    -------
    The ``Benchmark`` built from the model, the Dice recourse method, and the
    first five rows returned by ``predict_negative_instances``.
    """
    # get data and mlmodel
    catalog = OnlineCatalog(data_name)
    mlmodel = MLModelCatalog(catalog, model_name, backend="tensorflow")

    # get factuals: keep only the first five rows
    first_factuals = predict_negative_instances(mlmodel, catalog.df).iloc[:5]

    # get recourse method
    dice = Dice(mlmodel, {"num": 1, "desired_class": 1})

    # make benchmark object
    return Benchmark(mlmodel, dice, first_factuals)
示例#19
0
def test_parse_node():
    """Check ``_parse_node`` on one split-node string and one leaf string.

    The strings are the first entry without "leaf" and the first entry
    containing "leaf" in the list returned by ``_get_tree_from_booster``.
    """
    catalog = OnlineCatalog("adult")
    forest = MLModelCatalog(catalog, "forest", "xgboost")
    first_booster = forest.tree_iterator[0]

    nodes = _get_tree_from_booster(first_booster)

    # First leaf entry and first non-leaf entry; None when no such entry exists.
    leaf_text = next((entry for entry in nodes if "leaf" in entry), None)
    split_text = next((entry for entry in nodes if "leaf" not in entry), None)

    # A split node has a real threshold, feature and children, and no score.
    _, threshold, feature, left_child, right_child, score = _parse_node(split_text)

    assert threshold != TREE_UNDEFINED
    assert feature != TREE_UNDEFINED
    assert left_child != TREE_LEAF
    assert right_child != TREE_LEAF
    assert score is None

    # A leaf carries only a score; every other field holds its sentinel value.
    _, threshold, feature, left_child, right_child, score = _parse_node(leaf_text)

    assert threshold == TREE_UNDEFINED
    assert feature == TREE_UNDEFINED
    assert left_child == TREE_LEAF
    assert right_child == TREE_LEAF
    assert 0 <= score <= 1
示例#20
0
def test_predictions_tf(model_type, data_name):
    """Check tensorflow prediction output shapes for data-frame inputs.

    Row 22 of ``data.df`` is used as a one-row frame and rows 0-21 as a
    22-row frame.
    """
    data = OnlineCatalog(data_name)
    model = _train_model(data, model_type, backend="tensorflow")

    # Map the expected row count to the frame that should produce it.
    frames = {1: data.df.iloc[[22]], 22: data.df.iloc[0:22]}

    # Non-probabilistic predictions have one column.
    for n_rows, frame in frames.items():
        assert model.predict(frame).shape == (n_rows, 1)

    # Probabilistic predictions have two columns.
    for n_rows, frame in frames.items():
        assert model.predict_proba(frame).shape == (n_rows, 2)
示例#21
0
def test_save_and_load():
    """Check that a saved autoencoder, once loaded, reproduces the fitted output.

    An autoencoder is trained for 5 epochs with ``save=True``. A fresh
    ``Autoencoder`` then loads by input dimension, and both are evaluated on
    the same all-zero ``(1, 13)`` input inside one ``tf.Session``.
    """
    with tf.Session() as sess:
        # Build data and mlmodel
        dataset = "adult"
        data = OnlineCatalog(dataset)

        model = MLModelCatalog(data, "ann", backend="tensorflow")
        zeros_input = tf.Variable(np.zeros((1, 13)), dtype=tf.float32)

        layer_sizes = [len(model.feature_input_order), 20, 10, 5]
        fitted_ae = train_autoencoder(
            Autoencoder(dataset, layer_sizes),
            data,
            model.feature_input_order,
            epochs=5,
            save=True,
        )
        expected = fitted_ae(zeros_input)

        # Load by input dimension and run the same input through it.
        input_dim = len(model.feature_input_order)
        loaded_ae = Autoencoder(dataset).load(input_dim)
        actual = loaded_ae(zeros_input)

        actual_values = actual.eval(session=sess)
        expected_values = expected.eval(session=sess)
        assert (actual_values == expected_values).all()
示例#22
0
)
args = parser.parse_args()
setup = load_setup()

results = pd.DataFrame()

path = args.path

session_models = ["cem", "cem-vae"]
torch_methods = ["cchvae", "clue", "cruds", "wachter", "revise"]
for rm in args.recourse_method:
    backend = "tensorflow"
    if rm in torch_methods:
        backend = "pytorch"
    for data_name in args.dataset:
        dataset = OnlineCatalog(data_name)
        for model_type in args.type:
            log.info("=====================================")
            log.info("Recourse method: {}".format(rm))
            log.info("Dataset: {}".format(data_name))
            log.info("Model type: {}".format(model_type))

            if rm in session_models:
                graph = Graph()
                with graph.as_default():
                    ann_sess = Session()
                    with ann_sess.as_default():
                        mlmodel_sess = MLModelCatalog(dataset, model_type,
                                                      backend)

                        factuals_sess = predict_negative_instances(
示例#23
0
def test_constraint_violations():
    """Check ``constraint_violation`` on five hand-written adult rows.

    The counterfactual rows differ from the factual rows as follows: row 0
    changes age and sex, row 2 changes age, row 4 changes sex, and rows 1 and
    3 are unchanged. The expected result is ``[[2], [0], [1], [0], [1]]``.
    """
    # Build data and mlmodel
    data = OnlineCatalog("adult")

    # get factuals
    column_names = [
        "age",
        "workclass",
        "fnlwgt",
        "education-num",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "capital-gain",
        "capital-loss",
        "hours-per-week",
        "native-country",
        "income",
    ]

    # fmt: off
    factual_rows = [
        [39, "Non-Private", 77516, 13, "Non-Married", "Managerial-Specialist", "Non-Husband", "White", "Male", 2174, 0, 40, "US", 0],
        [50, "Non-Private", 83311, 13, "Married", "Managerial-Specialist", "Husband", "White", "Male", 0, 0, 13, "US", 0],
        [38, "Private", 215646, 9, "Non-Married", "Other", "Non-Husband", "White", "Male", 0, 0, 40, "US", 0],
        [53, "Private", 234721, 7, "Married", "Other", "Husband", "Non-White", "Male", 0, 0, 40, "US", 0],
        [28, "Private", 338409, 13, "Married", "Managerial-Specialist", "Non-Husband", "Non-White", "Female", 0, 0, 40, "Non-US", 0],
    ]
    counterfactual_rows = [
        [45, "Non-Private", 77516, 13, "Non-Married", "Managerial-Specialist", "Non-Husband", "White", "Female", 2174, 0, 40, "US", 0],
        [50, "Non-Private", 83311, 13, "Married", "Managerial-Specialist", "Husband", "White", "Male", 0, 0, 13, "US", 0],
        [18, "Private", 215646, 9, "Non-Married", "Other", "Non-Husband", "White", "Male", 0, 0, 40, "US", 0],
        [53, "Private", 234721, 7, "Married", "Other", "Husband", "Non-White", "Male", 0, 0, 40, "US", 0],
        [28, "Private", 338409, 13, "Married", "Managerial-Specialist", "Non-Husband", "Non-White", "Male", 0, 0, 40, "Non-US", 0],
    ]
    # fmt: on

    factual_frame = pd.DataFrame(factual_rows, columns=column_names)
    counterfactual_frame = pd.DataFrame(counterfactual_rows, columns=column_names)

    # Both frames go through the catalog's transform before being compared.
    counterfactual_frame = data.transform(counterfactual_frame)
    factual_frame = data.transform(factual_frame)

    expected = [[2], [0], [1], [0], [1]]
    actual = constraint_violation(data, counterfactual_frame, factual_frame)

    assert expected == actual