示例#1
0
def test_get_params(teardown):
    """Check that ``Model.get_params`` reports every named step and its params.

    The expected mapping holds one entry per step name, plus
    ``<step>__<param>`` entries for the two ``DummyEstimator`` steps. The
    ``Concatenate`` step only appears under its own name.
    """
    first_dummy = DummyEstimator(name="dummy1")
    second_dummy = DummyEstimator(x=456, y="def", name="dummy2")
    # Concatenate is a step without get_params/set_params.
    concat_step = Concatenate(name="concat")

    # The graph itself is meaningless; what matters is that ``second_dummy``
    # is a shared step, i.e. it is called twice.
    in_a = Input()
    in_b = Input()
    hidden = first_dummy(in_a)
    merged = concat_step([in_a, hidden])
    out_a = second_dummy(merged)
    out_b = second_dummy(in_b, compute_func=lambda X: X * 2, trainable=False)
    model = Model([in_a, in_b], [out_a, out_b])

    expected = {
        "dummy1": first_dummy,
        "dummy2": second_dummy,
        "concat": concat_step,
        # dummy1 was built without x/y arguments, so these two values are
        # presumably the DummyEstimator defaults -- confirm against its definition.
        "dummy1__x": 123,
        "dummy1__y": "abc",
        "dummy2__x": 456,
        "dummy2__y": "def",
    }

    assert model.get_params() == expected
示例#2
0
    def test_with_missing_inputs(self, teardown):
        """Expect ``ValueError`` when a Model omits an input its output needs."""
        declared_input = Input()
        undeclared_input = Input()
        concatenated = Concatenate()([declared_input, undeclared_input])

        # The output is built from both inputs, but only the first is passed.
        with pytest.raises(ValueError):
            Model(declared_input, concatenated)
示例#3
0
def test_concatenate(teardown):
    """Check that a ``Concatenate(axis=1)`` step matches ``np.concatenate``.

    Two inputs with 2 and 3 columns are joined through a model, and the
    prediction is compared element-wise with NumPy's result along axis 1.
    """
    left_in = Input()
    right_in = Input()
    joined = Concatenate(axis=1)([left_in, right_in])
    model = Model([left_in, right_in], joined)

    left = np.array([[1, 2], [10, 20]])
    right = np.array([[3, 4, 5], [30, 40, 50]])

    predicted = model.predict([left, right])
    reference = np.concatenate([left, right], axis=1)

    assert_array_equal(predicted, reference)
示例#4
0
    def test_simple(self, teardown):
        """Smoke test: build a full model and two submodels from one graph.

        Nothing is asserted; the test passes as long as every ``Model``
        construction below completes without raising.
        """
        features_in = Input()
        extra_in = Input()
        target_in = Input()

        reduced = PCA()(features_in)
        encoded_target = LabelEncoder()(target_in)
        merged = Concatenate()([reduced, extra_in])
        prediction = LogisticRegression()(merged, encoded_target)
        # TODO: support shareable steps to reuse LabelEncoder with compute_func="inverse_transform"

        # Full model: both inputs through to the classifier output, with the
        # raw target input.
        Model([features_in, extra_in], prediction, target_in)

        # Submodels cut out of the same graph.
        Model(features_in, reduced)
        Model(merged, prediction, encoded_target)
示例#5
0
def test_fit_predict_naive_stack_with_proba_features(teardown):
    """Compare a baikal-built naive stack with the same stack written by hand.

    Both variants train a logistic regression (``predict_proba`` output) and
    a random forest (``apply`` output) on the same data, drop the first
    column of each output, concatenate the rest along axis 1 and fit a final
    logistic regression on it. Their predictions must be identical.
    """
    seed = 123
    forest_kwargs = dict(n_estimators=5, random_state=seed)

    # Reduce to binary problem to avoid ConvergenceWarning
    binary_rows = iris.target != 2
    samples = iris.data[binary_rows]
    labels = iris.target[binary_rows]

    # --- baikal way ---
    x = Input()
    y_t = Input()
    proba_out = LogisticRegression(random_state=seed)(
        x, y_t, compute_func="predict_proba"
    )
    leaf_out = RandomForestClassifier(**forest_kwargs)(
        x, y_t, compute_func="apply"
    )
    # Drop the first column of each base output.
    proba_out = Lambda(compute_func=lambda array: array[:, 1:])(proba_out)
    leaf_out = Lambda(compute_func=lambda array: array[:, 1:])(leaf_out)
    stacked_in = Concatenate(axis=1)([proba_out, leaf_out])
    final_out = LogisticRegression(random_state=seed)(stacked_in, y_t)

    model = Model(x, final_out, y_t)
    model.fit(samples, labels)
    y_pred_baikal = model.predict(samples)

    # --- traditional way ---
    base_logreg = LogisticRegression(random_state=seed)
    base_logreg.fit(samples, labels)
    proba = base_logreg.predict_proba(samples)

    base_forest = RandomForestClassifier(**forest_kwargs)
    base_forest.fit(samples, labels)
    leaf_idx = base_forest.apply(samples)

    manual_features = np.concatenate([proba[:, 1:], leaf_idx[:, 1:]], axis=1)
    meta = LogisticRegression(random_state=seed)
    meta.fit(manual_features, labels)
    y_pred_traditional = meta.predict(manual_features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
示例#6
0
def test_get_params(teardown):
    """Check ``Model.get_params`` on a PCA -> concat -> logistic regression graph.

    The expected mapping has one entry per step name, plus
    ``<step>__<param>`` entries for the ``pca`` and ``logreg`` steps. The
    ``concat`` step only appears under its own name.
    """
    pca_step = PCA(name="pca")
    logreg_step = LogisticRegression(name="logreg")
    # Concatenate is a step without get_params/set_params.
    concat_step = Concatenate(name="concat")

    x = Input()
    reduced = pca_step(x)
    merged = concat_step([x, reduced])
    y = logreg_step(merged)
    model = Model(x, y)

    # The steps themselves, keyed by name.
    expected = {
        "pca": pca_step,
        "logreg": logreg_step,
        "concat": concat_step,
    }
    # Parameter values the test expects from an argument-less PCA step.
    expected.update(
        {
            "pca__n_components": None,
            "pca__whiten": False,
            "pca__tol": 0.0,
            "pca__svd_solver": "auto",
            "pca__copy": True,
            "pca__random_state": None,
            "pca__iterated_power": "auto",
        }
    )
    # Parameter values the test expects from an argument-less
    # LogisticRegression step.
    expected.update(
        {
            "logreg__C": 1.0,
            "logreg__class_weight": None,
            "logreg__dual": False,
            "logreg__fit_intercept": True,
            "logreg__intercept_scaling": 1,
            "logreg__max_iter": 100,
            "logreg__multi_class": "warn",
            "logreg__n_jobs": None,
            "logreg__penalty": "l2",
            "logreg__random_state": None,
            "logreg__solver": "warn",
            "logreg__tol": 0.0001,
            "logreg__verbose": 0,
            "logreg__warm_start": False,
            "logreg__l1_ratio": None,
        }
    )

    params = model.get_params()
    assert expected == params
示例#7
0
def test_fit_predict_ensemble_with_proba_features(teardown):
    """Compare a baikal-built ensemble with the same ensemble in plain sklearn.

    Both variants feed the ``predict_proba`` output of a logistic regression
    and the ``apply`` output of a random forest, concatenated along axis 1,
    into a final logistic regression. Their predictions must be identical.
    """
    seed = 123
    forest_kwargs = dict(n_estimators=5, random_state=seed)

    # Reduce to binary problem to avoid ConvergenceWarning
    binary_rows = iris.target != 2
    samples = iris.data[binary_rows]
    labels = iris.target[binary_rows]

    # --- baikal way ---
    x = Input()
    y_t = Input()
    proba_out = LogisticRegression(random_state=seed, function="predict_proba")(
        x, y_t
    )
    leaf_out = RandomForestClassifier(function="apply", **forest_kwargs)(x, y_t)
    stacked_in = Concatenate(axis=1)([proba_out, leaf_out])
    final_out = LogisticRegression(random_state=seed)(stacked_in, y_t)

    model = Model(x, final_out, y_t)
    model.fit(samples, labels)
    y_pred_baikal = model.predict(samples)

    # --- traditional way ---
    base_logreg = sklearn.linear_model.LogisticRegression(random_state=seed)
    base_logreg.fit(samples, labels)
    proba = base_logreg.predict_proba(samples)

    base_forest = sklearn.ensemble.RandomForestClassifier(**forest_kwargs)
    base_forest.fit(samples, labels)
    leaf_idx = base_forest.apply(samples)

    manual_features = np.concatenate([proba, leaf_idx], axis=1)
    meta = sklearn.linear_model.LogisticRegression(random_state=seed)
    meta.fit(manual_features, labels)
    y_pred_traditional = meta.predict(manual_features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
示例#8
0
def test_set_params(teardown):
    """Exercise ``Model.set_params`` on a graph that contains a shared step.

    Covers three failure modes (a step without ``set_params``, an unknown
    step name, an unknown parameter of a known step), then a successful
    update that both replaces the ``dummy2`` step and changes nested
    parameters. Finally checks that the replacement step took over the name
    and the per-port connectivity of the step it replaced.
    """
    dummy1 = DummyEstimator(name="dummy1")
    dummy2 = DummyEstimator(x=456, y="def", name="dummy2")
    concat = Concatenate(name="concat")  # a step without get_params/set_params

    # a meaningless pipeline that contains shared steps
    # (dummy2 is called twice, so it has two ports: 0 and 1)
    x1 = Input()
    x2 = Input()
    h = dummy1(x1)
    c = concat([x1, h])
    y1 = dummy2(c)
    y2 = dummy2(x2, compute_func=lambda X: X * 2, trainable=False)
    model = Model([x1, x2], [y1, y2])

    # Fails when setting params on step that does not implement set_params
    new_params_wrong = {"concat__axis": 2}
    with pytest.raises(AttributeError):
        model.set_params(**new_params_wrong)

    # Fails when setting params on step that does not exist
    new_params_wrong = {"non_existent_step__param": 42}
    with pytest.raises(ValueError):
        model.set_params(**new_params_wrong)

    # Fails when setting a non-existent param in a step
    new_params_wrong = {"dummy1__non_existent_param": 42}
    with pytest.raises(ValueError):
        model.set_params(**new_params_wrong)

    # Replace the dummy2 step and update nested params in a single call.
    new_dummy = DummyEstimator()
    new_params = {
        "dummy2": new_dummy,
        "dummy1__x": 100,
        "dummy1__y": "pqr",
        "dummy2__x": 789,
        "dummy2__y": "ijk",
    }

    model.set_params(**new_params)
    params = model.get_params()

    # The "dummy2__*" values are expected to be reported for the replacement step.
    expected = {
        "dummy1": dummy1,
        "dummy2": new_dummy,
        "concat": concat,
        "dummy1__x": 100,
        "dummy1__y": "pqr",
        "dummy2__x": 789,
        "dummy2__y": "ijk",
    }

    assert params == expected

    # Connectivity of the new step should be the same as the old step
    # Names are strings: compare by value, not identity, so the check does not
    # depend on both attributes referencing the very same str object.
    assert new_dummy.name == dummy2.name
    # Identity is intentional below: the replacement step is expected to hold
    # the very same per-port objects as the step it replaced.
    for port in range(2):
        assert new_dummy.get_inputs_at(port) is dummy2.get_inputs_at(port)
        assert new_dummy.get_outputs_at(port) is dummy2.get_outputs_at(port)
        assert new_dummy.get_targets_at(port) is dummy2.get_targets_at(port)
        assert new_dummy.get_trainable_at(port) is dummy2.get_trainable_at(port)
        assert new_dummy.get_compute_func_at(port) is dummy2.get_compute_func_at(port)
示例#9
0
def test_set_params(teardown):
    """Exercise ``Model.set_params`` on a PCA -> concat -> classifier graph.

    First checks the three failure modes, then replaces the ``classifier``
    step with a ``LogisticRegression`` while updating nested parameters, and
    compares the resulting ``get_params`` mapping with the expected one.
    """
    pca_step = PCA(name="pca")
    original_classifier = RandomForestClassifier(name="classifier")
    # Concatenate is a step without get_params/set_params.
    concat_step = Concatenate(name="concat")

    x = Input()
    reduced = pca_step(x)
    merged = concat_step([x, reduced])
    y = original_classifier(merged)
    model = Model(x, y)

    failing_cases = (
        # Fails when setting params on step that does not implement set_params
        ({"concat__axis": 2}, AttributeError),
        # Fails when setting params on step that does not exist
        ({"non_existent_step__param": 42}, ValueError),
        # Fails when setting a non-existent param in a step
        ({"pca__non_existent_param": 42}, ValueError),
    )
    for new_params_wrong, error_type in failing_cases:
        with pytest.raises(error_type):
            model.set_params(**new_params_wrong)

    # Swap the classifier step and set nested params in the same call.
    new_classifier = LogisticRegression()
    model.set_params(
        **{
            "classifier": new_classifier,
            "pca__n_components": 4,
            "pca__whiten": True,
            "classifier__C": 100.0,
            "classifier__fit_intercept": False,
            "classifier__penalty": "l1",
        }
    )
    params = model.get_params()

    # The steps themselves, keyed by name.
    expected = {
        "pca": pca_step,
        "classifier": new_classifier,
        "concat": concat_step,
    }
    # PCA params: the two updated values plus the untouched ones.
    expected.update(
        {
            "pca__n_components": 4,
            "pca__whiten": True,
            "pca__tol": 0.0,
            "pca__svd_solver": "auto",
            "pca__copy": True,
            "pca__random_state": None,
            "pca__iterated_power": "auto",
        }
    )
    # Params expected from the replacement LogisticRegression step.
    expected.update(
        {
            "classifier__C": 100.0,
            "classifier__class_weight": None,
            "classifier__dual": False,
            "classifier__fit_intercept": False,
            "classifier__intercept_scaling": 1,
            "classifier__max_iter": 100,
            "classifier__multi_class": "warn",
            "classifier__n_jobs": None,
            "classifier__penalty": "l1",
            "classifier__random_state": None,
            "classifier__solver": "warn",
            "classifier__tol": 0.0001,
            "classifier__verbose": 0,
            "classifier__warm_start": False,
            "classifier__l1_ratio": None,
        }
    )

    assert expected == params
示例#10
0
# Wrap the sklearn estimator with make_step so it can be called on model inputs below.
ExtraTreesClassifier = make_step(sklearn.ensemble.ExtraTreesClassifier)

# ------- Load dataset
# Breast-cancer dataset; 20% of the samples are held out, with a fixed seed.
data = sklearn.datasets.load_breast_cancer()
X = data.data
y_p = data.target  # NOTE: ``y_p`` is rebound to the model output further down
X_train, X_test, y_train, y_test = train_test_split(
    X, y_p, test_size=0.2, random_state=0
)

# ------- Build model
# Two base classifiers emit predict_proba outputs; these are concatenated
# and fed to an ExtraTreesClassifier as the final step.
x = Input()
y_t = Input()
y_p1 = LogisticRegression(function="predict_proba")(x, y_t)
y_p2 = RandomForestClassifier(function="predict_proba")(x, y_t)
ensemble_features = Concatenate()([y_p1, y_p2])
y_p = ExtraTreesClassifier()(ensemble_features, y_t)

model = Model(x, y_p, y_t)
plot_model(model, filename="stacked_classifiers.png", dpi=96)

# ------- Train model
model.fit(X_train, y_train)

# ------- Evaluate model
# Predict on both splits, then report the F1 score of each.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("F1 score on train data:", f1_score(y_train, y_train_pred))
print("F1 score on test data:", f1_score(y_test, y_test_pred))