def test_get_params(teardown):
    dummy1 = DummyEstimator(name="dummy1")
    dummy2 = DummyEstimator(x=456, y="def", name="dummy2")
    concat = Concatenate(name="concat")  # a step without get_params/set_params

    # a meaningless pipeline that contains shared steps
    x1 = Input()
    x2 = Input()
    h = dummy1(x1)
    c = concat([x1, h])
    y1 = dummy2(c)
    y2 = dummy2(x2, compute_func=lambda X: X * 2, trainable=False)
    model = Model([x1, x2], [y1, y2])

    expected = {
        "dummy1": dummy1,
        "dummy2": dummy2,
        "concat": concat,
        "dummy1__x": 123,
        "dummy1__y": "abc",
        "dummy2__x": 456,
        "dummy2__y": "def",
    }

    params = model.get_params()
    assert params == expected
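# A minimal sketch (an assumption, not part of the original suite) of the
# DummyEstimator fixture the get_params/set_params tests rely on: an
# sklearn-style estimator with defaults x=123 and y="abc" (inferred from the
# expected get_params output above), wrapped as a baikal step via make_step.
import sklearn.base

from baikal import make_step


class _DummyEstimator(sklearn.base.BaseEstimator):
    def __init__(self, x=123, y="abc"):
        self.x = x
        self.y = y

    def fit(self, X, y=None):
        # a no-op fit; the tests only exercise the parameter interface
        return self

    def predict(self, X):
        # identity predict, overridable per call via compute_func
        return X


DummyEstimator = make_step(_DummyEstimator)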
def test_with_missing_inputs(self, teardown):
    x1 = Input()
    x2 = Input()
    c = Concatenate()([x1, x2])

    # c depends on both x1 and x2, so a model declaring only x1 as its input
    # cannot compute it and should fail at build time
    with pytest.raises(ValueError):
        Model(x1, c)
def test_concatenate(teardown):
    x1 = Input()
    x2 = Input()
    y = Concatenate(axis=1)([x1, x2])
    model = Model([x1, x2], y)

    x1_data = np.array([[1, 2], [10, 20]])
    x2_data = np.array([[3, 4, 5], [30, 40, 50]])
    y_expected = np.concatenate([x1_data, x2_data], axis=1)

    y_pred = model.predict([x1_data, x2_data])

    assert_array_equal(y_pred, y_expected)
def test_simple(self, teardown):
    x1 = Input()
    x2 = Input()
    y_t = Input()
    x1_transformed = PCA()(x1)
    y_t_encoded = LabelEncoder()(y_t)
    z = Concatenate()([x1_transformed, x2])
    y = LogisticRegression()(z, y_t_encoded)
    # TODO: support shareable steps to reuse LabelEncoder with compute_func="inverse_transform"

    # full model
    Model([x1, x2], y, y_t)

    # submodels
    Model(x1, x1_transformed)
    Model(z, y, y_t_encoded)
def test_fit_predict_naive_stack_with_proba_features(teardown):
    mask = iris.target != 2  # Reduce to binary problem to avoid ConvergenceWarning
    x_data = iris.data[mask]
    y_t_data = iris.target[mask]
    random_state = 123
    n_estimators = 5

    # baikal way
    x = Input()
    y_t = Input()
    y_p1 = LogisticRegression(random_state=random_state)(
        x, y_t, compute_func="predict_proba"
    )
    y_p2 = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)(
        x, y_t, compute_func="apply"
    )
    # slice the columns so both pipelines in this test build identical feature matrices
    y_p1 = Lambda(compute_func=lambda array: array[:, 1:])(y_p1)
    y_p2 = Lambda(compute_func=lambda array: array[:, 1:])(y_p2)
    features = Concatenate(axis=1)([y_p1, y_p2])
    y_p = LogisticRegression(random_state=random_state)(features, y_t)

    model = Model(x, y_p, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = LogisticRegression(random_state=random_state)
    logreg.fit(x_data, y_t_data)
    logreg_proba = logreg.predict_proba(x_data)

    random_forest = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    )
    random_forest.fit(x_data, y_t_data)
    random_forest_leafidx = random_forest.apply(x_data)

    features = np.concatenate(
        [logreg_proba[:, 1:], random_forest_leafidx[:, 1:]], axis=1
    )
    stacked = LogisticRegression(random_state=random_state)
    stacked.fit(features, y_t_data)
    y_pred_traditional = stacked.predict(features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
def test_get_params(teardown):
    pca = PCA(name="pca")
    logreg = LogisticRegression(name="logreg")
    concat = Concatenate(name="concat")  # a step without get_params/set_params

    x = Input()
    h = pca(x)
    c = concat([x, h])
    y = logreg(c)
    model = Model(x, y)

    # NOTE: the expected defaults below (e.g. solver="warn", multi_class="warn",
    # l1_ratio=None) match scikit-learn 0.21; newer versions ship different
    # defaults and would require updating this dict.
    expected = {
        "pca": pca,
        "logreg": logreg,
        "concat": concat,
        "pca__n_components": None,
        "pca__whiten": False,
        "pca__tol": 0.0,
        "pca__svd_solver": "auto",
        "pca__copy": True,
        "pca__random_state": None,
        "pca__iterated_power": "auto",
        "logreg__C": 1.0,
        "logreg__class_weight": None,
        "logreg__dual": False,
        "logreg__fit_intercept": True,
        "logreg__intercept_scaling": 1,
        "logreg__max_iter": 100,
        "logreg__multi_class": "warn",
        "logreg__n_jobs": None,
        "logreg__penalty": "l2",
        "logreg__random_state": None,
        "logreg__solver": "warn",
        "logreg__tol": 0.0001,
        "logreg__verbose": 0,
        "logreg__warm_start": False,
        "logreg__l1_ratio": None,
    }

    params = model.get_params()
    assert expected == params
def test_fit_predict_ensemble_with_proba_features(teardown):
    mask = iris.target != 2  # Reduce to binary problem to avoid ConvergenceWarning
    x_data = iris.data[mask]
    y_t_data = iris.target[mask]
    random_state = 123
    n_estimators = 5

    # baikal way
    x = Input()
    y_t = Input()
    y1 = LogisticRegression(random_state=random_state, function="predict_proba")(x, y_t)
    y2 = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state, function="apply"
    )(x, y_t)
    features = Concatenate(axis=1)([y1, y2])
    y = LogisticRegression(random_state=random_state)(features, y_t)

    model = Model(x, y, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = sklearn.linear_model.LogisticRegression(random_state=random_state)
    logreg.fit(x_data, y_t_data)
    logreg_proba = logreg.predict_proba(x_data)

    random_forest = sklearn.ensemble.RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    )
    random_forest.fit(x_data, y_t_data)
    random_forest_leafidx = random_forest.apply(x_data)

    features = np.concatenate([logreg_proba, random_forest_leafidx], axis=1)
    ensemble = sklearn.linear_model.LogisticRegression(random_state=random_state)
    ensemble.fit(features, y_t_data)
    y_pred_traditional = ensemble.predict(features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
def test_set_params(teardown):
    dummy1 = DummyEstimator(name="dummy1")
    dummy2 = DummyEstimator(x=456, y="def", name="dummy2")
    concat = Concatenate(name="concat")  # a step without get_params/set_params

    # a meaningless pipeline that contains shared steps
    x1 = Input()
    x2 = Input()
    h = dummy1(x1)
    c = concat([x1, h])
    y1 = dummy2(c)
    y2 = dummy2(x2, compute_func=lambda X: X * 2, trainable=False)
    model = Model([x1, x2], [y1, y2])

    # Fails when setting params on a step that does not implement set_params
    new_params_wrong = {"concat__axis": 2}
    with pytest.raises(AttributeError):
        model.set_params(**new_params_wrong)

    # Fails when setting params on a step that does not exist
    new_params_wrong = {"non_existent_step__param": 42}
    with pytest.raises(ValueError):
        model.set_params(**new_params_wrong)

    # Fails when setting a non-existent param in a step
    new_params_wrong = {"dummy1__non_existent_param": 42}
    with pytest.raises(ValueError):
        model.set_params(**new_params_wrong)

    new_dummy = DummyEstimator()
    new_params = {
        "dummy2": new_dummy,
        "dummy1__x": 100,
        "dummy1__y": "pqr",
        "dummy2__x": 789,
        "dummy2__y": "ijk",
    }
    model.set_params(**new_params)

    params = model.get_params()
    expected = {
        "dummy1": dummy1,
        "dummy2": new_dummy,
        "concat": concat,
        "dummy1__x": 100,
        "dummy1__y": "pqr",
        "dummy2__x": 789,
        "dummy2__y": "ijk",
    }
    assert params == expected

    # Connectivity of the new step should be the same as the old step
    assert new_dummy.name is dummy2.name
    for port in range(2):
        assert new_dummy.get_inputs_at(port) is dummy2.get_inputs_at(port)
        assert new_dummy.get_outputs_at(port) is dummy2.get_outputs_at(port)
        assert new_dummy.get_targets_at(port) is dummy2.get_targets_at(port)
        assert new_dummy.get_trainable_at(port) is dummy2.get_trainable_at(port)
        assert new_dummy.get_compute_func_at(port) is dummy2.get_compute_func_at(port)
def test_set_params(teardown):
    pca = PCA(name="pca")
    classifier = RandomForestClassifier(name="classifier")
    concat = Concatenate(name="concat")  # a step without get_params/set_params

    x = Input()
    h = pca(x)
    c = concat([x, h])
    y = classifier(c)
    model = Model(x, y)

    # Fails when setting params on a step that does not implement set_params
    new_params_wrong = {"concat__axis": 2}
    with pytest.raises(AttributeError):
        model.set_params(**new_params_wrong)

    # Fails when setting params on a step that does not exist
    new_params_wrong = {"non_existent_step__param": 42}
    with pytest.raises(ValueError):
        model.set_params(**new_params_wrong)

    # Fails when setting a non-existent param in a step
    new_params_wrong = {"pca__non_existent_param": 42}
    with pytest.raises(ValueError):
        model.set_params(**new_params_wrong)

    new_classifier = LogisticRegression()
    new_params = {
        "classifier": new_classifier,
        "pca__n_components": 4,
        "pca__whiten": True,
        "classifier__C": 100.0,
        "classifier__fit_intercept": False,
        "classifier__penalty": "l1",
    }
    model.set_params(**new_params)

    params = model.get_params()
    expected = {
        "pca": pca,
        "classifier": new_classifier,
        "concat": concat,
        "pca__n_components": 4,
        "pca__whiten": True,
        "pca__tol": 0.0,
        "pca__svd_solver": "auto",
        "pca__copy": True,
        "pca__random_state": None,
        "pca__iterated_power": "auto",
        "classifier__C": 100.0,
        "classifier__class_weight": None,
        "classifier__dual": False,
        "classifier__fit_intercept": False,
        "classifier__intercept_scaling": 1,
        "classifier__max_iter": 100,
        "classifier__multi_class": "warn",
        "classifier__n_jobs": None,
        "classifier__penalty": "l1",
        "classifier__random_state": None,
        "classifier__solver": "warn",
        "classifier__tol": 0.0001,
        "classifier__verbose": 0,
        "classifier__warm_start": False,
        "classifier__l1_ratio": None,
    }
    assert expected == params
import sklearn.datasets
import sklearn.ensemble
import sklearn.linear_model
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from baikal import Input, Model, make_step
from baikal.plot import plot_model
from baikal.steps import Concatenate

# ------- Define steps (scikit-learn classes wrapped as baikal steps)
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)
ExtraTreesClassifier = make_step(sklearn.ensemble.ExtraTreesClassifier)

# ------- Load dataset
data = sklearn.datasets.load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# ------- Build model
x = Input()
y_t = Input()
y_p1 = LogisticRegression(function="predict_proba")(x, y_t)
y_p2 = RandomForestClassifier(function="predict_proba")(x, y_t)
ensemble_features = Concatenate()([y_p1, y_p2])
y_p = ExtraTreesClassifier()(ensemble_features, y_t)

model = Model(x, y_p, y_t)
plot_model(model, filename="stacked_classifiers.png", dpi=96)

# ------- Train model
model.fit(X_train, y_train)

# ------- Evaluate model
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("F1 score on train data:", f1_score(y_train, y_train_pred))
print("F1 score on test data:", f1_score(y_test, y_test_pred))
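# NOTE (assumption, not stated in the original script): baikal's plot_model
# renders the graph via pydot, so producing stacked_classifiers.png requires
# pydot plus a system Graphviz installation; the rest of the script runs
# without them if the plot_model call is removed.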