def test_fit_predict_naive_stack(teardown):
    """Compare a naive stacked classifier built as a baikal graph against the
    same ensemble assembled by hand, on the iris data.

    Both variants train a logistic regression and a random forest on the raw
    features, stack their outputs along axis 1, and train a second logistic
    regression on the stacked array. The final predictions must be equal.
    """
    seed = 123
    inputs, targets = iris.data, iris.target

    # --- baikal graph ---
    x = Input()
    y_t = Input()
    base_outputs = [
        LogisticRegression(random_state=seed, solver="liblinear")(x, y_t),
        RandomForestClassifier(random_state=seed)(x, y_t),
    ]
    stacked_output = Stack(axis=1)(base_outputs)
    y = LogisticRegression(random_state=seed, solver="liblinear")(
        stacked_output, y_t
    )

    graph_model = Model(x, y, y_t)
    graph_model.fit(inputs, targets)
    baikal_labels = graph_model.predict(inputs)

    # --- hand-written equivalent ---
    # Every estimator is fitted and evaluated on the same (training) data,
    # mirroring what the graph does above.
    logreg_labels = (
        LogisticRegression(random_state=seed, solver="liblinear")
        .fit(inputs, targets)
        .predict(inputs)
    )
    forest_labels = (
        RandomForestClassifier(random_state=seed)
        .fit(inputs, targets)
        .predict(inputs)
    )
    meta_features = np.stack([logreg_labels, forest_labels], axis=1)

    meta_classifier = LogisticRegression(random_state=seed, solver="liblinear")
    meta_classifier.fit(meta_features, targets)
    manual_labels = meta_classifier.predict(meta_features)

    assert_array_equal(baikal_labels, manual_labels)
def test_nested_model_stack(teardown):
    """Compare the model returned by ``make_naive_stacked_model`` against a
    hand-assembled nested stack, on the iris data.

    The hand-assembled reference consists of:
      * submodel 1: PCA followed by a multinomial logistic regression;
      * submodel 2: a random forest and an extra-trees classifier whose
        predictions are stacked and fed to a logistic regression;
      * a final logistic regression trained on the stacked predictions of
        both submodels.
    """
    seed = 123
    num_components = 2
    inputs, targets = iris.data, iris.target

    def new_logreg():
        # All logistic regressions in the reference share one configuration.
        return LogisticRegression(
            multi_class="multinomial", solver="lbfgs", random_state=seed
        )

    # ----------- baikal way
    baikal_model = make_naive_stacked_model(num_components, seed, inputs, targets)
    baikal_labels = baikal_model.predict(inputs)

    # ----------- traditional way
    # Submodel 1: PCA projection -> logistic regression
    first_submodel = new_logreg()
    projector = PCA(n_components=num_components, random_state=seed)
    projector.fit(inputs)
    projected = projector.transform(inputs)
    first_submodel.fit(projected, targets)
    first_labels = first_submodel.predict(projected)

    # Submodel 2 (a nested stacked model): tree ensembles -> logistic regression
    ensemble_labels = []
    for ensemble_cls in (RandomForestClassifier, ExtraTreesClassifier):
        ensemble = ensemble_cls(random_state=seed)
        ensemble.fit(inputs, targets)
        ensemble_labels.append(ensemble.predict(inputs))
    nested_features = np.stack(ensemble_labels, axis=1)

    second_submodel = new_logreg()
    second_submodel.fit(nested_features, targets)
    second_labels = second_submodel.predict(nested_features)

    # Final stacked model on top of both submodels' predictions
    top_features = np.stack([first_labels, second_labels], axis=1)
    top_model = new_logreg()
    top_model.fit(top_features, targets)
    manual_labels = top_model.predict(top_features)

    assert_array_equal(baikal_labels, manual_labels)
def test_fit_predict_naive_stack_with_proba_features(teardown):
    """Compare a baikal stack that uses non-default compute functions against
    the same ensemble assembled by hand.

    The meta-classifier is trained on the concatenation of the logistic
    regression's ``predict_proba`` output and the random forest's ``apply``
    output, each with its first column dropped.
    """
    # Reduce to binary problem to avoid ConvergenceWarning
    binary_mask = iris.target != 2
    inputs = iris.data[binary_mask]
    targets = iris.target[binary_mask]
    seed = 123
    num_trees = 5

    def drop_first_column(array):
        return array[:, 1:]

    # --- baikal graph ---
    x = Input()
    y_t = Input()
    proba_out = LogisticRegression(random_state=seed)(
        x, y_t, compute_func="predict_proba"
    )
    leaves_out = RandomForestClassifier(n_estimators=num_trees, random_state=seed)(
        x, y_t, compute_func="apply"
    )
    proba_out = Lambda(compute_func=drop_first_column)(proba_out)
    leaves_out = Lambda(compute_func=drop_first_column)(leaves_out)
    merged = Concatenate(axis=1)([proba_out, leaves_out])
    y_p = LogisticRegression(random_state=seed)(merged, y_t)

    graph_model = Model(x, y_p, y_t)
    graph_model.fit(inputs, targets)
    baikal_labels = graph_model.predict(inputs)

    # --- hand-written equivalent ---
    base_logreg = LogisticRegression(random_state=seed)
    base_logreg.fit(inputs, targets)
    proba_columns = base_logreg.predict_proba(inputs)[:, 1:]

    base_forest = RandomForestClassifier(n_estimators=num_trees, random_state=seed)
    base_forest.fit(inputs, targets)
    leaf_columns = base_forest.apply(inputs)[:, 1:]

    meta_features = np.concatenate([proba_columns, leaf_columns], axis=1)

    meta_classifier = LogisticRegression(random_state=seed)
    meta_classifier.fit(meta_features, targets)
    manual_labels = meta_classifier.predict(meta_features)

    assert_array_equal(baikal_labels, manual_labels)