def test_stacked_ensemble_init_with_invalid_estimators_parameter():
    """Constructing the regressor without usable input pipelines must fail.

    Covers both the omitted argument and an explicit empty list; each is
    expected to raise EnsembleMissingPipelinesError.
    """
    expected_message = 'must not be None or an empty list.'
    # First case omits input_pipelines entirely, second passes an empty list.
    for init_kwargs in ({}, {"input_pipelines": []}):
        with pytest.raises(EnsembleMissingPipelinesError, match=expected_message):
            StackedEnsembleRegressor(**init_kwargs)
def test_stacked_feature_importance(mock_fit, X_y_regression, stackable_regressors):
    """Reading ``feature_importance`` on the stacked regressor raises NotImplementedError."""
    X, y = X_y_regression

    base_pipelines = []
    for regressor in stackable_regressors:
        base_pipelines.append(RegressionPipeline([regressor]))

    ensemble = StackedEnsembleRegressor(input_pipelines=base_pipelines, n_jobs=1)
    ensemble.fit(X, y)
    mock_fit.assert_called()

    # Force the fitted flag before touching the property.
    # NOTE(review): mock_fit is presumably injected by a patch decorator that
    # is outside this view, so the real fit may never set the flag -- confirm.
    ensemble._is_fitted = True
    with pytest.raises(NotImplementedError, match="feature_importance is not implemented"):
        ensemble.feature_importance
def _make_stacked_ensemble_pipeline(input_pipelines, problem_type, n_jobs=-1, random_seed=0):
    """Build a pipeline whose only component is a stacked ensemble estimator.

    Arguments:
        input_pipelines (list(PipelineBase or subclass obj)): Pipeline instances used as the
            base estimators of the stacked ensemble. Must not be None or an empty list,
            or else EnsembleMissingPipelinesError will be raised.
        problem_type (ProblemType): Problem type of the pipeline. BINARY and MULTICLASS select
            StackedEnsembleClassifier; any other value selects StackedEnsembleRegressor.
        n_jobs (int or None): Integer describing level of parallelism used for pipelines.
            None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1,
            (n_cpus + 1 + n_jobs) are used. Defaults to -1.
        random_seed (int): Seed forwarded to make_pipeline_from_components. Defaults to 0.

    Returns:
        Pipeline with appropriate stacked ensemble estimator.
    """
    is_classification = problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]
    if is_classification:
        ensemble_class = StackedEnsembleClassifier
        pipeline_name = "Stacked Ensemble Classification Pipeline"
    else:
        ensemble_class = StackedEnsembleRegressor
        pipeline_name = "Stacked Ensemble Regression Pipeline"

    ensemble = ensemble_class(input_pipelines, n_jobs=n_jobs)
    return make_pipeline_from_components(
        [ensemble],
        problem_type,
        custom_name=pipeline_name,
        random_seed=random_seed,
    )
def test_stacked_ensemble_n_jobs_negative_one(X_y_regression, linear_regression_pipeline_class):
    """Without an explicit n_jobs the ensemble reports -1 and still fits and predicts."""
    X, y = X_y_regression
    input_pipelines = [linear_regression_pipeline_class(parameters={})]
    ensemble = StackedEnsembleRegressor(input_pipelines=input_pipelines)

    # n_jobs was not passed above, so the reported value is the constructor default.
    assert ensemble.parameters == {
        "input_pipelines": input_pipelines,
        "final_estimator": None,
        "cv": None,
        "n_jobs": -1,
    }

    ensemble.fit(X, y)
    predictions = ensemble.predict(X)
    assert len(predictions) == len(y)

    # Only guards against an all-NaN prediction vector.
    every_value_is_nan = np.isnan(predictions.to_series()).all()
    assert not every_value_is_nan
def test_stacked_different_input_pipelines_regression():
    """Mixing a regression and a binary classification pipeline raises ValueError."""
    regression_pipeline = RegressionPipeline([RandomForestRegressor])
    classification_pipeline = BinaryClassificationPipeline([RandomForestClassifier])
    mixed_pipelines = [regression_pipeline, classification_pipeline]

    with pytest.raises(ValueError, match="All pipelines must have the same problem type."):
        StackedEnsembleRegressor(input_pipelines=mixed_pipelines)
def test_stacked_ensemble_nonstackable_model_families():
    """A pipeline built on BaselineRegressor is rejected as a base pipeline."""
    expected_message = (
        "Pipelines with any of the following model families cannot be used as base pipelines"
    )
    with pytest.raises(ValueError, match=expected_message):
        # The pipeline is built inside the raises block, as the check covers
        # the whole construction expression.
        baseline_pipeline = RegressionPipeline([BaselineRegressor])
        StackedEnsembleRegressor(input_pipelines=[baseline_pipeline])
def test_stacked_fit_predict_regression(X_y_regression, stackable_regressors):
    """Fit and predict with the default final estimator, then with an explicit one."""
    X, y = X_y_regression
    input_pipelines = []
    for regressor in stackable_regressors:
        input_pipelines.append(RegressionPipeline([regressor]))

    def fit_and_check(ensemble):
        # Shared checks: one prediction per row, woodwork DataColumn output,
        # and not every prediction is NaN.
        ensemble.fit(X, y)
        predictions = ensemble.predict(X)
        assert len(predictions) == len(y)
        assert isinstance(predictions, ww.DataColumn)
        assert not np.isnan(predictions.to_series()).all()

    default_final = StackedEnsembleRegressor(input_pipelines=input_pipelines, n_jobs=1)
    fit_and_check(default_final)

    # The second ensemble is only constructed after the first has been exercised.
    custom_final = StackedEnsembleRegressor(
        input_pipelines=input_pipelines,
        final_estimator=RandomForestRegressor(),
        n_jobs=1,
    )
    fit_and_check(custom_final)
def test_stacked_ensemble_does_not_overwrite_pipeline_random_seed(
        mock_stack, linear_regression_pipeline_class):
    """The ensemble's own seed must not replace the seeds of its input pipelines."""
    # NOTE(review): mock_stack is presumably a patched stacking estimator class
    # injected by a decorator outside this view -- confirm the patch target.
    pipeline_seeds = (3, 4)
    input_pipelines = [
        linear_regression_pipeline_class(parameters={}, random_seed=seed)
        for seed in pipeline_seeds
    ]
    ensemble = StackedEnsembleRegressor(input_pipelines=input_pipelines,
                                        random_seed=5,
                                        n_jobs=1)

    # call_args[1] holds the keyword arguments of the most recent call to the mock.
    call_kwargs = mock_stack.call_args[1]
    wrapped_estimators = call_kwargs['estimators']

    assert ensemble.random_seed == 5
    # Each entry is indexed at [1] to reach the object exposing `.pipeline`.
    for position, expected_seed in enumerate(pipeline_seeds):
        assert wrapped_estimators[position][1].pipeline.random_seed == expected_seed
def test_stacked_ensemble_init_with_multiple_same_estimators(
        X_y_regression, linear_regression_pipeline_class):
    # Verifies that several input pipelines of the same class are accepted
    X, y = X_y_regression
    input_pipelines = [linear_regression_pipeline_class(parameters={}) for _ in range(2)]
    ensemble = StackedEnsembleRegressor(input_pipelines=input_pipelines, n_jobs=1)

    assert ensemble.parameters == {
        "input_pipelines": input_pipelines,
        "final_estimator": None,
        "cv": None,
        "n_jobs": 1,
    }

    # fit is expected to hand back a StackedEnsembleRegressor
    fit_result = ensemble.fit(X, y)
    assert isinstance(fit_result, StackedEnsembleRegressor)

    predictions = ensemble.predict(X)
    assert len(predictions) == len(y)
    every_value_is_nan = np.isnan(predictions.to_series()).all()
    assert not every_value_is_nan
def _make_stacked_ensemble_pipeline(input_pipelines, problem_type, random_state=0):
    """Build a pipeline whose only component is a stacked ensemble estimator.

    Arguments:
        input_pipelines (list(PipelineBase or subclass obj)): Pipeline instances used as the
            base estimators of the stacked ensemble. Must not be None or an empty list,
            or else EnsembleMissingPipelinesError will be raised.
        problem_type (ProblemType): Problem type of the pipeline. BINARY and MULTICLASS select
            StackedEnsembleClassifier; any other value selects StackedEnsembleRegressor.
        random_state (int): Value forwarded to make_pipeline_from_components. Defaults to 0.

    Returns:
        Pipeline with appropriate stacked ensemble estimator.
    """
    is_classification = problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]
    if is_classification:
        ensemble_class = StackedEnsembleClassifier
        pipeline_name = "Stacked Ensemble Classification Pipeline"
    else:
        ensemble_class = StackedEnsembleRegressor
        pipeline_name = "Stacked Ensemble Regression Pipeline"

    ensemble = ensemble_class(input_pipelines)
    return make_pipeline_from_components(
        [ensemble],
        problem_type,
        custom_name=pipeline_name,
        random_state=random_state,
    )
def test_stacked_ensemble_multilevel(linear_regression_pipeline_class):
    # Checks that a stacked ensemble regressor can itself be passed as the final estimator
    features = pd.DataFrame(np.random.rand(50, 5))
    target = pd.Series(np.random.rand(50))

    inner_ensemble = StackedEnsembleRegressor(
        input_pipelines=[linear_regression_pipeline_class(parameters={})],
        n_jobs=1,
    )
    outer_ensemble = StackedEnsembleRegressor(
        input_pipelines=[linear_regression_pipeline_class(parameters={})],
        final_estimator=inner_ensemble,
        n_jobs=1,
    )

    outer_ensemble.fit(features, target)
    predictions = outer_ensemble.predict(features)
    assert len(predictions) == len(target)
    every_value_is_nan = np.isnan(predictions.to_series()).all()
    assert not every_value_is_nan
def dummy_stacked_ensemble_regressor_estimator(linear_regression_pipeline_class):
    """Return a StackedEnsembleRegressor over one linear regression pipeline, seeded with 0."""
    return StackedEnsembleRegressor(
        input_pipelines=[linear_regression_pipeline_class({})],
        random_seed=0,
    )