def test_additional_objectives(X_y_binary):
    X, y = X_y_binary

    objective = FraudCost(retry_percentage=.5,
                          interchange_fee=.02,
                          fraud_payout_percentage=.75,
                          amount_col=10)
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective='F1', max_iterations=2,
                          additional_objectives=[objective], n_jobs=1)
    automl.search()

    results = automl.describe_pipeline(0, return_dict=True)
    assert 'Fraud Cost' in list(
        results["cv_data"][0]["all_objective_scores"].keys())
def test_random_seed(X_y_regression):
    X, y = X_y_regression

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                          objective="R2", max_iterations=5, random_seed=0,
                          n_jobs=1)
    automl.search()

    automl_1 = AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                            objective="R2", max_iterations=5, random_seed=0,
                            n_jobs=1)
    automl_1.search()

    # need to use assert_frame_equal as R2 could be different at the 10+ decimal
    assert pd.testing.assert_frame_equal(automl.rankings, automl_1.rankings) is None
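
# For reference, a minimal sketch of why assert_frame_equal (rather than a strict
# equality check) is used above: pandas compares floats with a relative tolerance
# by default, so scores differing only past the ~10th decimal still compare equal.
# The frames below are hypothetical stand-ins for the rankings DataFrames.
def _sketch_tolerant_frame_comparison():
    a = pd.DataFrame({"mean_cv_score": [0.123456789012]})
    b = pd.DataFrame({"mean_cv_score": [0.123456789013]})  # differs past the 11th decimal
    pd.testing.assert_frame_equal(a, b)  # passes under the default relative tolerance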
def test_recall_error(X_y_binary):
    X, y = X_y_binary
    # Recall is a valid objective but it's not allowed in AutoML so a ValueError is expected
    error_msg = 'recall is not allowed in AutoML!'
    with pytest.raises(ValueError, match=error_msg):
        AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                     objective='recall', max_iterations=1)
def test_automl_time_series_classification_threshold(
        mock_binary_fit, mock_binary_score, mock_predict_proba,
        mock_encode_targets, mock_optimize_threshold, mock_split_data,
        optimize, objective, X_y_binary):
    X, y = X_y_binary
    mock_binary_score.return_value = {objective: 0.4}
    problem_type = 'time series binary'

    configuration = {
        "gap": 0,
        "max_delay": 0,
        'delay_target': False,
        'delay_features': True
    }

    mock_optimize_threshold.return_value = 0.62
    mock_split_data.return_value = split_data(X, y, problem_type,
                                              test_size=0.2, random_seed=0)

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type,
                          problem_configuration=configuration,
                          objective=objective, optimize_thresholds=optimize,
                          max_batches=2)
    automl.search()
    assert isinstance(automl.data_splitter, TimeSeriesSplit)
    if objective == 'Log Loss Binary':
        mock_optimize_threshold.assert_not_called()
        assert automl.best_pipeline.threshold is None
        mock_split_data.assert_not_called()
    elif optimize and objective == 'F1':
        mock_optimize_threshold.assert_called()
        assert automl.best_pipeline.threshold == 0.62
        mock_split_data.assert_called()
        assert str(mock_split_data.call_args[0][2]) == problem_type
    elif not optimize and objective == 'F1':
        mock_optimize_threshold.assert_not_called()
        assert automl.best_pipeline.threshold == 0.5
        mock_split_data.assert_not_called()
def test_non_optimizable_threshold(mock_fit, mock_score, X_y_binary):
    mock_score.return_value = {"AUC": 1.0}
    X, y = X_y_binary
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective='AUC', optimize_thresholds=False,
                          max_iterations=1)
    automl.search()
    mock_fit.assert_called()
    mock_score.assert_called()
    assert automl.best_pipeline.threshold is None
    assert automl.results['pipeline_results'][0]['cv_data'][0].get(
        'binary_classification_threshold') is None
    assert automl.results['pipeline_results'][0]['cv_data'][1].get(
        'binary_classification_threshold') is None
    assert automl.results['pipeline_results'][0]['cv_data'][2].get(
        'binary_classification_threshold') is None
def test_automl_allowed_pipelines_specified_allowed_pipelines(
        mock_fit, mock_score, dummy_regression_pipeline_class, X_y_regression):
    X, y = X_y_regression
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                          allowed_pipelines=[dummy_regression_pipeline_class],
                          allowed_model_families=None)
    mock_score.return_value = {automl.objective.name: 1.0}
    expected_pipelines = [dummy_regression_pipeline_class]
    assert automl.allowed_pipelines == expected_pipelines
    assert automl.allowed_model_families == [ModelFamily.NONE]

    automl.search()
    mock_fit.assert_called()
    mock_score.assert_called()
    assert automl.allowed_pipelines == expected_pipelines
    assert automl.allowed_model_families == [ModelFamily.NONE]
def test_automl_allowed_pipelines_init_allowed_both_specified(
        mock_fit, mock_score, dummy_regression_pipeline_class, X_y_regression,
        assert_allowed_pipelines_equal_helper):
    X, y = X_y_regression
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                          allowed_pipelines=[dummy_regression_pipeline_class({})],
                          allowed_model_families=[ModelFamily.RANDOM_FOREST])
    mock_score.return_value = {automl.objective.name: 1.0}
    expected_pipelines = [dummy_regression_pipeline_class({})]
    assert_allowed_pipelines_equal_helper(automl.allowed_pipelines,
                                          expected_pipelines)
    assert set(automl.allowed_model_families) == set(
        [p.model_family for p in expected_pipelines])

    automl.search()
    mock_fit.assert_called()
    mock_score.assert_called()
def test_fraud_objective(X_y_binary):
    X, y = X_y_binary

    objective = FraudCost(retry_percentage=.5,
                          interchange_fee=.02,
                          fraud_payout_percentage=.75,
                          amount_col=10)
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective=objective, max_iterations=1)
    automl.search()

    pipeline = automl.best_pipeline
    pipeline.fit(X, y)
    pipeline.predict(X, objective)
    pipeline.predict_proba(X)
    pipeline.score(X, y, [objective])
def test_automl_regression_nonlinear_pipeline_search(
        nonlinear_regression_pipeline_class, X_y_regression):
    X, y = X_y_regression

    allowed_pipelines = [nonlinear_regression_pipeline_class]
    start_iteration_callback = MagicMock()
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                          max_iterations=2,
                          start_iteration_callback=start_iteration_callback,
                          allowed_pipelines=allowed_pipelines, n_jobs=1)
    automl.search()

    assert start_iteration_callback.call_count == 2
    assert start_iteration_callback.call_args_list[0][0][0] == MeanBaselineRegressionPipeline
    assert start_iteration_callback.call_args_list[1][0][0] == nonlinear_regression_pipeline_class
def test_automl_binary_nonlinear_pipeline_search(
        nonlinear_binary_pipeline_class, X_y_binary):
    X, y = X_y_binary

    allowed_pipelines = [nonlinear_binary_pipeline_class]
    start_iteration_callback = MagicMock()
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          max_iterations=2,
                          start_iteration_callback=start_iteration_callback,
                          allowed_pipelines=allowed_pipelines, n_jobs=1)
    automl.search()

    assert start_iteration_callback.call_count == 2
    assert start_iteration_callback.call_args_list[0][0][0] == ModeBaselineBinaryPipeline
    assert start_iteration_callback.call_args_list[1][0][0] == nonlinear_binary_pipeline_class
def test_automl_allowed_pipelines_no_allowed_pipelines(automl_type, X_y_binary,
                                                       X_y_multi):
    is_multiclass = automl_type == ProblemTypes.MULTICLASS
    X, y = X_y_multi if is_multiclass else X_y_binary
    problem_type = 'multiclass' if is_multiclass else 'binary'
    with pytest.raises(ValueError, match="No allowed pipelines to search"):
        AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type,
                     allowed_pipelines=None, allowed_model_families=[])
def test_plot_iterations_ipython_mock(mock_ipython_display, X_y_binary):
    pytest.importorskip(
        'IPython.display',
        reason='Skipping plotting test because ipywidgets not installed')
    pytest.importorskip(
        'plotly.graph_objects',
        reason='Skipping plotting test because plotly not installed')
    X, y = X_y_binary

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective="f1", max_iterations=3, n_jobs=1)
    automl.search()
    plot = automl.plot.search_iteration_plot(interactive_plot=True)
    assert isinstance(plot, SearchIterationPlot)
    assert isinstance(plot.data, AutoMLSearch)
    mock_ipython_display.assert_called_with(plot.best_score_by_iter_fig)
def test_cbm_objective_automl(optimize_thresholds, X_y_binary):
    X, y = X_y_binary
    cbm = CostBenefitMatrix(true_positive=10, true_negative=-1,
                            false_positive=-7, false_negative=-2)

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective=cbm, max_iterations=2,
                          optimize_thresholds=optimize_thresholds)
    automl.search()

    pipeline = automl.best_pipeline
    pipeline.fit(X, y)
    predictions = pipeline.predict(X, cbm)
    assert not np.isnan(predictions.to_series()).values.any()
    assert not np.isnan(pipeline.predict_proba(X).to_dataframe()).values.any()
    assert not np.isnan(pipeline.score(X, y, [cbm])['Cost Benefit Matrix'])
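
# A minimal, self-contained sketch of the payoff arithmetic behind a cost-benefit
# matrix objective (a hypothetical helper, not evalml's CostBenefitMatrix; it just
# mirrors the payoffs used in the test above and averages them per prediction):
def _sketch_cost_benefit_score(y_true, y_pred, tp=10, tn=-1, fp=-7, fn=-2):
    total = 0.0
    for actual, predicted in zip(y_true, y_pred):
        if actual and predicted:
            total += tp      # correctly flagged positive
        elif not actual and not predicted:
            total += tn      # correctly passed negative
        elif not actual and predicted:
            total += fp      # false alarm
        else:
            total += fn      # missed positive
    return total / len(y_true)


# one true positive, one true negative, one false negative, one false positive
assert _sketch_cost_benefit_score([1, 0, 1, 0], [1, 0, 0, 1]) == (10 - 1 - 2 - 7) / 4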
def test_categorical_hyperparam(X_y_multi):
    X, y = X_y_multi

    class CustomPipeline(MulticlassClassificationPipeline):
        component_graph = [
            'Imputer', 'One Hot Encoder', 'Standard Scaler',
            'Logistic Regression Classifier'
        ]
        # the hyperparameter key must match the component name in the graph
        custom_hyperparameters = {
            'Imputer': {
                'impute_strategy': Categorical(['mean', 'most_frequent'])
            }
        }

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type="multiclass",
                          allowed_pipelines=[CustomPipeline], n_jobs=1)
    automl.search()
def test_plot_iterations_ipython_mock_import_failure(mock_ipython_display,
                                                     X_y_binary):
    pytest.importorskip(
        'IPython.display',
        reason='Skipping plotting test because ipywidgets not installed')
    go = pytest.importorskip(
        'plotly.graph_objects',
        reason='Skipping plotting test because plotly not installed')
    X, y = X_y_binary

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective="f1", max_iterations=3, n_jobs=1)
    automl.search()

    mock_ipython_display.side_effect = ImportError('KABOOOOOOMMMM')
    plot = automl.plot.search_iteration_plot(interactive_plot=True)
    mock_ipython_display.assert_called_once()

    assert isinstance(plot, go.Figure)
    assert isinstance(plot.data, tuple)
    plot_data = plot.data[0]
    x = pd.Series(plot_data['x'])
    y = pd.Series(plot_data['y'])
    assert x.is_monotonic_increasing
    assert y.is_monotonic_increasing
    assert len(x) == 3
    assert len(y) == 3
def test_automl_allowed_pipelines_search_binary(mock_fit, mock_score,
                                                dummy_binary_pipeline_class,
                                                X_y_binary):
    X, y = X_y_binary
    mock_score.return_value = {'Log Loss Binary': 1.0}

    allowed_pipelines = [dummy_binary_pipeline_class]
    start_iteration_callback = MagicMock()
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          max_iterations=2,
                          start_iteration_callback=start_iteration_callback,
                          allowed_pipelines=allowed_pipelines)
    automl.search()

    assert start_iteration_callback.call_count == 2
    assert start_iteration_callback.call_args_list[0][0][0] == ModeBaselineBinaryPipeline
    assert start_iteration_callback.call_args_list[1][0][0] == dummy_binary_pipeline_class
def test_log_metrics_only_passed_directly(X_y_regression):
    X, y = X_y_regression
    with pytest.raises(
            ObjectiveNotFoundError,
            match="RootMeanSquaredLogError is not a valid Objective!"):
        AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                     additional_objectives=[
                         'RootMeanSquaredLogError', 'MeanSquaredLogError'
                     ])

    ar = AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                      additional_objectives=[
                          RootMeanSquaredLogError(), MeanSquaredLogError()
                      ])
    assert ar.additional_objectives[0].name == 'Root Mean Squared Log Error'
    assert ar.additional_objectives[1].name == 'Mean Squared Log Error'
def test_automl_allowed_pipelines_search_regression(mock_fit, mock_score,
                                                    dummy_regression_pipeline_class,
                                                    X_y_regression):
    X, y = X_y_regression
    mock_score.return_value = {'R2': 1.0}

    allowed_pipelines = [dummy_regression_pipeline_class]
    start_iteration_callback = MagicMock()
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                          max_iterations=2,
                          start_iteration_callback=start_iteration_callback,
                          allowed_pipelines=allowed_pipelines)
    automl.search()

    assert start_iteration_callback.call_count == 2
    assert start_iteration_callback.call_args_list[0][0][0] == MeanBaselineRegressionPipeline
    assert start_iteration_callback.call_args_list[1][0][0] == dummy_regression_pipeline_class
def test_automl_multiclass_nonlinear_pipeline_search_more_iterations(
        nonlinear_multiclass_pipeline_class, X_y_multi):
    X, y = X_y_multi

    allowed_pipelines = [nonlinear_multiclass_pipeline_class]
    start_iteration_callback = MagicMock()
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass',
                          max_iterations=5,
                          start_iteration_callback=start_iteration_callback,
                          allowed_pipelines=allowed_pipelines, n_jobs=1)
    automl.search()

    assert start_iteration_callback.call_args_list[0][0][0] == ModeBaselineMulticlassPipeline
    assert start_iteration_callback.call_args_list[1][0][0] == nonlinear_multiclass_pipeline_class
    assert start_iteration_callback.call_args_list[4][0][0] == nonlinear_multiclass_pipeline_class
def test_multi_objective(X_y_multi):
    X, y = X_y_multi
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective="Log Loss Binary")
    assert automl.problem_type == ProblemTypes.BINARY

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass',
                          objective="Log Loss Multiclass")
    assert automl.problem_type == ProblemTypes.MULTICLASS

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass',
                          objective='AUC Micro')
    assert automl.problem_type == ProblemTypes.MULTICLASS

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective='AUC')
    assert automl.problem_type == ProblemTypes.BINARY

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass')
    assert automl.problem_type == ProblemTypes.MULTICLASS

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary')
    assert automl.problem_type == ProblemTypes.BINARY
def test_optimizable_threshold_enabled(mock_fit, mock_score,
                                       mock_predict_proba, mock_encode_targets,
                                       mock_optimize_threshold, X_y_binary,
                                       caplog):
    mock_optimize_threshold.return_value = 0.8
    X, y = X_y_binary
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective='precision', max_iterations=1,
                          optimize_thresholds=True)
    mock_score.return_value = {'precision': 1.0}
    automl.search()
    mock_fit.assert_called()
    mock_score.assert_called()
    mock_predict_proba.assert_called()
    mock_optimize_threshold.assert_called()
    assert automl.best_pipeline.threshold == 0.8
    assert automl.results['pipeline_results'][0]['cv_data'][0].get(
        'binary_classification_threshold') == 0.8
    assert automl.results['pipeline_results'][0]['cv_data'][1].get(
        'binary_classification_threshold') == 0.8
    assert automl.results['pipeline_results'][0]['cv_data'][2].get(
        'binary_classification_threshold') == 0.8

    automl.describe_pipeline(0)
    out = caplog.text
    assert "Objective to optimize binary classification pipeline thresholds for" in out
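
# A minimal sketch of what optimize_thresholds does conceptually (a hypothetical
# grid search, not evalml's actual optimizer, which the test above mocks out):
# scan candidate probability cutoffs and keep the one maximizing the objective.
def _sketch_best_f1_threshold(proba, y_true, candidates=(0.2, 0.4, 0.5, 0.6, 0.8)):
    def f1(threshold):
        pred = [int(p >= threshold) for p in proba]
        tp = sum(1 for p, t in zip(pred, y_true) if p == 1 and t == 1)
        fp = sum(1 for p, t in zip(pred, y_true) if p == 1 and t == 0)
        fn = sum(1 for p, t in zip(pred, y_true) if p == 0 and t == 1)
        return 2 * tp / (2 * tp + fp + fn) if tp else 0.0
    return max(candidates, key=f1)


# 0.4 separates the one negative (proba 0.3) from the three positives exactly
assert _sketch_best_f1_threshold([0.3, 0.45, 0.7, 0.9], [0, 1, 1, 1]) == 0.4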
def test_automl_time_series_classification_pickle_generated_pipeline(
        mock_binary_fit, mock_multi_fit, mock_binary_score,
        mock_multiclass_score, problem_type, X_y_binary, X_y_multi):
    if problem_type == ProblemTypes.TIME_SERIES_BINARY:
        X, y = X_y_binary
        pipeline = GeneratedPipelineTimeSeriesBinary
    else:
        X, y = X_y_multi
        pipeline = GeneratedPipelineTimeSeriesMulticlass

    configuration = {
        "gap": 0,
        "max_delay": 0,
        'delay_target': False,
        'delay_features': True
    }

    a = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type,
                     problem_configuration=configuration)
    a.search()
    for i, row in a.rankings.iterrows():
        assert a.get_pipeline(row['id']).__class__ == pipeline
        assert pickle.loads(pickle.dumps(a.get_pipeline(row['id'])))
def test_plot_iterations_max_iterations(X_y_regression):
    go = pytest.importorskip(
        'plotly.graph_objects',
        reason='Skipping plotting test because plotly not installed')
    X, y = X_y_regression

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                          max_iterations=3, n_jobs=1)
    automl.search()
    plot = automl.plot.search_iteration_plot()
    plot_data = plot.data[0]
    x = pd.Series(plot_data['x'])
    y = pd.Series(plot_data['y'])

    assert isinstance(plot, go.Figure)
    assert x.is_monotonic_increasing
    assert y.is_monotonic_increasing
    assert len(x) == 3
    assert len(y) == 3
def test_plot_disabled_missing_dependency(X_y_regression,
                                          has_minimal_dependencies):
    X, y = X_y_regression

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                          max_iterations=3)
    if has_minimal_dependencies:
        # attribute access alone should raise when plotting deps are missing
        with pytest.raises(AttributeError):
            automl.plot.search_iteration_plot
    else:
        automl.plot.search_iteration_plot
def test_callback(X_y_regression):
    X, y = X_y_regression

    counts = {
        "start_iteration_callback": 0,
        "add_result_callback": 0,
    }

    def start_iteration_callback(pipeline_class, parameters, automl_obj,
                                 counts=counts):
        counts["start_iteration_callback"] += 1

    def add_result_callback(results, trained_pipeline, automl_obj,
                            counts=counts):
        counts["add_result_callback"] += 1

    max_iterations = 3
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                          objective="R2", max_iterations=max_iterations,
                          start_iteration_callback=start_iteration_callback,
                          add_result_callback=add_result_callback, n_jobs=1)
    automl.search()

    assert counts["start_iteration_callback"] == max_iterations
    assert counts["add_result_callback"] == max_iterations
def test_automl_allowed_pipelines_init_allowed_both_not_specified(
        mock_fit, mock_score, X_y_regression,
        assert_allowed_pipelines_equal_helper):
    X, y = X_y_regression
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                          allowed_pipelines=None, allowed_model_families=None)
    mock_score.return_value = {automl.objective.name: 1.0}
    expected_pipelines = [
        make_pipeline(X, y, estimator, ProblemTypes.REGRESSION)
        for estimator in get_estimators(ProblemTypes.REGRESSION,
                                        model_families=None)
    ]
    assert_allowed_pipelines_equal_helper(automl.allowed_pipelines,
                                          expected_pipelines)
    assert set(automl.allowed_model_families) == set(
        [p.model_family for p in expected_pipelines])

    automl.search()
    mock_fit.assert_called()
    mock_score.assert_called()
def test_automl_allowed_pipelines_init_allowed_both_specified_multi(
        mock_fit, mock_score, dummy_multiclass_pipeline_class, X_y_multi,
        assert_allowed_pipelines_equal_helper):
    X, y = X_y_multi
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass',
                          allowed_pipelines=[dummy_multiclass_pipeline_class],
                          allowed_model_families=[ModelFamily.RANDOM_FOREST])
    mock_score.return_value = {automl.objective.name: 1.0}
    expected_pipelines = [dummy_multiclass_pipeline_class]
    assert automl.allowed_pipelines == expected_pipelines
    # the dummy multiclass pipeline estimator has model family NONE
    assert set(automl.allowed_model_families) == set([ModelFamily.NONE])

    automl.search()
    assert_allowed_pipelines_equal_helper(automl.allowed_pipelines,
                                          expected_pipelines)
    assert set(automl.allowed_model_families) == set(
        [p.model_family for p in expected_pipelines])
    mock_fit.assert_called()
    mock_score.assert_called()
def test_early_stopping(caplog, linear_regression_pipeline_class,
                        X_y_regression):
    X, y = X_y_regression
    tolerance = 0.005
    patience = 2
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                          objective='mse', max_time='60 seconds',
                          patience=patience, tolerance=tolerance,
                          allowed_model_families=['linear_model'],
                          random_seed=0, n_jobs=1)

    mock_results = {
        'search_order': [0, 1, 2],
        'pipeline_results': {}
    }
    scores = [150, 200, 195]
    for id in mock_results['search_order']:
        mock_results['pipeline_results'][id] = {}
        mock_results['pipeline_results'][id]['score'] = scores[id]
        mock_results['pipeline_results'][id]['pipeline_class'] = linear_regression_pipeline_class

    automl._results = mock_results
    automl._check_stopping_condition(time.time())
    out = caplog.text
    assert "2 iterations without improvement. Stopping search early." in out
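
# A minimal, self-contained sketch of the patience/tolerance rule the test above
# exercises (a hypothetical simplification, not evalml's implementation): for a
# minimized objective like MSE, stop once `patience` consecutive results fail to
# beat the best score so far by more than a relative `tolerance`.
def _sketch_should_stop_early(scores, patience=2, tolerance=0.005):
    best = scores[0]
    without_improvement = 0
    for score in scores[1:]:
        if score < best * (1 - tolerance):  # meaningful improvement (lower is better)
            best = score
            without_improvement = 0
        else:
            without_improvement += 1
        if without_improvement >= patience:
            return True
    return False


# mirrors the mocked scores above: neither 200 nor 195 improves on 150
assert _sketch_should_stop_early([150, 200, 195], patience=2, tolerance=0.005)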
def test_plot_iterations_max_time(X_y_binary):
    go = pytest.importorskip(
        'plotly.graph_objects',
        reason='Skipping plotting test because plotly not installed')
    X, y = X_y_binary

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective="f1", max_time=10, n_jobs=1)
    automl.search(show_iteration_plot=False)
    plot = automl.plot.search_iteration_plot()
    plot_data = plot.data[0]
    x = pd.Series(plot_data['x'])
    y = pd.Series(plot_data['y'])

    assert isinstance(plot, go.Figure)
    assert x.is_monotonic_increasing
    assert y.is_monotonic_increasing
    assert len(x) > 0
    assert len(y) > 0
def test_data_splitter(X_y_binary):
    X, y = X_y_binary
    cv_folds = 5

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          data_splitter=BalancedClassificationDataCVSplit(n_splits=cv_folds),
                          max_iterations=1, n_jobs=1)
    automl.search()
    assert isinstance(automl.rankings, pd.DataFrame)
    assert len(automl.results['pipeline_results'][0]["cv_data"]) == cv_folds

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          data_splitter=TimeSeriesSplit(n_splits=cv_folds),
                          max_iterations=1, n_jobs=1)
    automl.search()
    assert isinstance(automl.rankings, pd.DataFrame)
    assert len(automl.results['pipeline_results'][0]["cv_data"]) == cv_folds