def test_init(X_y_regression):
    """Smoke-test a regression ``AutoMLSearch`` with raw fixture data and with pandas inputs.

    The search runs twice with identical settings: once with the fixture data
    passed by keyword, once with the data wrapped in ``pd.DataFrame`` /
    ``pd.Series`` and passed positionally.
    """
    # NOTE(review): a second ``test_init`` (binary) appears later in this source;
    # if both live in the same module the later definition shadows this one.
    X, y = X_y_regression
    search_options = dict(problem_type='regression', objective="R2",
                          max_iterations=3, n_jobs=1)

    # Pass 1: fixture data handed over unchanged.
    automl = AutoMLSearch(X_train=X, y_train=y, **search_options)
    automl.search()
    assert automl.n_jobs == 1
    assert isinstance(automl.rankings, pd.DataFrame)
    assert isinstance(automl.best_pipeline, PipelineBase)
    automl.best_pipeline.predict(X)

    # Pass 2: test with dataframes.
    features = pd.DataFrame(X)
    target = pd.Series(y)
    automl = AutoMLSearch(features, target, **search_options)
    automl.search()
    assert isinstance(automl.rankings, pd.DataFrame)
    assert isinstance(automl.full_rankings, pd.DataFrame)
    assert isinstance(automl.best_pipeline, PipelineBase)
    automl.best_pipeline.predict(X)
    assert isinstance(automl.get_pipeline(0), PipelineBase)
def test_init(X_y_binary):
    """Smoke-test a binary ``AutoMLSearch`` with raw fixture data and with pandas inputs.

    No objective is passed; the second run asserts that the resulting
    objective is named ``'Log Loss Binary'``.
    """
    # NOTE(review): an earlier ``test_init`` (regression) appears in this source;
    # if both live in the same module this definition replaces it.
    X, y = X_y_binary
    shared = {'problem_type': 'binary', 'max_iterations': 1, 'n_jobs': 1}

    # Pass 1: fixture data handed over unchanged.
    automl = AutoMLSearch(X_train=X, y_train=y, **shared)
    automl.search()
    assert automl.n_jobs == 1
    assert isinstance(automl.rankings, pd.DataFrame)
    assert isinstance(automl.best_pipeline, PipelineBase)
    automl.best_pipeline.predict(X)

    # Pass 2: test with dataframes.
    features = pd.DataFrame(X)
    target = pd.Series(y)
    automl = AutoMLSearch(features, target, **shared)
    automl.search()
    assert isinstance(automl.rankings, pd.DataFrame)
    assert isinstance(automl.full_rankings, pd.DataFrame)
    assert isinstance(automl.best_pipeline, PipelineBase)
    assert isinstance(automl.get_pipeline(0), PipelineBase)
    assert automl.objective.name == 'Log Loss Binary'
    automl.best_pipeline.predict(X)
def test_non_optimizable_threshold_multi(mock_fit, mock_score, X_y_multi):
    """Check that a multiclass best pipeline exposes no ``threshold`` attribute.

    The same scenario is run with default settings and again with
    ``optimize_thresholds=True``; in both cases accessing ``threshold`` must
    raise ``AttributeError``.

    ``mock_fit`` / ``mock_score`` are presumably injected by patch decorators
    that are not visible in this view.
    """
    mock_score.return_value = {"Log Loss Multiclass": 0.5}
    X, y = X_y_multi

    # Empty dict = library defaults; second entry switches threshold tuning on.
    for extra_options in ({}, {'optimize_thresholds': True}):
        automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass',
                              objective='Log Loss Multiclass', max_iterations=1,
                              **extra_options)
        automl.search()
        mock_fit.assert_called()
        mock_score.assert_called()
        with pytest.raises(AttributeError):
            automl.best_pipeline.threshold
def test_data_splitter(X_y_binary):
    """Check that a custom ``data_splitter`` determines the number of CV folds recorded.

    Runs one search per splitter class and asserts that the first pipeline
    result carries exactly ``cv_folds`` entries in its ``cv_data``.
    """
    X, y = X_y_binary
    cv_folds = 5

    for splitter_class in (BalancedClassificationDataCVSplit, TimeSeriesSplit):
        splitter = splitter_class(n_splits=cv_folds)
        automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                              data_splitter=splitter, max_iterations=1, n_jobs=1)
        automl.search()

        assert isinstance(automl.rankings, pd.DataFrame)
        first_result = automl.results['pipeline_results'][0]
        assert len(first_result["cv_data"]) == cv_folds
def test_early_stopping(caplog, logistic_regression_binary_pipeline_class, X_y_binary):
    """Check early-stopping argument validation and the early-stop log message (binary).

    First verifies that an invalid ``patience`` or ``tolerance`` raises
    ``ValueError`` at construction time. Then injects hand-written results
    into a valid search object and asserts that the stopping check logs the
    "2 iterations without improvement" message.
    """
    # NOTE(review): a second ``test_early_stopping`` (regression) appears later in
    # this source; if both live in the same module the later one shadows this one.
    X, y = X_y_binary
    base_kwargs = dict(X_train=X, y_train=y, problem_type='binary', objective='AUC',
                       max_iterations=5, allowed_model_families=['linear_model'],
                       random_seed=0)

    # Only the raised error matters here, so the instances are not bound to a name.
    with pytest.raises(ValueError, match='patience value must be a positive integer.'):
        AutoMLSearch(patience=-1, **base_kwargs)

    with pytest.raises(ValueError, match='tolerance value must be'):
        AutoMLSearch(patience=1, tolerance=1.5, **base_kwargs)

    automl = AutoMLSearch(patience=2, tolerance=0.05, n_jobs=1, **base_kwargs)

    # 0.96 is only 1% greater so it doesn't trigger patience due to tolerance
    scores = [0.95, 0.84, 0.96]
    search_order = [0, 1, 2]
    mock_results = {
        'search_order': search_order,
        # ``pipeline_id`` (rather than ``id``) avoids shadowing the builtin.
        'pipeline_results': {
            pipeline_id: {
                'score': scores[pipeline_id],
                'pipeline_class': logistic_regression_binary_pipeline_class,
            }
            for pipeline_id in search_order
        },
    }

    # Bypass an actual search: feed the fabricated results straight in.
    automl._results = mock_results
    automl._check_stopping_condition(time.time())

    out = caplog.text
    assert "2 iterations without improvement. Stopping search early." in out
def test_log_metrics_only_passed_directly(X_y_regression):
    """Check that log-error objectives are rejected by name but accepted as instances.

    Passing the objectives as strings must raise ``ObjectiveNotFoundError``;
    passing instances must succeed and preserve their order and names.
    """
    X, y = X_y_regression

    objectives_by_name = ['RootMeanSquaredLogError', 'MeanSquaredLogError']
    with pytest.raises(ObjectiveNotFoundError,
                       match="RootMeanSquaredLogError is not a valid Objective!"):
        AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                     additional_objectives=objectives_by_name)

    objectives_by_instance = [RootMeanSquaredLogError(), MeanSquaredLogError()]
    ar = AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                      additional_objectives=objectives_by_instance)

    expected_names = ['Root Mean Squared Log Error', 'Mean Squared Log Error']
    for position, expected_name in enumerate(expected_names):
        assert ar.additional_objectives[position].name == expected_name
def test_random_seed(X_y_binary):
    """Check that two binary searches with the same ``random_seed`` produce equal rankings."""
    # NOTE(review): a second ``test_random_seed`` (regression) appears later in
    # this source; if both live in the same module the later one shadows this one.
    X, y = X_y_binary

    def run_search():
        """Build and run one search with the fixed configuration under test."""
        search = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                              objective=Precision(), max_iterations=5,
                              random_seed=0, n_jobs=1)
        search.search()
        return search

    automl = run_search()
    automl_1 = run_search()

    assert automl.rankings.equals(automl_1.rankings)
def test_random_seed(X_y_regression):
    """Check that two regression searches with the same ``random_seed`` produce matching rankings."""
    # NOTE(review): an earlier ``test_random_seed`` (binary) appears in this source;
    # if both live in the same module this definition replaces it.
    X, y = X_y_regression

    def run_search():
        """Build and run one search with the fixed configuration under test."""
        search = AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                              objective="R2", max_iterations=5,
                              random_seed=0, n_jobs=1)
        search.search()
        return search

    automl = run_search()
    automl_1 = run_search()

    # need to use assert_frame_equal as R2 could be different at the 10+ decimal
    comparison = pd.testing.assert_frame_equal(automl.rankings, automl_1.rankings)
    assert comparison is None
def test_init_objective(X_y_binary):
    """Check that the objective is a ``Precision`` instance whether given as an instance or by name."""
    X, y = X_y_binary

    # The same objective, spelled first as an instance and then as a string.
    for objective_spec in (Precision(), 'Precision'):
        automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                              objective=objective_spec, max_iterations=1)
        assert isinstance(automl.objective, Precision)
def test_plot_iterations_ipython_mock_import_failure(mock_ipython_display, X_y_binary):
    """Check that the search-iteration plot is still returned when the display call raises ``ImportError``.

    The test is skipped when ``IPython.display`` or ``plotly.graph_objects``
    cannot be imported. ``mock_ipython_display`` is presumably injected by a
    patch decorator that is not visible in this view.
    """
    pytest.importorskip(
        'IPython.display',
        reason='Skipping plotting test because ipywidgets not installed')
    go = pytest.importorskip(
        'plotly.graph_objects',
        reason='Skipping plotting test because plotly not installed')

    X, y = X_y_binary
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective="f1", max_iterations=3, n_jobs=1)
    automl.search()

    # Make the mocked display call fail only after the search has finished.
    mock_ipython_display.side_effect = ImportError('KABOOOOOOMMMM')
    plot = automl.plot.search_iteration_plot(interactive_plot=True)
    mock_ipython_display.assert_called_once()

    assert isinstance(plot, go.Figure)
    assert isinstance(plot.data, tuple)

    first_trace = plot.data[0]
    x_values = pd.Series(first_trace['x'])
    y_values = pd.Series(first_trace['y'])
    assert x_values.is_monotonic_increasing
    assert y_values.is_monotonic_increasing
    # One point per iteration (max_iterations=3).
    assert len(x_values) == 3
    assert len(y_values) == 3
def test_optimizable_threshold_disabled(mock_fit, mock_score, mock_predict_proba,
                                        mock_encode_targets, mock_optimize_threshold,
                                        X_y_binary):
    """Check that ``optimize_thresholds=False`` leaves the threshold at 0.5.

    Neither ``predict_proba`` nor the threshold optimizer may be called, and
    the best pipeline as well as the first three CV folds must report 0.5.

    The ``mock_*`` arguments are presumably injected by patch decorators that
    are not visible in this view.
    """
    mock_optimize_threshold.return_value = 0.8
    X, y = X_y_binary
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective='precision', max_iterations=1,
                          optimize_thresholds=False)
    mock_score.return_value = {automl.objective.name: 1.0}
    automl.search()

    mock_fit.assert_called()
    mock_score.assert_called()
    assert not mock_predict_proba.called
    assert not mock_optimize_threshold.called

    assert automl.best_pipeline.threshold == 0.5
    cv_data = automl.results['pipeline_results'][0]['cv_data']
    for fold_index in range(3):
        assert cv_data[fold_index].get('binary_classification_threshold') == 0.5
def test_optimizable_threshold_enabled(mock_fit, mock_score, mock_predict_proba,
                                       mock_encode_targets, mock_optimize_threshold,
                                       X_y_binary, caplog):
    """Check that ``optimize_thresholds=True`` applies the optimized threshold.

    With the optimizer mocked to return 0.8, the best pipeline and the first
    three CV folds must report 0.8, and ``describe_pipeline`` must log the
    threshold-optimization objective line.

    The ``mock_*`` arguments are presumably injected by patch decorators that
    are not visible in this view.
    """
    mock_optimize_threshold.return_value = 0.8
    X, y = X_y_binary
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective='precision', max_iterations=1,
                          optimize_thresholds=True)
    # Key the mocked score by the resolved objective name rather than the
    # lower-case alias used to request it, matching the disabled-threshold test.
    mock_score.return_value = {automl.objective.name: 1.0}
    automl.search()

    mock_fit.assert_called()
    mock_score.assert_called()
    mock_predict_proba.assert_called()
    mock_optimize_threshold.assert_called()

    assert automl.best_pipeline.threshold == 0.8
    cv_data = automl.results['pipeline_results'][0]['cv_data']
    for fold_index in range(3):
        assert cv_data[fold_index].get('binary_classification_threshold') == 0.8

    automl.describe_pipeline(0)
    out = caplog.text
    assert "Objective to optimize binary classification pipeline thresholds for" in out
def test_callback(X_y_binary):
    """Check how often the start-iteration and add-result callbacks fire (binary).

    The start callback is expected once per binary estimator plus one; the
    add-result callback is expected ``max_iterations`` times.
    """
    # NOTE(review): a second ``test_callback`` (regression) appears later in this
    # source; if both live in the same module the later one shadows this one.
    X, y = X_y_binary
    counts = dict.fromkeys(("start_iteration_callback", "add_result_callback"), 0)

    # ``counts`` is bound as a default argument so each callback mutates the
    # shared tally above.
    def start_iteration_callback(pipeline_class, parameters, automl_obj, counts=counts):
        counts["start_iteration_callback"] += 1

    def add_result_callback(results, trained_pipeline, automl_obj, counts=counts):
        counts["add_result_callback"] += 1

    max_iterations = 3
    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='binary',
        objective=Precision(),
        max_iterations=max_iterations,
        start_iteration_callback=start_iteration_callback,
        add_result_callback=add_result_callback,
        n_jobs=1,
    )
    automl.search()

    expected_start_calls = len(get_estimators('binary')) + 1
    assert counts["start_iteration_callback"] == expected_start_calls
    assert counts["add_result_callback"] == max_iterations
def test_automl_time_series_classification_pickle_generated_pipeline(
        mock_binary_fit, mock_multi_fit, mock_binary_score, mock_multiclass_score,
        problem_type, X_y_binary, X_y_multi):
    """Check that generated time-series classification pipelines have the expected class and pickle.

    ``problem_type`` is presumably supplied by a parametrize decorator and the
    ``mock_*`` arguments by patch decorators, none of which are visible here.
    """
    is_binary = problem_type == ProblemTypes.TIME_SERIES_BINARY
    X, y = X_y_binary if is_binary else X_y_multi
    pipeline = (GeneratedPipelineTimeSeriesBinary if is_binary
                else GeneratedPipelineTimeSeriesMulticlass)

    configuration = dict(gap=0, max_delay=0, delay_target=False, delay_features=True)
    a = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type,
                     problem_configuration=configuration)
    a.search()

    for _, ranking_row in a.rankings.iterrows():
        pipeline_id = ranking_row['id']
        assert a.get_pipeline(pipeline_id).__class__ == pipeline
        # Round-trip through pickle; the restored object must be truthy.
        assert pickle.loads(pickle.dumps(a.get_pipeline(pipeline_id)))
def test_callback(X_y_regression):
    """Check that both search callbacks fire ``max_iterations`` times (regression)."""
    # NOTE(review): an earlier ``test_callback`` (binary) appears in this source;
    # if both live in the same module this definition replaces it.
    X, y = X_y_regression
    counts = dict.fromkeys(("start_iteration_callback", "add_result_callback"), 0)

    # ``counts`` is bound as a default argument so each callback mutates the
    # shared tally above.
    def start_iteration_callback(pipeline_class, parameters, automl_obj, counts=counts):
        counts["start_iteration_callback"] += 1

    def add_result_callback(results, trained_pipeline, automl_obj, counts=counts):
        counts["add_result_callback"] += 1

    max_iterations = 3
    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='regression',
        objective="R2",
        max_iterations=max_iterations,
        start_iteration_callback=start_iteration_callback,
        add_result_callback=add_result_callback,
        n_jobs=1,
    )
    automl.search()

    assert counts["start_iteration_callback"] == max_iterations
    assert counts["add_result_callback"] == max_iterations
def test_automl_supports_time_series_regression(mock_fit, mock_score, X_y_regression):
    """Check that a time-series regression search uses ``TimeSeriesSplit`` and propagates the configuration.

    The result with id 0 must be the time-series baseline pipeline; every
    other result must carry the problem configuration under both the
    ``'Delayed Feature Transformer'`` and ``'pipeline'`` parameter keys.

    ``mock_fit`` / ``mock_score`` are presumably injected by patch decorators
    that are not visible in this view.
    """
    X, y = X_y_regression
    configuration = dict(gap=0, max_delay=0, delay_target=False, delay_features=True)

    automl = AutoMLSearch(X_train=X, y_train=y,
                          problem_type="time series regression",
                          problem_configuration=configuration,
                          max_batches=2)
    automl.search()
    assert isinstance(automl.data_splitter, TimeSeriesSplit)

    for result in automl.results['pipeline_results'].values():
        if result["id"] == 0:
            assert result['pipeline_class'] == TimeSeriesBaselineRegressionPipeline
        else:
            assert result['parameters']['Delayed Feature Transformer'] == configuration
            assert result['parameters']['pipeline'] == configuration
def test_automl_allowed_pipelines_init_allowed_both_not_specified_multi(
        mock_fit, mock_score, X_y_multi, assert_allowed_pipelines_equal_helper):
    """Check the default allowed pipelines for multiclass when nothing is restricted.

    With both ``allowed_pipelines`` and ``allowed_model_families`` set to
    ``None``, the allowed pipelines must match one pipeline per multiclass
    estimator, both before and after the search.

    ``mock_fit`` / ``mock_score`` are presumably injected by patch decorators
    that are not visible in this view.
    """
    X, y = X_y_multi
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass',
                          allowed_pipelines=None, allowed_model_families=None)
    mock_score.return_value = {automl.objective.name: 1.0}

    multiclass_estimators = get_estimators(ProblemTypes.MULTICLASS, model_families=None)
    expected_pipelines = [
        make_pipeline(X, y, estimator, ProblemTypes.MULTICLASS)
        for estimator in multiclass_estimators
    ]

    assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines)
    automl.search()
    # Searching must not alter the allowed pipelines.
    assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines)

    expected_families = {p.model_family for p in expected_pipelines}
    assert set(automl.allowed_model_families) == expected_families
    mock_fit.assert_called()
    mock_score.assert_called()
def test_early_stopping(caplog, linear_regression_pipeline_class, X_y_regression):
    """Check that the early-stop message is logged for fabricated regression results.

    Hand-written results whose later scores (200, 195) are both higher than
    the first (150) are injected into the search object; the stopping check
    must then log the "2 iterations without improvement" message.
    """
    # NOTE(review): an earlier ``test_early_stopping`` (binary) appears in this
    # source; if both live in the same module this definition replaces it.
    X, y = X_y_regression
    tolerance = 0.005
    patience = 2

    # NOTE(review): other tests in this source pass ``random_seed``; confirm which
    # keyword the targeted AutoMLSearch version expects before changing this.
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                          objective='mse', max_time='60 seconds',
                          patience=patience, tolerance=tolerance,
                          allowed_model_families=['linear_model'],
                          random_state=0, n_jobs=1)

    scores = [150, 200, 195]
    search_order = [0, 1, 2]
    mock_results = {
        'search_order': search_order,
        # ``pipeline_id`` (rather than ``id``) avoids shadowing the builtin.
        'pipeline_results': {
            pipeline_id: {
                'score': scores[pipeline_id],
                'pipeline_class': linear_regression_pipeline_class,
            }
            for pipeline_id in search_order
        },
    }

    # Bypass an actual search: feed the fabricated results straight in.
    automl._results = mock_results
    automl._check_stopping_condition(time.time())

    out = caplog.text
    assert "2 iterations without improvement. Stopping search early." in out
def test_automl_time_series_classification_threshold(
        mock_binary_fit, mock_binary_score, mock_predict_proba,
        mock_optimize_threshold, mock_split_data, optimize, objective, X_y_binary):
    """Check threshold handling for time-series binary searches.

    Expectations by case:

    * ``'Log Loss Binary'``: no optimization, threshold is ``None``, no data split.
    * ``'F1'`` with ``optimize`` truthy: optimizer called, threshold 0.62, data
      split called with the problem type as third positional argument.
    * ``'F1'`` with ``optimize`` falsy: no optimization, threshold 0.5, no data split.

    ``optimize`` / ``objective`` are presumably supplied by a parametrize
    decorator and the ``mock_*`` arguments by patch decorators, none of which
    are visible here.
    """
    X, y = X_y_binary
    mock_binary_score.return_value = {objective: 0.4}
    problem_type = 'time series binary'
    configuration = dict(gap=0, max_delay=0, delay_target=False, delay_features=True)
    mock_optimize_threshold.return_value = 0.62
    mock_split_data.return_value = split_data(X, y, problem_type,
                                              test_size=0.2, random_state=0)

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type,
                          problem_configuration=configuration,
                          objective=objective,
                          optimize_thresholds=optimize,
                          max_batches=2)
    automl.search()
    assert isinstance(automl.data_splitter, TimeSeriesSplit)

    if objective == 'Log Loss Binary':
        mock_optimize_threshold.assert_not_called()
        assert automl.best_pipeline.threshold is None
        mock_split_data.assert_not_called()
        return

    # Any objective other than the two handled ones has no further checks.
    if objective != 'F1':
        return

    if optimize:
        mock_optimize_threshold.assert_called()
        assert automl.best_pipeline.threshold == 0.62
        mock_split_data.assert_called()
        assert str(mock_split_data.call_args[0][2]) == problem_type
    else:
        mock_optimize_threshold.assert_not_called()
        assert automl.best_pipeline.threshold == 0.5
        mock_split_data.assert_not_called()
def test_automl_supports_time_series_classification(
        mock_binary_fit, mock_multi_fit, mock_binary_score, mock_multiclass_score,
        problem_type, X_y_binary, X_y_multi):
    """Check that time-series classification searches use ``TimeSeriesSplit`` and propagate the configuration.

    The result with id 0 must be the matching time-series baseline pipeline;
    every other result must carry the problem configuration under both the
    ``'Delayed Feature Transformer'`` and ``'pipeline'`` parameter keys.

    ``problem_type`` is presumably supplied by a parametrize decorator and the
    ``mock_*`` arguments by patch decorators, none of which are visible here.
    """
    if problem_type == ProblemTypes.TIME_SERIES_BINARY:
        X, y = X_y_binary
        baseline = TimeSeriesBaselineBinaryPipeline
        mock_binary_score.return_value = {"Log Loss Binary": 0.2}
        # From here on the problem type is passed in its string form.
        problem_type = 'time series binary'
    else:
        X, y = X_y_multi
        baseline = TimeSeriesBaselineMulticlassPipeline
        mock_multiclass_score.return_value = {"Log Loss Multiclass": 0.25}
        problem_type = 'time series multiclass'

    configuration = dict(gap=0, max_delay=0, delay_target=False, delay_features=True)
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type,
                          problem_configuration=configuration, max_batches=2)
    automl.search()
    assert isinstance(automl.data_splitter, TimeSeriesSplit)

    for result in automl.results['pipeline_results'].values():
        if result["id"] == 0:
            assert result['pipeline_class'] == baseline
        else:
            assert result['parameters']['Delayed Feature Transformer'] == configuration
            assert result['parameters']['pipeline'] == configuration
def test_lead_scoring_objective(X_y_binary):
    """Exercise a ``LeadScoring`` objective through a search and through direct calls.

    After a one-iteration search, the best pipeline is fit, used for
    prediction and scored. The objective's ``decision_function`` is then
    checked on a pandas Series and a numpy array with threshold 1, and its
    ``score`` on the resulting decisions is expected to be 0.5.
    """
    X, y = X_y_binary
    objective = LeadScoring(true_positives=1, false_positives=-1)

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective=objective, max_iterations=1, random_seed=0)
    automl.search()

    pipeline = automl.best_pipeline
    pipeline.fit(X, y)
    pipeline.predict(X)
    pipeline.predict_proba(X)
    pipeline.score(X, y, [objective])

    raw_values = [1, 10, .5, 5]

    # Series input.
    series_decisions = objective.decision_function(pd.Series(raw_values), 1)
    y_true = pd.Series([False, True, False, True])
    assert series_decisions.tolist() == [False, True, False, True]

    # ndarray input must give the same decisions.
    array_decisions = objective.decision_function(np.array(raw_values), 1)
    assert array_decisions.tolist() == y_true.to_list()

    score = objective.score(array_decisions, y_true)
    assert score == 0.5
def test_automl_allowed_pipelines_specified_allowed_model_families_binary(
        mock_fit, mock_score, X_y_binary, assert_allowed_pipelines_equal_helper):
    """Check that restricting ``allowed_model_families`` to random forest works for enum and string forms.

    The scenario is run twice, once with ``ModelFamily.RANDOM_FOREST`` and
    once with the string ``'random_forest'``, resetting the mocks in between.

    ``mock_fit`` / ``mock_score`` are presumably injected by patch decorators
    that are not visible in this view.
    """
    X, y = X_y_binary

    def build_expected_pipelines():
        """Return one pipeline per binary random-forest estimator."""
        estimators = get_estimators(ProblemTypes.BINARY,
                                    model_families=[ModelFamily.RANDOM_FOREST])
        return [make_pipeline(X, y, estimator, ProblemTypes.BINARY)
                for estimator in estimators]

    def verify(search, expected_pipelines):
        """Run the search and check the allowed pipelines, families and mock usage."""
        assert_allowed_pipelines_equal_helper(search.allowed_pipelines, expected_pipelines)
        search.search()
        assert_allowed_pipelines_equal_helper(search.allowed_pipelines, expected_pipelines)
        assert set(search.allowed_model_families) == {ModelFamily.RANDOM_FOREST}
        mock_fit.assert_called()
        mock_score.assert_called()

    # Family given as the enum member.
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          allowed_pipelines=None,
                          allowed_model_families=[ModelFamily.RANDOM_FOREST])
    mock_score.return_value = {automl.objective.name: 1.0}
    verify(automl, build_expected_pipelines())

    # Clear recorded calls so the second run's assert_called checks are meaningful.
    mock_fit.reset_mock()
    mock_score.reset_mock()

    # Family given as its string name.
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          allowed_pipelines=None,
                          allowed_model_families=['random_forest'])
    verify(automl, build_expected_pipelines())
def test_automl_allowed_pipelines_no_allowed_pipelines(X_y_regression):
    """Check that an empty ``allowed_model_families`` raises ``ValueError`` at construction (regression)."""
    # NOTE(review): a second test with this exact name (classification) appears
    # later in this source; if both live in the same module the later one
    # shadows this one.
    X, y = X_y_regression
    search_kwargs = dict(X_train=X, y_train=y, problem_type='regression',
                         allowed_pipelines=None, allowed_model_families=[])

    with pytest.raises(ValueError, match="No allowed pipelines to search"):
        AutoMLSearch(**search_kwargs)
def test_automl_allowed_pipelines_no_allowed_pipelines(automl_type, X_y_binary, X_y_multi):
    """Check that an empty ``allowed_model_families`` fails at search time (classification).

    Construction must succeed with ``allowed_pipelines`` left as ``None``;
    the ``ValueError`` is expected only when ``search()`` is called.

    ``automl_type`` is presumably supplied by a parametrize decorator that is
    not visible in this view.
    """
    # NOTE(review): an earlier test with this exact name (regression) appears in
    # this source; if both live in the same module this definition replaces it.
    if automl_type == ProblemTypes.MULTICLASS:
        X, y = X_y_multi
        problem_type = 'multiclass'
    else:
        X, y = X_y_binary
        problem_type = 'binary'

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type,
                          allowed_pipelines=None, allowed_model_families=[])
    assert automl.allowed_pipelines is None

    with pytest.raises(ValueError, match="No allowed pipelines to search"):
        automl.search()
def test_automl_time_series_regression_pickle_generated_pipeline(mock_fit, mock_score,
                                                                 X_y_regression):
    """Check that generated time-series regression pipelines have the expected class and pickle.

    ``mock_fit`` / ``mock_score`` are presumably injected by patch decorators
    that are not visible in this view.
    """
    X, y = X_y_regression
    configuration = dict(gap=0, max_delay=0, delay_target=False, delay_features=True)

    a = AutoMLSearch(X_train=X, y_train=y,
                     problem_type="time series regression",
                     problem_configuration=configuration)
    a.search()

    for _, ranking_row in a.rankings.iterrows():
        pipeline_id = ranking_row['id']
        assert a.get_pipeline(pipeline_id).__class__ == GeneratedPipelineTimeSeriesRegression
        # Round-trip through pickle; the restored object must be truthy.
        assert pickle.loads(pickle.dumps(a.get_pipeline(pipeline_id)))
def test_plot_disabled_missing_dependency(X_y_regression, has_minimal_dependencies):
    """Check access to ``plot.search_iteration_plot`` with and without minimal dependencies.

    With minimal dependencies the attribute access must raise
    ``AttributeError``; otherwise it must succeed.
    """
    X, y = X_y_regression
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                          max_iterations=3)

    if not has_minimal_dependencies:
        # Bare attribute access: only checks that nothing is raised.
        automl.plot.search_iteration_plot
        return

    with pytest.raises(AttributeError):
        automl.plot.search_iteration_plot
def test_binary_auto(X_y_binary):
    """Check that the best binary pipeline is fitted and predicts exactly two distinct classes."""
    X, y = X_y_binary
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective="Log Loss Binary", max_iterations=5, n_jobs=1)
    automl.search()

    best_pipeline = automl.best_pipeline
    assert best_pipeline._is_fitted

    y_pred = best_pipeline.predict(X)
    distinct_predictions = np.unique(y_pred.to_series())
    assert len(distinct_predictions) == 2
def test_recall_error(X_y_binary):
    """Check that requesting the ``recall`` objective raises ``ValueError`` at construction."""
    X, y = X_y_binary
    search_kwargs = dict(X_train=X, y_train=y, problem_type='binary',
                         objective='recall', max_iterations=1)

    # Recall is a valid objective but it's not allowed in AutoML so a ValueError is expected
    error_msg = 'recall is not allowed in AutoML!'
    with pytest.raises(ValueError, match=error_msg):
        AutoMLSearch(**search_kwargs)
def test_categorical_classification(X_y_categorical_classification):
    """Check that a binary search on the categorical fixture yields at least one non-null mean CV score."""
    X, y = X_y_categorical_classification
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective="precision", max_iterations=5, n_jobs=1)
    automl.search()

    mean_scores = automl.rankings["mean_cv_score"]
    assert not mean_scores.isnull().all()
def test_max_time(X_y_binary):
    """Check that a near-zero ``max_time`` still produces exactly one pipeline result."""
    X, y = X_y_binary
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          max_time=1e-16, n_jobs=1)
    automl.search()

    # search will always run at least one pipeline
    pipeline_results = automl.results['pipeline_results']
    assert len(pipeline_results) == 1