def test_additional_objectives(X_y_binary):
    """Check that an additional objective appears in the per-fold scores.

    Runs a two-iteration binary search that optimizes 'F1' while passing a
    ``FraudCost`` instance through ``additional_objectives``, then asserts
    that 'Fraud Cost' is one of the keys of ``all_objective_scores`` for the
    first CV fold of pipeline 0.

    Args:
        X_y_binary: Fixture yielding an ``(X, y)`` pair.
    """
    X, y = X_y_binary

    fraud_cost = FraudCost(
        retry_percentage=.5,
        interchange_fee=.02,
        fraud_payout_percentage=.75,
        amount_col=10,
    )
    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='binary',
        objective='F1',
        max_iterations=2,
        additional_objectives=[fraud_cost],
        n_jobs=1,
    )
    automl.search()

    pipeline_description = automl.describe_pipeline(0, return_dict=True)
    first_fold_scores = pipeline_description["cv_data"][0]["all_objective_scores"]
    assert 'Fraud Cost' in first_fold_scores.keys()
def test_random_seed(X_y_regression):
    """Check that two searches with the same ``random_seed`` rank identically.

    Builds and runs two regression searches with identical arguments
    (``objective="R2"``, ``max_iterations=5``, ``random_seed=0``,
    ``n_jobs=1``) and compares their ``rankings`` frames.

    Args:
        X_y_regression: Fixture yielding an ``(X, y)`` pair.
    """
    X, y = X_y_regression

    def run_seeded_search():
        # Both runs must use exactly the same configuration, so build them
        # from a single place instead of duplicating the argument list.
        automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                              objective="R2", max_iterations=5, random_seed=0,
                              n_jobs=1)
        automl.search()
        return automl

    automl = run_seeded_search()
    automl_1 = run_seeded_search()

    # need to use assert_frame_equal as R2 could be different at the 10+ decimal.
    # It raises AssertionError on mismatch and returns None otherwise, so it is
    # called directly rather than wrapped in an ``assert ... is None``.
    pd.testing.assert_frame_equal(automl.rankings, automl_1.rankings)
def test_recall_error(X_y_binary):
    """Check that constructing a search with the 'recall' objective raises.

    Recall is a valid objective but it's not allowed in AutoML, so building
    ``AutoMLSearch`` with ``objective='recall'`` is expected to raise
    ``ValueError`` with a message matching
    'recall is not allowed in AutoML!'.

    Args:
        X_y_binary: Fixture yielding an ``(X, y)`` pair.
    """
    X, y = X_y_binary
    search_kwargs = {
        'X_train': X,
        'y_train': y,
        'problem_type': 'binary',
        'objective': 'recall',
        'max_iterations': 1,
    }
    with pytest.raises(ValueError, match='recall is not allowed in AutoML!'):
        AutoMLSearch(**search_kwargs)
예제 #4
0
def test_automl_time_series_classification_threshold(
        mock_binary_fit, mock_binary_score, mock_predict_proba,
        mock_encode_targets, mock_optimize_threshold, mock_split_data,
        optimize, objective, X_y_binary):
    """Check threshold handling for a 'time series binary' search.

    Runs a two-batch search with the given ``objective`` and
    ``optimize_thresholds=optimize``, asserts that the search uses a
    ``TimeSeriesSplit`` data splitter, and then verifies per case:

    * 'Log Loss Binary': threshold optimization and data splitting are not
      called and the best pipeline's threshold is ``None``.
    * 'F1' with ``optimize`` truthy: both mocks are called, the threshold is
      the mocked 0.62, and the third positional argument of the split call
      stringifies to the problem type.
    * 'F1' with ``optimize`` falsy: neither mock is called and the threshold
      is 0.5.

    The ``mock_*`` parameters and ``optimize`` / ``objective`` are supplied
    by the caller (presumably patch / parametrize decorators that are not
    visible here -- confirm).
    """
    X, y = X_y_binary
    problem_type = 'time series binary'
    configuration = {
        "gap": 0,
        "max_delay": 0,
        'delay_target': False,
        'delay_features': True
    }

    # Configure the mocks before the search is constructed and run.
    mock_binary_score.return_value = {objective: 0.4}
    mock_optimize_threshold.return_value = 0.62
    mock_split_data.return_value = split_data(
        X, y, problem_type, test_size=0.2, random_seed=0)

    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type=problem_type,
        problem_configuration=configuration,
        objective=objective,
        optimize_thresholds=optimize,
        max_batches=2,
    )
    automl.search()
    assert isinstance(automl.data_splitter, TimeSeriesSplit)

    if objective == 'Log Loss Binary':
        mock_optimize_threshold.assert_not_called()
        assert automl.best_pipeline.threshold is None
        mock_split_data.assert_not_called()
    elif objective == 'F1':
        if optimize:
            mock_optimize_threshold.assert_called()
            assert automl.best_pipeline.threshold == 0.62
            mock_split_data.assert_called()
            split_positional_args = mock_split_data.call_args[0]
            assert str(split_positional_args[2]) == problem_type
        else:
            mock_optimize_threshold.assert_not_called()
            assert automl.best_pipeline.threshold == 0.5
            mock_split_data.assert_not_called()
예제 #5
0
def test_non_optimizable_threshold(mock_fit, mock_score, X_y_binary):
    """Check that no threshold is recorded when optimizing 'AUC'.

    Runs a one-iteration binary search with ``objective='AUC'`` and
    ``optimize_thresholds=False``, then asserts that fit and score were
    called, that the best pipeline's threshold is ``None``, and that
    'binary_classification_threshold' is absent (or ``None``) in the first
    three CV folds of pipeline 0.

    ``mock_fit`` / ``mock_score`` are mock objects supplied by the caller
    (presumably patch decorators outside this view -- confirm).
    """
    mock_score.return_value = {"AUC": 1.0}
    X, y = X_y_binary
    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='binary',
        objective='AUC',
        optimize_thresholds=False,
        max_iterations=1,
    )
    automl.search()

    mock_fit.assert_called()
    mock_score.assert_called()
    assert automl.best_pipeline.threshold is None

    # Index explicitly so that fewer than three folds still fails loudly.
    for fold_index in range(3):
        fold = automl.results['pipeline_results'][0]['cv_data'][fold_index]
        assert fold.get('binary_classification_threshold') is None
예제 #6
0
def test_automl_allowed_pipelines_specified_allowed_pipelines(
        mock_fit, mock_score, dummy_regression_pipeline_class, X_y_regression):
    """Check ``allowed_pipelines`` is kept when model families are unset.

    Builds a regression search with a single allowed pipeline class and
    ``allowed_model_families=None`` and asserts -- both before and after
    ``search()`` -- that ``allowed_pipelines`` equals that single-class list
    and ``allowed_model_families`` equals ``[ModelFamily.NONE]``. Also
    asserts that the fit and score mocks were called by the search.

    ``mock_fit`` / ``mock_score`` are mock objects supplied by the caller
    (presumably patch decorators outside this view -- confirm).
    """
    X, y = X_y_regression
    automl = AutoMLSearch(X_train=X,
                          y_train=y,
                          problem_type='regression',
                          allowed_pipelines=[dummy_regression_pipeline_class],
                          allowed_model_families=None)
    # The objective name is only known once the search object exists, so the
    # mocked score is configured after construction.
    mock_score.return_value = {automl.objective.name: 1.0}
    expected_pipelines = [dummy_regression_pipeline_class]

    def assert_allowed_settings():
        # Same expectations must hold before and after the search runs.
        assert automl.allowed_pipelines == expected_pipelines
        assert automl.allowed_model_families == [ModelFamily.NONE]

    assert_allowed_settings()

    automl.search()
    mock_fit.assert_called()
    mock_score.assert_called()
    assert_allowed_settings()
def test_automl_allowed_pipelines_init_allowed_both_specified(
        mock_fit, mock_score, dummy_regression_pipeline_class, X_y_regression,
        assert_allowed_pipelines_equal_helper):
    """Check the allowed settings when pipelines and families are both given.

    Builds a regression search with one allowed pipeline instance and
    ``allowed_model_families=[ModelFamily.RANDOM_FOREST]``. Asserts, via the
    supplied helper, that ``allowed_pipelines`` matches a freshly built
    instance of the same class, and that ``allowed_model_families`` equals
    the set of model families of the expected pipelines. Finally runs the
    search and asserts that the fit and score mocks were called.

    ``mock_fit`` / ``mock_score`` are mock objects supplied by the caller
    (presumably patch decorators outside this view -- confirm).
    """
    X, y = X_y_regression
    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='regression',
        allowed_pipelines=[dummy_regression_pipeline_class({})],
        allowed_model_families=[ModelFamily.RANDOM_FOREST],
    )
    mock_score.return_value = {automl.objective.name: 1.0}

    expected_pipelines = [dummy_regression_pipeline_class({})]
    assert_allowed_pipelines_equal_helper(automl.allowed_pipelines,
                                          expected_pipelines)
    expected_families = {
        pipeline.model_family for pipeline in expected_pipelines
    }
    assert set(automl.allowed_model_families) == expected_families

    automl.search()
    mock_fit.assert_called()
    mock_score.assert_called()
예제 #8
0
def test_fraud_objective(X_y_binary):
    """Smoke-test a search that optimizes a ``FraudCost`` objective.

    Runs a one-iteration binary search with a ``FraudCost`` instance as the
    objective, then calls ``fit``, ``predict`` (with the objective),
    ``predict_proba`` and ``score`` on the best pipeline. There are no
    explicit assertions; the test passes if none of the calls raise.

    Args:
        X_y_binary: Fixture yielding an ``(X, y)`` pair.
    """
    X, y = X_y_binary

    fraud_cost = FraudCost(
        retry_percentage=.5,
        interchange_fee=.02,
        fraud_payout_percentage=.75,
        amount_col=10,
    )
    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='binary',
        objective=fraud_cost,
        max_iterations=1,
    )
    automl.search()

    best = automl.best_pipeline
    best.fit(X, y)
    best.predict(X, fraud_cost)
    best.predict_proba(X)
    best.score(X, y, [fraud_cost])
예제 #9
0
def test_automl_regression_nonlinear_pipeline_search(
        nonlinear_regression_pipeline_class, X_y_regression):
    """Check the iteration order of a regression search with one pipeline.

    Runs a two-iteration regression search restricted to
    ``nonlinear_regression_pipeline_class`` with a ``MagicMock`` as
    ``start_iteration_callback``. Asserts the callback was called twice and
    that the first positional argument was ``MeanBaselineRegressionPipeline``
    on the first call and the nonlinear pipeline class on the second.
    """
    X, y = X_y_regression
    start_iteration_callback = MagicMock()

    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='regression',
        max_iterations=2,
        start_iteration_callback=start_iteration_callback,
        allowed_pipelines=[nonlinear_regression_pipeline_class],
        n_jobs=1,
    )
    automl.search()

    def started_pipeline(call_index):
        # call_args_list[i] is (args, kwargs); take the first positional arg.
        return start_iteration_callback.call_args_list[call_index][0][0]

    assert start_iteration_callback.call_count == 2
    assert started_pipeline(0) == MeanBaselineRegressionPipeline
    assert started_pipeline(1) == nonlinear_regression_pipeline_class
def test_automl_binary_nonlinear_pipeline_search(
        nonlinear_binary_pipeline_class, X_y_binary):
    """Check the iteration order of a binary search with one pipeline.

    Runs a two-iteration binary search restricted to
    ``nonlinear_binary_pipeline_class`` with a ``MagicMock`` as
    ``start_iteration_callback``. Asserts the callback was called twice and
    that the first positional argument was ``ModeBaselineBinaryPipeline`` on
    the first call and the nonlinear pipeline class on the second.
    """
    X, y = X_y_binary
    start_iteration_callback = MagicMock()

    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='binary',
        max_iterations=2,
        start_iteration_callback=start_iteration_callback,
        allowed_pipelines=[nonlinear_binary_pipeline_class],
        n_jobs=1,
    )
    automl.search()

    def started_pipeline(call_index):
        # call_args_list[i] is (args, kwargs); take the first positional arg.
        return start_iteration_callback.call_args_list[call_index][0][0]

    assert start_iteration_callback.call_count == 2
    assert started_pipeline(0) == ModeBaselineBinaryPipeline
    assert started_pipeline(1) == nonlinear_binary_pipeline_class
def test_automl_allowed_pipelines_no_allowed_pipelines(automl_type, X_y_binary,
                                                       X_y_multi):
    """Check that an empty model-family list leaves nothing to search.

    Picks the multiclass or binary data and problem type depending on
    whether ``automl_type`` equals ``ProblemTypes.MULTICLASS``, then asserts
    that constructing ``AutoMLSearch`` with ``allowed_pipelines=None`` and
    ``allowed_model_families=[]`` raises ``ValueError`` matching
    "No allowed pipelines to search".

    ``automl_type`` is supplied by the caller (presumably a parametrize
    decorator that is not visible here -- confirm).
    """
    if automl_type == ProblemTypes.MULTICLASS:
        (X, y), problem_type = X_y_multi, 'multiclass'
    else:
        (X, y), problem_type = X_y_binary, 'binary'

    search_kwargs = {
        'X_train': X,
        'y_train': y,
        'problem_type': problem_type,
        'allowed_pipelines': None,
        'allowed_model_families': [],
    }
    with pytest.raises(ValueError, match="No allowed pipelines to search"):
        AutoMLSearch(**search_kwargs)
def test_plot_iterations_ipython_mock(mock_ipython_display, X_y_binary):
    """Check the interactive search-iteration plot and its display call.

    Skips when 'IPython.display' or 'plotly.graph_objects' cannot be
    imported. Otherwise runs a three-iteration binary search, requests the
    interactive search-iteration plot, and asserts that the result is a
    ``SearchIterationPlot`` whose ``data`` is an ``AutoMLSearch`` and that
    the display mock was last called with ``plot.best_score_by_iter_fig``.

    ``mock_ipython_display`` is a mock supplied by the caller (presumably a
    patch decorator that is not visible here -- confirm).
    """
    optional_modules = (
        ('IPython.display',
         'Skipping plotting test because ipywidgets not installed'),
        ('plotly.graph_objects',
         'Skipping plotting test because plotly not installed'),
    )
    for module_name, skip_reason in optional_modules:
        pytest.importorskip(module_name, reason=skip_reason)

    X, y = X_y_binary
    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='binary',
        objective="f1",
        max_iterations=3,
        n_jobs=1,
    )
    automl.search()

    iteration_plot = automl.plot.search_iteration_plot(interactive_plot=True)
    assert isinstance(iteration_plot, SearchIterationPlot)
    assert isinstance(iteration_plot.data, AutoMLSearch)
    mock_ipython_display.assert_called_with(
        iteration_plot.best_score_by_iter_fig)
예제 #13
0
def test_cbm_objective_automl(optimize_thresholds, X_y_binary):
    """Check a search optimizing ``CostBenefitMatrix`` yields no NaNs.

    Runs a two-iteration binary search with a ``CostBenefitMatrix`` objective
    and the given ``optimize_thresholds`` value, refits the best pipeline,
    and asserts that its predictions, predicted probabilities and
    'Cost Benefit Matrix' score contain no NaN values.

    ``optimize_thresholds`` is supplied by the caller (presumably a
    parametrize decorator that is not visible here -- confirm).
    """
    X, y = X_y_binary
    cbm = CostBenefitMatrix(
        true_positive=10,
        true_negative=-1,
        false_positive=-7,
        false_negative=-2,
    )
    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='binary',
        objective=cbm,
        max_iterations=2,
        optimize_thresholds=optimize_thresholds,
    )
    automl.search()

    best = automl.best_pipeline
    best.fit(X, y)

    predicted = best.predict(X, cbm)
    assert not np.isnan(predicted.to_series()).values.any()

    probabilities = best.predict_proba(X)
    assert not np.isnan(probabilities.to_dataframe()).values.any()

    scores = best.score(X, y, [cbm])
    assert not np.isnan(scores['Cost Benefit Matrix'])
def test_categorical_hyperparam(X_y_multi):
    """Smoke-test a search over a pipeline with a categorical hyperparameter.

    Defines a local multiclass pipeline class whose
    ``custom_hyperparameters`` hold a ``Categorical`` range and runs a
    multiclass search restricted to that class. There are no explicit
    assertions; the test passes if the search does not raise.

    Args:
        X_y_multi: Fixture yielding an ``(X, y)`` pair.
    """
    X, y = X_y_multi

    class CustomPipeline(MulticlassClassificationPipeline):
        component_graph = [
            'Imputer',
            'One Hot Encoder',
            'Standard Scaler',
            'Logistic Regression Classifier',
        ]
        # NOTE(review): the key below is 'Simple Imputer' while the graph
        # lists 'Imputer' -- confirm this is the intended component name.
        custom_hyperparameters = {
            'Simple Imputer': {
                'impute_strategy': Categorical(['mean', 'most_frequent']),
            },
        }

    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type="multiclass",
        allowed_pipelines=[CustomPipeline],
        n_jobs=1,
    )
    automl.search()
def test_plot_iterations_ipython_mock_import_failure(mock_ipython_display, X_y_binary):
    """Check the plot returned when the display call raises ``ImportError``.

    Skips when 'IPython.display' or 'plotly.graph_objects' cannot be
    imported. Otherwise runs a three-iteration binary search, makes the
    display mock raise ``ImportError``, and requests the interactive
    search-iteration plot. Asserts that the display mock was called exactly
    once, that the returned object is a plotly ``Figure`` with tuple
    ``data``, and that the first trace has three monotonically increasing
    x and y values.

    ``mock_ipython_display`` is a mock supplied by the caller (presumably a
    patch decorator that is not visible here -- confirm).
    """
    pytest.importorskip(
        'IPython.display',
        reason='Skipping plotting test because ipywidgets not installed')
    go = pytest.importorskip(
        'plotly.graph_objects',
        reason='Skipping plotting test because plotly not installed')
    X, y = X_y_binary

    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='binary',
        objective="f1",
        max_iterations=3,
        n_jobs=1,
    )
    automl.search()

    # Make the display step fail only after the search has finished.
    mock_ipython_display.side_effect = ImportError('KABOOOOOOMMMM')
    figure = automl.plot.search_iteration_plot(interactive_plot=True)
    mock_ipython_display.assert_called_once()

    assert isinstance(figure, go.Figure)
    assert isinstance(figure.data, tuple)

    first_trace = figure.data[0]
    x_values = pd.Series(first_trace['x'])
    y_values = pd.Series(first_trace['y'])
    assert x_values.is_monotonic_increasing
    assert y_values.is_monotonic_increasing
    assert len(x_values) == 3
    assert len(y_values) == 3
def test_automl_allowed_pipelines_search(mock_fit, mock_score,
                                         dummy_binary_pipeline_class,
                                         X_y_binary):
    """Check the iteration order of a binary search with one dummy pipeline.

    Runs a two-iteration binary search restricted to
    ``dummy_binary_pipeline_class`` with a ``MagicMock`` as
    ``start_iteration_callback``. Asserts the callback was called twice and
    that the first positional argument was ``ModeBaselineBinaryPipeline`` on
    the first call and the dummy pipeline class on the second.

    ``mock_fit`` / ``mock_score`` are mock objects supplied by the caller
    (presumably patch decorators outside this view -- confirm).
    """
    X, y = X_y_binary
    mock_score.return_value = {'Log Loss Binary': 1.0}
    start_iteration_callback = MagicMock()

    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='binary',
        max_iterations=2,
        start_iteration_callback=start_iteration_callback,
        allowed_pipelines=[dummy_binary_pipeline_class],
    )
    automl.search()

    def started_pipeline(call_index):
        # call_args_list[i] is (args, kwargs); take the first positional arg.
        return start_iteration_callback.call_args_list[call_index][0][0]

    assert start_iteration_callback.call_count == 2
    assert started_pipeline(0) == ModeBaselineBinaryPipeline
    assert started_pipeline(1) == dummy_binary_pipeline_class
예제 #17
0
def test_log_metrics_only_passed_directly(X_y_regression):
    """Check log-error objectives are rejected by name but accepted as objects.

    First asserts that passing 'RootMeanSquaredLogError' and
    'MeanSquaredLogError' as strings in ``additional_objectives`` raises
    ``ObjectiveNotFoundError`` matching
    "RootMeanSquaredLogError is not a valid Objective!". Then passes
    instances of the two objective classes instead and asserts the names of
    the resulting ``additional_objectives`` entries.

    Args:
        X_y_regression: Fixture yielding an ``(X, y)`` pair.
    """
    X, y = X_y_regression

    objectives_by_name = ['RootMeanSquaredLogError', 'MeanSquaredLogError']
    expected_error = "RootMeanSquaredLogError is not a valid Objective!"
    with pytest.raises(ObjectiveNotFoundError, match=expected_error):
        AutoMLSearch(X_train=X,
                     y_train=y,
                     problem_type='regression',
                     additional_objectives=objectives_by_name)

    objective_instances = [RootMeanSquaredLogError(), MeanSquaredLogError()]
    ar = AutoMLSearch(X_train=X,
                      y_train=y,
                      problem_type='regression',
                      additional_objectives=objective_instances)
    first_objective, second_objective = (ar.additional_objectives[0],
                                         ar.additional_objectives[1])
    assert first_objective.name == 'Root Mean Squared Log Error'
    assert second_objective.name == 'Mean Squared Log Error'
예제 #18
0
def test_automl_allowed_pipelines_search(mock_fit, mock_score,
                                         dummy_regression_pipeline_class,
                                         X_y_regression):
    """Check the iteration order of a regression search with one dummy pipeline.

    Runs a two-iteration regression search restricted to
    ``dummy_regression_pipeline_class`` with a ``MagicMock`` as
    ``start_iteration_callback``. Asserts the callback was called twice and
    that the first positional argument was ``MeanBaselineRegressionPipeline``
    on the first call and the dummy pipeline class on the second.

    ``mock_fit`` / ``mock_score`` are mock objects supplied by the caller
    (presumably patch decorators outside this view -- confirm).
    """
    X, y = X_y_regression
    mock_score.return_value = {'R2': 1.0}
    start_iteration_callback = MagicMock()

    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='regression',
        max_iterations=2,
        start_iteration_callback=start_iteration_callback,
        allowed_pipelines=[dummy_regression_pipeline_class],
    )
    automl.search()

    def started_pipeline(call_index):
        # call_args_list[i] is (args, kwargs); take the first positional arg.
        return start_iteration_callback.call_args_list[call_index][0][0]

    assert start_iteration_callback.call_count == 2
    assert started_pipeline(0) == MeanBaselineRegressionPipeline
    assert started_pipeline(1) == dummy_regression_pipeline_class
def test_automl_multiclass_nonlinear_pipeline_search_more_iterations(
        nonlinear_multiclass_pipeline_class, X_y_multi):
    """Check the iteration order of a five-iteration multiclass search.

    Runs a multiclass search with ``max_iterations=5`` restricted to
    ``nonlinear_multiclass_pipeline_class`` with a ``MagicMock`` as
    ``start_iteration_callback``. Asserts that the first positional argument
    was ``ModeBaselineMulticlassPipeline`` on the first call and the
    nonlinear pipeline class on the second and fifth calls.
    """
    X, y = X_y_multi
    start_iteration_callback = MagicMock()

    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='multiclass',
        max_iterations=5,
        start_iteration_callback=start_iteration_callback,
        allowed_pipelines=[nonlinear_multiclass_pipeline_class],
        n_jobs=1,
    )
    automl.search()

    def started_pipeline(call_index):
        # call_args_list[i] is (args, kwargs); take the first positional arg.
        return start_iteration_callback.call_args_list[call_index][0][0]

    assert started_pipeline(0) == ModeBaselineMulticlassPipeline
    assert started_pipeline(1) == nonlinear_multiclass_pipeline_class
    assert started_pipeline(4) == nonlinear_multiclass_pipeline_class
def test_multi_objective(X_y_multi):
    """Check ``problem_type`` is preserved across objective choices.

    Constructs ``AutoMLSearch`` six times on the same data -- with binary and
    multiclass problem types, each with explicit objectives and once with no
    objective -- and asserts after each construction that
    ``automl.problem_type`` equals the matching ``ProblemTypes`` member.

    Args:
        X_y_multi: Fixture yielding an ``(X, y)`` pair.
    """
    X, y = X_y_multi

    # (problem_type, extra constructor kwargs, expected ProblemTypes member),
    # evaluated in this order.
    cases = [
        ('binary', {'objective': "Log Loss Binary"}, ProblemTypes.BINARY),
        ('multiclass', {'objective': "Log Loss Multiclass"}, ProblemTypes.MULTICLASS),
        ('multiclass', {'objective': 'AUC Micro'}, ProblemTypes.MULTICLASS),
        ('binary', {'objective': 'AUC'}, ProblemTypes.BINARY),
        ('multiclass', {}, ProblemTypes.MULTICLASS),
        ('binary', {}, ProblemTypes.BINARY),
    ]
    for problem_type, extra_kwargs, expected_problem_type in cases:
        automl = AutoMLSearch(X_train=X,
                              y_train=y,
                              problem_type=problem_type,
                              **extra_kwargs)
        assert automl.problem_type == expected_problem_type
def test_optimizable_threshold_enabled(mock_fit, mock_score,
                                       mock_predict_proba, mock_encode_targets,
                                       mock_optimize_threshold, X_y_binary,
                                       caplog):
    """Check the optimized threshold is stored when optimization is enabled.

    Runs a one-iteration binary search on 'precision' with
    ``optimize_thresholds=True`` and the threshold-optimization mock
    returning 0.8. Asserts that the fit, score, predict_proba and
    optimize-threshold mocks were called, that the best pipeline's threshold
    is 0.8, and that 'binary_classification_threshold' is 0.8 in the first
    three CV folds of pipeline 0. Finally calls ``describe_pipeline(0)`` and
    checks the captured log text for the threshold-objective line.

    The ``mock_*`` parameters are supplied by the caller (presumably patch
    decorators outside this view -- confirm).
    """
    mock_optimize_threshold.return_value = 0.8
    X, y = X_y_binary
    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='binary',
        objective='precision',
        max_iterations=1,
        optimize_thresholds=True,
    )
    mock_score.return_value = {'precision': 1.0}
    automl.search()

    for expected_called in (mock_fit, mock_score, mock_predict_proba,
                            mock_optimize_threshold):
        expected_called.assert_called()

    assert automl.best_pipeline.threshold == 0.8
    # Index explicitly so that fewer than three folds still fails loudly.
    for fold_index in range(3):
        fold = automl.results['pipeline_results'][0]['cv_data'][fold_index]
        assert fold.get('binary_classification_threshold') == 0.8

    automl.describe_pipeline(0)
    logged_text = caplog.text
    assert "Objective to optimize binary classification pipeline thresholds for" in logged_text
예제 #22
0
def test_automl_time_series_classification_pickle_generated_pipeline(
        mock_binary_fit, mock_multi_fit, mock_binary_score,
        mock_multiclass_score, problem_type, X_y_binary, X_y_multi):
    """Check that generated time-series pipelines have the expected class and pickle.

    Picks binary or multiclass data and the matching generated pipeline
    class depending on whether ``problem_type`` equals
    ``ProblemTypes.TIME_SERIES_BINARY``, runs a search with a fixed problem
    configuration, and for every row of ``rankings`` asserts that the
    pipeline looked up by the row's 'id' has exactly the expected class and
    that a pickle round trip of it yields a truthy object.

    The ``mock_*`` parameters and ``problem_type`` are supplied by the
    caller (presumably patch / parametrize decorators that are not visible
    here -- confirm).
    """
    is_binary = problem_type == ProblemTypes.TIME_SERIES_BINARY
    X, y = X_y_binary if is_binary else X_y_multi
    expected_class = (GeneratedPipelineTimeSeriesBinary if is_binary
                      else GeneratedPipelineTimeSeriesMulticlass)

    configuration = {
        "gap": 0,
        "max_delay": 0,
        'delay_target': False,
        'delay_features': True
    }
    a = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type=problem_type,
        problem_configuration=configuration,
    )
    a.search()

    for _, ranking_row in a.rankings.iterrows():
        pipeline_id = ranking_row['id']
        assert a.get_pipeline(pipeline_id).__class__ == expected_class
        # Round-trips bytes produced right here, not external data.
        pickled = pickle.dumps(a.get_pipeline(pipeline_id))
        assert pickle.loads(pickled)
예제 #23
0
def test_plot_iterations_max_iterations(X_y_regression):
    """Check the search-iteration plot after a three-iteration search.

    Skips when 'plotly.graph_objects' cannot be imported. Otherwise runs a
    regression search with ``max_iterations=3`` and asserts that the
    search-iteration plot is a plotly ``Figure`` whose first trace has
    three monotonically increasing x and y values.

    Args:
        X_y_regression: Fixture yielding an ``(X, y)`` pair.
    """
    go = pytest.importorskip(
        'plotly.graph_objects',
        reason='Skipping plotting test because plotly not installed')
    X, y = X_y_regression

    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='regression',
        max_iterations=3,
        n_jobs=1,
    )
    automl.search()

    figure = automl.plot.search_iteration_plot()
    first_trace = figure.data[0]
    x_values = pd.Series(first_trace['x'])
    y_values = pd.Series(first_trace['y'])

    assert isinstance(figure, go.Figure)
    assert x_values.is_monotonic_increasing
    assert y_values.is_monotonic_increasing
    assert len(x_values) == 3
    assert len(y_values) == 3
예제 #24
0
def test_plot_disabled_missing_dependency(X_y_regression,
                                          has_minimal_dependencies):
    """Check plot access depending on the installed dependency set.

    Builds a regression search (without running it). When
    ``has_minimal_dependencies`` is truthy, accessing
    ``automl.plot.search_iteration_plot`` is expected to raise
    ``AttributeError``; otherwise the attribute access must succeed.
    """
    X, y = X_y_regression

    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='regression',
        max_iterations=3,
    )

    if not has_minimal_dependencies:
        # Bare attribute access on purpose: it must simply not raise.
        automl.plot.search_iteration_plot
        return

    with pytest.raises(AttributeError):
        automl.plot.search_iteration_plot
def test_callback(X_y_regression):
    """Check both search callbacks fire once per iteration.

    Registers a ``start_iteration_callback`` and an ``add_result_callback``
    that each increment their own counter, runs a regression search with
    ``max_iterations=3``, and asserts that both counters equal the number of
    iterations.

    Args:
        X_y_regression: Fixture yielding an ``(X, y)`` pair.
    """
    X, y = X_y_regression
    max_iterations = 3
    callback_names = ("start_iteration_callback", "add_result_callback")
    counts = dict.fromkeys(callback_names, 0)

    # ``counts`` is bound as a default argument so each callback updates the
    # shared dict defined above.
    def start_iteration_callback(pipeline_class, parameters, automl_obj, counts=counts):
        counts["start_iteration_callback"] += 1

    def add_result_callback(results, trained_pipeline, automl_obj, counts=counts):
        counts["add_result_callback"] += 1

    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='regression',
        objective="R2",
        max_iterations=max_iterations,
        start_iteration_callback=start_iteration_callback,
        add_result_callback=add_result_callback,
        n_jobs=1,
    )
    automl.search()

    for callback_name in callback_names:
        assert counts[callback_name] == max_iterations
예제 #26
0
def test_automl_allowed_pipelines_init_allowed_both_not_specified(
        mock_fit, mock_score, X_y_regression,
        assert_allowed_pipelines_equal_helper):
    """Check the default allowed settings when nothing is specified.

    Builds a regression search with ``allowed_pipelines=None`` and
    ``allowed_model_families=None``. The expected pipelines are built with
    ``make_pipeline`` for every estimator returned by
    ``get_estimators(ProblemTypes.REGRESSION, model_families=None)``.
    Asserts, via the supplied helper, that ``allowed_pipelines`` matches
    them and that ``allowed_model_families`` equals the set of their model
    families, then runs the search and asserts that the fit and score mocks
    were called.

    ``mock_fit`` / ``mock_score`` are mock objects supplied by the caller
    (presumably patch decorators outside this view -- confirm).
    """
    X, y = X_y_regression
    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='regression',
        allowed_pipelines=None,
        allowed_model_families=None,
    )
    mock_score.return_value = {automl.objective.name: 1.0}

    regression_estimators = get_estimators(ProblemTypes.REGRESSION,
                                           model_families=None)
    expected_pipelines = [
        make_pipeline(X, y, estimator, ProblemTypes.REGRESSION)
        for estimator in regression_estimators
    ]
    assert_allowed_pipelines_equal_helper(automl.allowed_pipelines,
                                          expected_pipelines)
    expected_families = {
        pipeline.model_family for pipeline in expected_pipelines
    }
    assert set(automl.allowed_model_families) == expected_families

    automl.search()
    mock_fit.assert_called()
    mock_score.assert_called()
def test_automl_allowed_pipelines_init_allowed_both_specified_multi(
        mock_fit, mock_score, dummy_multiclass_pipeline_class, X_y_multi,
        assert_allowed_pipelines_equal_helper):
    """Check the allowed settings for a multiclass search with both given.

    Builds a multiclass search with one allowed pipeline class and
    ``allowed_model_families=[ModelFamily.RANDOM_FOREST]``. Before the
    search, asserts that ``allowed_pipelines`` equals that single-class list
    and that ``allowed_model_families`` is ``{ModelFamily.NONE}``. After the
    search, re-checks the pipelines via the supplied helper, compares the
    model families against those of the expected pipelines, and asserts that
    the fit and score mocks were called.

    ``mock_fit`` / ``mock_score`` are mock objects supplied by the caller
    (presumably patch decorators outside this view -- confirm).
    """
    X, y = X_y_multi
    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='multiclass',
        allowed_pipelines=[dummy_multiclass_pipeline_class],
        allowed_model_families=[ModelFamily.RANDOM_FOREST],
    )
    mock_score.return_value = {automl.objective.name: 1.0}

    expected_pipelines = [dummy_multiclass_pipeline_class]
    assert automl.allowed_pipelines == expected_pipelines
    # the dummy multiclass pipeline estimator has model family NONE
    assert set(automl.allowed_model_families) == {ModelFamily.NONE}

    automl.search()

    assert_allowed_pipelines_equal_helper(automl.allowed_pipelines,
                                          expected_pipelines)
    expected_families = {
        pipeline.model_family for pipeline in expected_pipelines
    }
    assert set(automl.allowed_model_families) == expected_families
    mock_fit.assert_called()
    mock_score.assert_called()
def test_early_stopping(caplog, linear_regression_pipeline_class, X_y_regression):
    """Check the early-stopping log message once patience is exhausted.

    Builds a regression search on 'mse' with ``patience=2`` and
    ``tolerance=0.005``, injects a hand-built results dict with three
    pipeline scores (150, 200, 195) into ``automl._results``, and calls
    ``automl._check_stopping_condition`` with the current time. Asserts that
    the captured log text contains
    "2 iterations without improvement. Stopping search early.".

    Args:
        caplog: pytest log-capture fixture.
        linear_regression_pipeline_class: Pipeline class recorded for every
            mocked result.
        X_y_regression: Fixture yielding an ``(X, y)`` pair.
    """
    X, y = X_y_regression
    tolerance = 0.005
    patience = 2
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression', objective='mse', max_time='60 seconds',
                          patience=patience, tolerance=tolerance,
                          allowed_model_families=['linear_model'], random_seed=0, n_jobs=1)

    search_order = [0, 1, 2]
    scores = [150, 200, 195]
    # Pipeline ids double as indices into ``scores``.
    mock_results = {
        'search_order': search_order,
        'pipeline_results': {
            pipeline_id: {
                'score': scores[pipeline_id],
                'pipeline_class': linear_regression_pipeline_class,
            }
            for pipeline_id in search_order
        },
    }

    # Bypass a real search: the stopping check reads the injected results.
    automl._results = mock_results
    automl._check_stopping_condition(time.time())
    out = caplog.text
    assert "2 iterations without improvement. Stopping search early." in out
def test_plot_iterations_max_time(X_y_binary):
    """Check the search-iteration plot after a time-limited search.

    Skips when 'plotly.graph_objects' cannot be imported. Otherwise runs a
    binary search on "f1" with ``max_time=10`` (iteration plot disabled
    during the search) and asserts that the search-iteration plot is a
    plotly ``Figure`` whose first trace has non-empty, monotonically
    increasing x and y values.

    Args:
        X_y_binary: Fixture yielding an ``(X, y)`` pair.
    """
    go = pytest.importorskip(
        'plotly.graph_objects',
        reason='Skipping plotting test because plotly not installed')
    X, y = X_y_binary

    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='binary',
        objective="f1",
        max_time=10,
        n_jobs=1,
    )
    automl.search(show_iteration_plot=False)

    figure = automl.plot.search_iteration_plot()
    first_trace = figure.data[0]
    x_values = pd.Series(first_trace['x'])
    y_values = pd.Series(first_trace['y'])

    assert isinstance(figure, go.Figure)
    assert x_values.is_monotonic_increasing
    assert y_values.is_monotonic_increasing
    assert len(x_values) > 0
    assert len(y_values) > 0
def test_data_splitter(X_y_binary):
    """Check that a custom ``data_splitter`` controls the number of CV folds.

    For ``BalancedClassificationDataCVSplit`` and then ``TimeSeriesSplit``,
    each built with ``n_splits=5``, runs a one-iteration binary search and
    asserts that ``rankings`` is a ``DataFrame`` and that pipeline 0 has
    five entries in its ``cv_data``.

    Args:
        X_y_binary: Fixture yielding an ``(X, y)`` pair.
    """
    X, y = X_y_binary
    cv_folds = 5

    # Each splitter is instantiated right before its own search runs.
    for splitter_class in (BalancedClassificationDataCVSplit, TimeSeriesSplit):
        automl = AutoMLSearch(
            X_train=X,
            y_train=y,
            problem_type='binary',
            data_splitter=splitter_class(n_splits=cv_folds),
            max_iterations=1,
            n_jobs=1,
        )
        automl.search()

        assert isinstance(automl.rankings, pd.DataFrame)
        fold_results = automl.results['pipeline_results'][0]["cv_data"]
        assert len(fold_results) == cv_folds