Example #1
def test_partial_dependence_baseline():
    X = pd.DataFrame([[1, 0], [0, 1]])
    y = pd.Series([0, 1])
    pipeline = BinaryClassificationPipeline(component_graph=["Baseline Classifier"], parameters={})
    pipeline.fit(X, y)
    with pytest.raises(ValueError, match="Partial dependence plots are not supported for Baseline pipelines"):
        partial_dependence(pipeline, X, features=0, grid_resolution=20)
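For contrast, a minimal sketch of the same partial_dependence call on a pipeline that does support it (the data, the Random Forest component, and the asserts are illustrative assumptions; imports are the same as in the surrounding examples):

def sketch_partial_dependence_supported_pipeline():
    X = pd.DataFrame({"a": list(range(100)), "b": list(range(100, 200))})
    y = pd.Series([i % 2 for i in range(100)])
    pipeline = BinaryClassificationPipeline(component_graph=["Random Forest Classifier"])
    pipeline.fit(X, y)
    # One row per grid value for a binary problem (see the grid_resolution example below).
    dep = partial_dependence(pipeline, X, features="a", grid_resolution=20)
    assert dep.shape[0] == 20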
Example #2
    def test_submit_scoring_jobs_multiple(self):
        """ Test that scoring multiple pipelines using the parallel engine produces the
        same results as the sequential engine. """
        X, y = self.X_y_binary
        pipelines = [BinaryClassificationPipeline(component_graph=["Logistic Regression Classifier"],
                                                  parameters={"Logistic Regression Classifier": {"n_jobs": 1}}),
                     BinaryClassificationPipeline(component_graph=["Baseline Classifier"]),
                     BinaryClassificationPipeline(component_graph=["SVM Classifier"])]

        def score_pipelines(pipelines, engine):
            futures = []
            for pipeline in pipelines:
                futures.append(engine.submit_training_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                          automl_config=automl_data, pipeline=pipeline))
            pipelines = [f.get_result() for f in futures]
            futures = []
            for pipeline in pipelines:
                futures.append(engine.submit_scoring_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                         automl_config=automl_data, pipeline=pipeline,
                                                         objectives=[automl_data.objective]))
            results = [f.get_result() for f in futures]
            return results

        par_eval_results = score_pipelines(pipelines, DaskEngine(client=self.client))
        par_scores = [s["Log Loss Binary"] for s in par_eval_results]

        seq_eval_results = score_pipelines(pipelines, SequentialEngine())
        seq_scores = [s["Log Loss Binary"] for s in seq_eval_results]

        # Check that there are the proper number of pipelines and that all their scores are the same.
        assert len(par_eval_results) == len(pipelines)
        assert set(par_scores) == set(seq_scores)
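The Dask tests on this page reference a class-level client (self.client). A minimal sketch, assuming dask.distributed is available, of how such a client might be created and handed to DaskEngine (the worker counts are arbitrary):

from dask.distributed import Client

client = Client(n_workers=2, threads_per_worker=1)  # small local cluster; sizing is illustrative
engine = DaskEngine(client=client)
# ... submit training/scoring jobs exactly as in score_pipelines above ...
client.close()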
Example #3
    def test_submit_training_jobs_multiple(self):
        """ Test that training multiple pipelines using the parallel engine produces the
        same results as the sequential engine. """
        X, y = self.X_y_binary
        pipelines = [BinaryClassificationPipeline(component_graph=["Logistic Regression Classifier"],
                                                  parameters={"Logistic Regression Classifier": {"n_jobs": 1}}),
                     BinaryClassificationPipeline(component_graph=["Baseline Classifier"]),
                     BinaryClassificationPipeline(component_graph=["SVM Classifier"])]

        def fit_pipelines(pipelines, engine):
            futures = []
            for pipeline in pipelines:
                futures.append(engine.submit_training_job(X=X, y=y, automl_config=automl_data, pipeline=pipeline))
            results = [f.get_result() for f in futures]
            return results

        # Verify all pipelines are trained and fitted by the sequential engine.
        seq_pipelines = fit_pipelines(pipelines, SequentialEngine())
        for pipeline in seq_pipelines:
            assert pipeline._is_fitted

        # Verify all pipelines are trained and fitted by the parallel engine.
        par_pipelines = fit_pipelines(pipelines, DaskEngine(client=self.client))
        for pipeline in par_pipelines:
            assert pipeline._is_fitted

        # Ensure sequential and parallel pipelines are equivalent
        assert len(par_pipelines) == len(seq_pipelines)
        for par_pipeline in par_pipelines:
            assert par_pipeline in seq_pipelines
Example #4
    def _method(hyperparameters=['default', 'other']):
        class MockEstimator(Estimator):
            name = "Mock Classifier"
            model_family = ModelFamily.RANDOM_FOREST
            supported_problem_types = [
                ProblemTypes.BINARY, ProblemTypes.MULTICLASS
            ]
            if isinstance(hyperparameters,
                          (list, tuple, Real, Categorical, Integer)):
                hyperparameter_ranges = {'dummy_parameter': hyperparameters}
            else:
                hyperparameter_ranges = {'dummy_parameter': [hyperparameters]}

            def __init__(self,
                         dummy_parameter='default',
                         n_jobs=-1,
                         random_seed=0,
                         **kwargs):
                super().__init__(parameters={
                    'dummy_parameter': dummy_parameter,
                    **kwargs, 'n_jobs': n_jobs
                },
                                 component_obj=None,
                                 random_seed=random_seed)

        return [
            BinaryClassificationPipeline([MockEstimator]),
            BinaryClassificationPipeline([MockEstimator]),
            BinaryClassificationPipeline([MockEstimator])
        ]
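A hedged usage sketch for this factory: Real, Categorical and Integer presumably come from skopt.space (an assumption), and passing one of those objects is kept as the hyperparameter range, while any other value is wrapped in a one-element list:

from skopt.space import Categorical  # assumed source of the space classes used above

pipelines = _method(hyperparameters=Categorical(["default", "other"]))
assert len(pipelines) == 3  # the factory always returns three single-estimator pipelines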
Example #5
def test_partial_dependence_respect_grid_resolution():
    X, y = load_fraud(1000)

    pl = BinaryClassificationPipeline(component_graph=["DateTime Featurization Component", "One Hot Encoder", "Random Forest Classifier"])
    pl.fit(X, y)
    dep = partial_dependence(pl, X, features="amount", grid_resolution=20)

    assert dep.shape[0] == 20
    assert dep.shape[0] != max(X.select('categorical').describe().loc["nunique"]) + 1

    dep = partial_dependence(pl, X, features="provider", grid_resolution=20)
    assert dep.shape[0] == X['provider'].to_series().nunique()
    assert dep.shape[0] != max(X.select('categorical').describe().loc["nunique"]) + 1
Example #6
def test_graph_partial_dependence_regression_date_order(X_y_binary):
    pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed')

    X, y = X_y_binary
    pipeline = BinaryClassificationPipeline(component_graph=['Imputer', 'One Hot Encoder', 'DateTime Featurization Component', 'Standard Scaler', 'Logistic Regression Classifier'])
    X = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
    y = pd.Series(y)
    dt_series = pd.Series(pd.date_range('20200101', periods=X.shape[0])).sample(frac=1).reset_index(drop=True)
    X['dt_column'] = pd.to_datetime(dt_series, errors='coerce')

    pipeline.fit(X, y)

    fig = graph_partial_dependence(pipeline, X, features='dt_column', grid_resolution=5)
    plot_data = fig.to_dict()['data'][0]
    assert plot_data['type'] == 'scatter'
    assert plot_data['x'].tolist() == list(pd.date_range('20200101', periods=X.shape[0]))
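graph_partial_dependence returns a plotly Figure, so standard plotly calls apply (a usage sketch, not part of the test; the file name is illustrative):

fig.show()                                    # render interactively
fig.write_html("partial_dependence_dt.html")  # or persist as a standalone HTML file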
Example #7
def test_generate_code_pipeline_with_custom_components():
    class CustomTransformer(Transformer):
        name = "My Custom Transformer"
        hyperparameter_ranges = {}

        def __init__(self, random_seed=0):
            parameters = {}

            super().__init__(parameters=parameters,
                             component_obj=None,
                             random_seed=random_seed)

    class CustomEstimator(Estimator):
        name = "My Custom Estimator"
        hyperparameter_ranges = {}
        supported_problem_types = [
            ProblemTypes.BINARY, ProblemTypes.MULTICLASS
        ]
        model_family = ModelFamily.NONE

        def __init__(self, random_arg=False, random_seed=0):
            parameters = {'random_arg': random_arg}

            super().__init__(parameters=parameters,
                             component_obj=None,
                             random_seed=random_seed)

    mock_pipeline_with_custom_components = BinaryClassificationPipeline(
        [CustomTransformer, CustomEstimator])
    expected_code = "from evalml.pipelines.binary_classification_pipeline import BinaryClassificationPipeline\n" \
        "pipeline = BinaryClassificationPipeline(component_graph=[CustomTransformer, CustomEstimator], " \
        "parameters={'My Custom Estimator':{'random_arg': False}}, random_seed=0)"
    pipeline = generate_pipeline_code(mock_pipeline_with_custom_components)
    assert pipeline == expected_code
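generate_pipeline_code returns plain Python source, so the pipeline can be rebuilt by executing that string; a sketch under the assumption that the custom component classes are supplied in the exec namespace (the generated code references them by name):

code = generate_pipeline_code(mock_pipeline_with_custom_components)
namespace = {"CustomTransformer": CustomTransformer, "CustomEstimator": CustomEstimator}
exec(code, namespace)            # the generated source defines a `pipeline` variable
rebuilt = namespace["pipeline"]
assert rebuilt.parameters["My Custom Estimator"] == {"random_arg": False}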
Example #8
def test_partial_dependence_xgboost_feature_names(problem_type, has_minimal_dependencies,
                                                  X_y_binary, X_y_multi, X_y_regression):
    if has_minimal_dependencies:
        pytest.skip("Skipping because XGBoost not installed for minimal dependencies")
    if problem_type == ProblemTypes.REGRESSION:
        pipeline = RegressionPipeline(component_graph=['Simple Imputer', 'XGBoost Regressor'],
                                      parameters={'XGBoost Regressor': {'nthread': 1}})
        X, y = X_y_regression
    elif problem_type == ProblemTypes.BINARY:
        pipeline = BinaryClassificationPipeline(component_graph=['Simple Imputer', 'XGBoost Classifier'],
                                                parameters={'XGBoost Classifier': {'nthread': 1}})
        X, y = X_y_binary
    elif problem_type == ProblemTypes.MULTICLASS:
        pipeline = MulticlassClassificationPipeline(component_graph=['Simple Imputer', 'XGBoost Classifier'],
                                                    parameters={'XGBoost Classifier': {'nthread': 1}})
        X, y = X_y_multi

    X = pd.DataFrame(X)
    X = X.rename(columns={0: '<[0]'})
    pipeline.fit(X, y)
    part_dep = partial_dependence(pipeline, X, features="<[0]", grid_resolution=20)
    check_partial_dependence_dataframe(pipeline, part_dep)
    assert not part_dep.isnull().all().all()

    part_dep = partial_dependence(pipeline, X, features=1, grid_resolution=20)
    check_partial_dependence_dataframe(pipeline, part_dep)
    assert not part_dep.isnull().all().all()
Example #9
def test_stacked_different_input_pipelines_regression():
    input_pipelines = [
        RegressionPipeline([RandomForestRegressor]),
        BinaryClassificationPipeline([RandomForestClassifier])
    ]
    with pytest.raises(ValueError,
                       match="All pipelines must have the same problem type."):
        StackedEnsembleRegressor(input_pipelines=input_pipelines)
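The valid counterpart, sketched under the assumption that all input pipelines share the regression problem type; RandomForestRegressor and n_jobs=1 are reused from the other examples on this page:

input_pipelines = [
    RegressionPipeline([RandomForestRegressor]),
    RegressionPipeline([RandomForestRegressor]),
]
ensemble = StackedEnsembleRegressor(input_pipelines=input_pipelines, n_jobs=1)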
Example #10
def test_evaluate_pipeline_handles_ensembling_indices(
        mock_fit, mock_score, dummy_binary_pipeline_class,
        stackable_classifiers):
    X = ww.DataTable(pd.DataFrame({"a": [i for i in range(100)]}))
    y = ww.DataColumn(pd.Series([i % 2 for i in range(100)]))

    automl = AutoMLSearch(X_train=X,
                          y_train=y,
                          problem_type='binary',
                          max_batches=19,
                          ensembling=True,
                          _ensembling_split_size=0.25)

    training_indices, ensembling_indices, _, _ = split_data(
        ww.DataTable(np.arange(X.shape[0])),
        y,
        problem_type='binary',
        test_size=0.25,
        random_seed=0)
    training_indices, ensembling_indices = training_indices.to_dataframe(
    )[0].tolist(), ensembling_indices.to_dataframe()[0].tolist()

    pipeline1 = dummy_binary_pipeline_class({'Mock Classifier': {'a': 1}})

    _ = evaluate_pipeline(pipeline1, automl, X, y, logger=MagicMock())
    # check the fit length is correct, taking into account the data splits
    assert len(mock_fit.call_args[0][0]) == int(2 / 3 * len(training_indices))

    input_pipelines = [
        BinaryClassificationPipeline([classifier])
        for classifier in stackable_classifiers
    ]
    ensemble = BinaryClassificationPipeline(
        [StackedEnsembleClassifier],
        parameters={
            "Stacked Ensemble Classifier": {
                "input_pipelines": input_pipelines,
                "n_jobs": 1
            }
        })

    _ = evaluate_pipeline(ensemble, automl, X, y, logger=MagicMock())
    assert len(mock_fit.call_args[0][0]) == int(2 / 3 *
                                                len(ensembling_indices))
Example #11
def test_generate_code_pipeline_json_with_objects():
    class CustomEstimator(Estimator):
        name = "My Custom Estimator"
        hyperparameter_ranges = {}
        supported_problem_types = [
            ProblemTypes.BINARY, ProblemTypes.MULTICLASS
        ]
        model_family = ModelFamily.NONE

        def __init__(self, random_arg=False, numpy_arg=[], random_seed=0):
            parameters = {'random_arg': random_arg, 'numpy_arg': numpy_arg}

            super().__init__(parameters=parameters,
                             component_obj=None,
                             random_seed=random_seed)

    component_graph = ['Imputer', CustomEstimator]
    pipeline = BinaryClassificationPipeline(
        component_graph,
        custom_name="Mock Binary Pipeline with Transformer",
        parameters={'My Custom Estimator': {
            'numpy_arg': np.array([0])
        }})
    generated_pipeline_code = generate_pipeline_code(pipeline)
    assert generated_pipeline_code == "from evalml.pipelines.binary_classification_pipeline import BinaryClassificationPipeline\n" \
        "pipeline = BinaryClassificationPipeline(component_graph=['Imputer', CustomEstimator], " \
        "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, " \
        "'My Custom Estimator':{'random_arg': False, 'numpy_arg': array([0])}}, custom_name='Mock Binary Pipeline with Transformer', random_seed=0)"

    pipeline = BinaryClassificationPipeline(
        component_graph,
        custom_name="Mock Binary Pipeline with Transformer",
        parameters={'My Custom Estimator': {
            'random_arg': Imputer()
        }})
    generated_pipeline_code = generate_pipeline_code(pipeline)
    assert generated_pipeline_code == "from evalml.pipelines.binary_classification_pipeline import BinaryClassificationPipeline\n" \
        "pipeline = BinaryClassificationPipeline(component_graph=['Imputer', CustomEstimator], " \
        "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, " \
        "'My Custom Estimator':{'random_arg': Imputer(categorical_impute_strategy='most_frequent', numeric_impute_strategy='mean', categorical_fill_value=None, numeric_fill_value=None), 'numpy_arg': []}}, " \
        "custom_name='Mock Binary Pipeline with Transformer', random_seed=0)"
Example #12
    def test_submit_scoring_job_single(self):
        """ Test that scoring a single pipeline using the parallel engine produces the
        same results as simply running the score_pipeline function. """
        X, y = self.X_y_binary
        pipeline = BinaryClassificationPipeline(component_graph=["Logistic Regression Classifier"],
                                                parameters={"Logistic Regression Classifier": {"n_jobs": 1}})
        engine = DaskEngine(client=self.client)
        objectives = [automl_data.objective]

        pipeline_future = engine.submit_training_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                     automl_config=automl_data, pipeline=pipeline)
        pipeline = pipeline_future.get_result()
        pipeline_score_future = engine.submit_scoring_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                          automl_config=automl_data, pipeline=pipeline,
                                                          objectives=objectives)
        assert isinstance(pipeline_score_future, DaskComputation)
        pipeline_score = pipeline_score_future.get_result()

        original_pipeline_score = pipeline.score(X=X, y=y, objectives=objectives)

        assert not np.isnan(pipeline_score["Log Loss Binary"])
        assert pipeline_score == original_pipeline_score
Example #13
    def test_submit_evaluate_jobs_multiple(self):
        """ Test that evaluating multiple pipelines using the parallel engine produces the
        same results as the sequential engine. """
        X, y = self.X_y_binary
        pipelines = [BinaryClassificationPipeline(component_graph=["Logistic Regression Classifier"],
                                                  parameters={"Logistic Regression Classifier": {"n_jobs": 1}}),
                     BinaryClassificationPipeline(component_graph=["Baseline Classifier"]),
                     BinaryClassificationPipeline(component_graph=["SVM Classifier"])]

        def eval_pipelines(pipelines, engine):
            futures = []
            for pipeline in pipelines:
                futures.append(engine.submit_evaluation_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                            automl_config=automl_data, pipeline=pipeline))
            results = [f.get_result() for f in futures]
            return results

        par_eval_results = eval_pipelines(pipelines, DaskEngine(client=self.client))
        par_dicts = [s.get("scores") for s in par_eval_results]
        par_scores = [s["cv_data"][0]["mean_cv_score"] for s in par_dicts]
        par_pipelines = [s.get("pipeline") for s in par_eval_results]

        seq_eval_results = eval_pipelines(pipelines, SequentialEngine())
        seq_dicts = [s.get("scores") for s in seq_eval_results]
        seq_scores = [s["cv_data"][0]["mean_cv_score"] for s in seq_dicts]
        seq_pipelines = [s.get("pipeline") for s in seq_eval_results]

        # Ensure all pipelines are fitted.
        assert all([s._is_fitted for s in par_pipelines])

        # Ensure the parallel and sequential scores are the same.
        assert set(par_scores) == set(seq_scores)
        assert not any([np.isnan(s) for s in par_scores])

        # Ensure the parallel and sequential pipelines match.
        assert len(par_pipelines) == len(seq_pipelines)
        for par_pipeline in par_pipelines:
            assert par_pipeline in seq_pipelines
Example #14
def test_compute_shap_values_catches_shap_tree_warnings(mock_tree_explainer, mock_debug, X_y_binary, caplog):
    X, y = X_y_binary
    pipeline = BinaryClassificationPipeline(["Random Forest Classifier"])

    def raise_warning_from_shap(estimator, feature_perturbation):
        warnings.warn("Shap raised a warning!")
        mock = MagicMock()
        mock.shap_values.return_value = np.zeros(10)
        return mock

    mock_tree_explainer.side_effect = raise_warning_from_shap

    _ = _compute_shap_values(pipeline, pd.DataFrame(X))
    mock_debug.debug.assert_called_with("_compute_shap_values TreeExplainer: Shap raised a warning!")
Example #15
    def test_submit_training_job_single(self):
        """ Test that training a single pipeline using the parallel engine produces the
        same results as simply running the train_pipeline function. """
        X, y = self.X_y_binary
        engine = DaskEngine(client=self.client)
        pipeline = BinaryClassificationPipeline(component_graph=["Logistic Regression Classifier"],
                                                parameters={"Logistic Regression Classifier": {"n_jobs": 1}})

        # Verify that engine fits a pipeline
        pipeline_future = engine.submit_training_job(X=X, y=y, automl_config=automl_data, pipeline=pipeline)
        dask_pipeline_fitted = pipeline_future.get_result()
        assert dask_pipeline_fitted._is_fitted

        # Verify parallelization has no effect on output of function
        original_pipeline_fitted = train_pipeline(pipeline, X, y, optimize_thresholds=automl_data.optimize_thresholds,
                                                  objective=automl_data.objective)
        assert dask_pipeline_fitted == original_pipeline_fitted
        assert dask_pipeline_fitted.predict(X) == original_pipeline_fitted.predict(X)
Example #16
def test_generate_code_pipeline():
    custom_hyperparameters = {
        "Imputer": {
            "numeric_impute_strategy": 'most_frequent'
        }
    }

    binary_pipeline = BinaryClassificationPipeline(
        ['Imputer', 'Random Forest Classifier'],
        custom_hyperparameters=custom_hyperparameters)
    expected_code = "from evalml.pipelines.binary_classification_pipeline import BinaryClassificationPipeline\n" \
        "pipeline = BinaryClassificationPipeline(component_graph=['Imputer', 'Random Forest Classifier'], " \
        "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, " \
        "'Random Forest Classifier':{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}, custom_hyperparameters={'Imputer':{'numeric_impute_strategy': 'most_frequent'}}, random_seed=0)"
    pipeline = generate_pipeline_code(binary_pipeline)
    assert expected_code == pipeline

    regression_pipeline = RegressionPipeline(
        ['Imputer', 'Random Forest Regressor'],
        custom_name="Mock Regression Pipeline")
    expected_code = "from evalml.pipelines.regression_pipeline import RegressionPipeline\n" \
        "pipeline = RegressionPipeline(component_graph=['Imputer', 'Random Forest Regressor'], parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, " \
        "'Random Forest Regressor':{'n_estimators': 100, 'max_depth': 6, 'n_jobs': -1}}, custom_name='Mock Regression Pipeline', random_seed=0)"
    pipeline = generate_pipeline_code(regression_pipeline)
    assert pipeline == expected_code

    regression_pipeline_with_params = RegressionPipeline(
        ['Imputer', 'Random Forest Regressor'],
        custom_name="Mock Regression Pipeline",
        parameters={
            "Imputer": {
                "numeric_impute_strategy": "most_frequent"
            },
            "Random Forest Regressor": {
                "n_estimators": 50
            }
        })
    expected_code_params = "from evalml.pipelines.regression_pipeline import RegressionPipeline\n" \
        "pipeline = RegressionPipeline(component_graph=['Imputer', 'Random Forest Regressor'], " \
        "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'most_frequent', 'categorical_fill_value': None, 'numeric_fill_value': None}, " \
        "'Random Forest Regressor':{'n_estimators': 50, 'max_depth': 6, 'n_jobs': -1}}, custom_name='Mock Regression Pipeline', random_seed=0)"
    pipeline = generate_pipeline_code(regression_pipeline_with_params)
    assert pipeline == expected_code_params
Example #17
    def test_submit_evaluate_job_single(self):
        """ Test that evaluating a single pipeline using the parallel engine produces the
        same results as simply running the evaluate_pipeline function. """
        X, y = self.X_y_binary
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        pipeline = BinaryClassificationPipeline(component_graph=["Logistic Regression Classifier"],
                                                parameters={"Logistic Regression Classifier": {"n_jobs": 1}})

        engine = DaskEngine(client=self.client)

        # Verify that engine evaluates a pipeline
        pipeline_future = engine.submit_evaluation_job(X=X, y=y,
                                                       automl_config=automl_data, pipeline=pipeline)
        assert isinstance(pipeline_future, DaskComputation)

        par_eval_results = pipeline_future.get_result()

        original_eval_results = evaluate_pipeline(pipeline, automl_config=automl_data, X=X, y=y, logger=JobLogger())

        # Ensure we get back the same output as the parallelized function.
        assert len(par_eval_results) == 3

        par_scores = par_eval_results.get("scores")
        original_eval_scores = original_eval_results.get("scores")

        # Compare cross validation information except training time.
        assert par_scores["cv_data"] == original_eval_scores["cv_data"]
        assert all(par_scores["cv_scores"] == original_eval_scores["cv_scores"])
        assert par_scores["cv_score_mean"] == par_scores["cv_score_mean"]

        # Make sure the resulting pipelines are the same.
        assert isinstance(par_eval_results.get("pipeline"), PipelineBase)
        assert par_eval_results.get("pipeline") == original_eval_results.get("pipeline")

        # Make sure a properly filled logger comes back.
        assert isinstance(par_eval_results.get("logger"), JobLogger)
        assert par_eval_results.get("logger").logs == original_eval_results.get("logger").logs
Example #18
def test_partial_dependence_datetime(problem_type, X_y_regression, X_y_binary, X_y_multi):
    if problem_type == 'binary':
        X, y = X_y_binary
        pipeline = BinaryClassificationPipeline(component_graph=['Imputer', 'One Hot Encoder', 'DateTime Featurization Component', 'Standard Scaler', 'Logistic Regression Classifier'])
    elif problem_type == 'multiclass':
        X, y = X_y_multi
        pipeline = MulticlassClassificationPipeline(component_graph=['Imputer', 'One Hot Encoder', 'DateTime Featurization Component', 'Standard Scaler', 'Logistic Regression Classifier'])
    else:
        X, y = X_y_regression
        pipeline = RegressionPipeline(component_graph=['Imputer', 'One Hot Encoder', 'DateTime Featurization Component', 'Standard Scaler', 'Linear Regressor'])

    X = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
    y = pd.Series(y)
    X['dt_column'] = pd.Series(pd.date_range('20200101', periods=X.shape[0]))

    pipeline.fit(X, y)
    part_dep = partial_dependence(pipeline, X, features='dt_column')
    if problem_type == 'multiclass':
        assert len(part_dep["partial_dependence"]) == 300  # 100 rows * 3 classes
        assert len(part_dep["feature_values"]) == 300
    else:
        assert len(part_dep["partial_dependence"]) == 100
        assert len(part_dep["feature_values"]) == 100
    assert not part_dep.isnull().any(axis=None)

    part_dep = partial_dependence(pipeline, X, features=20)
    if problem_type == 'multiclass':
        assert len(part_dep["partial_dependence"]) == 300  # 100 rows * 3 classes
        assert len(part_dep["feature_values"]) == 300
    else:
        assert len(part_dep["partial_dependence"]) == 100
        assert len(part_dep["feature_values"]) == 100
    assert not part_dep.isnull().any(axis=None)

    with pytest.raises(ValueError, match='Two-way partial dependence is not supported for datetime columns.'):
        part_dep = partial_dependence(pipeline, X, features=('0', 'dt_column'))
    with pytest.raises(ValueError, match='Two-way partial dependence is not supported for datetime columns.'):
        part_dep = partial_dependence(pipeline, X, features=(0, 20))
Example #19
def test_iterative_algorithm_frozen_parameters():
    class MockEstimator(Estimator):
        name = "Mock Classifier"
        model_family = ModelFamily.RANDOM_FOREST
        supported_problem_types = [
            ProblemTypes.BINARY, ProblemTypes.MULTICLASS
        ]
        hyperparameter_ranges = {
            'dummy_int_parameter': Integer(1, 10),
            'dummy_categorical_parameter':
            Categorical(["random", "dummy", "test"]),
            'dummy_real_parameter': Real(0, 1)
        }

        def __init__(self,
                     dummy_int_parameter=0,
                     dummy_categorical_parameter='dummy',
                     dummy_real_parameter=1.0,
                     n_jobs=-1,
                     random_seed=0,
                     **kwargs):
            super().__init__(parameters={
                'dummy_int_parameter': dummy_int_parameter,
                'dummy_categorical_parameter': dummy_categorical_parameter,
                'dummy_real_parameter': dummy_real_parameter,
                **kwargs, 'n_jobs': n_jobs
            },
                             component_obj=None,
                             random_seed=random_seed)

    pipeline = BinaryClassificationPipeline([MockEstimator])
    algo = IterativeAlgorithm(allowed_pipelines=[pipeline, pipeline, pipeline],
                              pipeline_params={
                                  'pipeline': {
                                      'date_index': "Date",
                                      "gap": 2,
                                      "max_delay": 10
                                  }
                              },
                              random_seed=0,
                              _frozen_pipeline_parameters={
                                  "Mock Classifier": {
                                      'dummy_int_parameter': 6,
                                      'dummy_categorical_parameter': "random",
                                      'dummy_real_parameter': 0.1
                                  }
                              })

    next_batch = algo.next_batch()
    assert all([
        p.parameters['pipeline'] == {
            'date_index': "Date",
            "gap": 2,
            "max_delay": 10
        } for p in next_batch
    ])
    assert all([
        p.parameters['Mock Classifier'] == {
            'dummy_int_parameter': 6,
            'dummy_categorical_parameter': "random",
            'dummy_real_parameter': 0.1,
            "n_jobs": -1
        } for p in next_batch
    ])

    scores = np.arange(0, len(next_batch))
    for score, pipeline in zip(scores, next_batch):
        algo.add_result(score, pipeline, {"id": algo.pipeline_number})

    # Make sure future batches keep the frozen parameter values.
    for i in range(1, 5):
        next_batch = algo.next_batch()
        assert all([
            p.parameters['Mock Classifier'] == {
                'dummy_int_parameter': 6,
                'dummy_categorical_parameter': "random",
                'dummy_real_parameter': 0.1,
                "n_jobs": -1
            } for p in next_batch
        ])
Example #20
def decision_tree_classification_pipeline_class(X_y_categorical_classification):
    pipeline = BinaryClassificationPipeline(['Simple Imputer', 'One Hot Encoder', 'Standard Scaler', 'Decision Tree Classifier'])
    X, y = X_y_categorical_classification
    pipeline.fit(X, y)
    return pipeline
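This reads like a pytest fixture: it consumes the X_y_categorical_classification fixture and returns a fitted pipeline. A hedged sketch of a test consuming it, assuming the function is registered with @pytest.fixture in a conftest:

def test_decision_tree_pipeline_predicts(decision_tree_classification_pipeline_class,
                                          X_y_categorical_classification):
    X, _ = X_y_categorical_classification
    pipeline = decision_tree_classification_pipeline_class
    # Fitted pipelines return woodwork output; .to_series() mirrors usage elsewhere on this page.
    assert not pipeline.predict(X).to_series().isnull().any()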
Example #21
def test_stacked_estimator_in_pipeline(
        problem_type, X_y_binary, X_y_multi, X_y_regression,
        stackable_classifiers, stackable_regressors,
        logistic_regression_binary_pipeline_class,
        logistic_regression_multiclass_pipeline_class,
        linear_regression_pipeline_class):
    if problem_type == ProblemTypes.BINARY:
        X, y = X_y_binary
        base_pipeline_class = BinaryClassificationPipeline
        stacking_component_name = StackedEnsembleClassifier.name
        input_pipelines = [
            BinaryClassificationPipeline([classifier])
            for classifier in stackable_classifiers
        ]
        comparison_pipeline = logistic_regression_binary_pipeline_class(
            parameters={"Logistic Regression Classifier": {
                "n_jobs": 1
            }})
        objective = 'Log Loss Binary'
    elif problem_type == ProblemTypes.MULTICLASS:
        X, y = X_y_multi
        base_pipeline_class = MulticlassClassificationPipeline
        stacking_component_name = StackedEnsembleClassifier.name
        input_pipelines = [
            MulticlassClassificationPipeline([classifier])
            for classifier in stackable_classifiers
        ]
        comparison_pipeline = logistic_regression_multiclass_pipeline_class(
            parameters={"Logistic Regression Classifier": {
                "n_jobs": 1
            }})
        objective = 'Log Loss Multiclass'
    elif problem_type == ProblemTypes.REGRESSION:
        X, y = X_y_regression
        base_pipeline_class = RegressionPipeline
        stacking_component_name = StackedEnsembleRegressor.name
        input_pipelines = [
            RegressionPipeline([regressor])
            for regressor in stackable_regressors
        ]
        comparison_pipeline = linear_regression_pipeline_class(
            parameters={"Linear Regressor": {
                "n_jobs": 1
            }})
        objective = 'R2'
    parameters = {
        stacking_component_name: {
            "input_pipelines": input_pipelines,
            "n_jobs": 1
        }
    }
    graph = ['Simple Imputer', stacking_component_name]

    pipeline = base_pipeline_class(component_graph=graph,
                                   parameters=parameters)
    pipeline.fit(X, y)
    comparison_pipeline.fit(X, y)
    assert not np.isnan(pipeline.predict(X).to_series()).values.any()

    pipeline_score = pipeline.score(X, y, [objective])[objective]
    comparison_pipeline_score = comparison_pipeline.score(
        X, y, [objective])[objective]

    if problem_type == ProblemTypes.BINARY or problem_type == ProblemTypes.MULTICLASS:
        assert not np.isnan(
            pipeline.predict_proba(X).to_dataframe()).values.any()
        assert (pipeline_score <= comparison_pipeline_score)
    else:
        assert (pipeline_score >= comparison_pipeline_score)
Example #22
def test_binary_init():
    clf = BinaryClassificationPipeline(component_graph=[
        "Imputer", "One Hot Encoder", "Random Forest Classifier"
    ])
    assert clf.parameters == {
        'Imputer': {
            'categorical_impute_strategy': 'most_frequent',
            'numeric_impute_strategy': 'mean',
            'categorical_fill_value': None,
            'numeric_fill_value': None
        },
        'One Hot Encoder': {
            'top_n': 10,
            'features_to_encode': None,
            'categories': None,
            'drop': 'if_binary',
            'handle_unknown': 'ignore',
            'handle_missing': 'error'
        },
        'Random Forest Classifier': {
            'n_estimators': 100,
            'max_depth': 6,
            'n_jobs': -1
        }
    }
    assert clf.custom_hyperparameters is None
    assert clf.name == "Random Forest Classifier w/ Imputer + One Hot Encoder"
    assert clf.random_seed == 0
    custom_hyperparameters = {
        "Imputer": {
            "numeric_impute_strategy": Categorical(["most_frequent", 'mean'])
        },
        "Imputer_1": {
            "numeric_impute_strategy": Categorical(["median", 'mean'])
        },
        "Random Forest Classifier": {
            "n_estimators": Categorical([50, 100])
        }
    }
    parameters = {"One Hot Encoder": {"top_n": 20}}
    clf = BinaryClassificationPipeline(
        component_graph=[
            "Imputer", "One Hot Encoder", "Random Forest Classifier"
        ],
        parameters=parameters,
        custom_hyperparameters=custom_hyperparameters,
        custom_name="Custom Pipeline",
        random_seed=42)

    assert clf.parameters == {
        'Imputer': {
            'categorical_impute_strategy': 'most_frequent',
            'numeric_impute_strategy': 'mean',
            'categorical_fill_value': None,
            'numeric_fill_value': None
        },
        'One Hot Encoder': {
            'top_n': 20,
            'features_to_encode': None,
            'categories': None,
            'drop': 'if_binary',
            'handle_unknown': 'ignore',
            'handle_missing': 'error'
        },
        'Random Forest Classifier': {
            'n_estimators': 100,
            'max_depth': 6,
            'n_jobs': -1
        }
    }
    assert clf.custom_hyperparameters == custom_hyperparameters
    assert clf.name == "Custom Pipeline"
    assert clf.random_seed == 42
Example #23
def test_stacked_ensemble_nonstackable_model_families():
    with pytest.raises(ValueError, match="Pipelines with any of the following model families cannot be used as base pipelines"):
        StackedEnsembleClassifier(input_pipelines=[BinaryClassificationPipeline([BaselineClassifier])])
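For contrast, a minimal sketch of an ensemble built from a stackable (non-baseline) pipeline; RandomForestClassifier and n_jobs=1 are reused from the other examples on this page:

input_pipelines = [BinaryClassificationPipeline([RandomForestClassifier])]
ensemble = StackedEnsembleClassifier(input_pipelines=input_pipelines, n_jobs=1)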