예제 #1
0
def test_make_pipeline_no_nulls(input_type, problem_type):
    """Check the component graph ``make_pipeline`` builds for null-free data.

    The input frame has one numeric, one categorical and one datetime column,
    none of which contain missing values. For every estimator returned by
    ``get_estimators(problem_type)`` that supports ``problem_type``, the
    generated pipeline must be an instance of the base class for the problem
    type, have no custom hyperparameters, and expose the expected
    ``component_graph``.

    Args:
        input_type: When equal to ``'ww'`` the features and target are wrapped
            in Woodwork structures; otherwise pandas objects are passed as-is.
        problem_type: Problem type under test.
    """
    X = pd.DataFrame({
        "numerical": [1, 2, 3, 1, 2],
        "categorical": ["a", "b", "a", "c", "c"],
        "some dates":
        pd.date_range('2000-02-03', periods=5, freq='W')
    })
    # Choose the target *before* the Woodwork conversion so the multiclass
    # case is wrapped as well, and keep it the same length as X (5 rows).
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2, 0])
    else:
        y = pd.Series([0, 1, 1, 0, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    time_series = is_time_series(problem_type)

    for estimator_class in estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        # Time series problems get date-index/gap/delay settings; the dicts
        # are rebuilt on every iteration so each call receives fresh objects.
        parameters = {}
        if time_series:
            parameters = {
                "pipeline": {
                    "date_index": "some dates",
                    "gap": 1,
                    "max_delay": 1
                },
                "Time Series Baseline Estimator": {
                    "date_index": "some dates",
                    "gap": 1,
                    "max_delay": 1
                }
            }

        pipeline = make_pipeline(X, y, estimator_class, problem_type,
                                 parameters)
        assert isinstance(pipeline, pipeline_class)
        assert pipeline.custom_hyperparameters is None

        delayed_features = [DelayedFeatureTransformer] if time_series else []

        # Trailing components depend on the estimator's model family.
        model_family = estimator_class.model_family
        if model_family == ModelFamily.LINEAR_MODEL:
            estimator_components = [
                OneHotEncoder, StandardScaler, estimator_class
            ]
        elif model_family == ModelFamily.CATBOOST:
            estimator_components = [estimator_class]
        else:
            estimator_components = [OneHotEncoder, estimator_class]

        # ARIMA pipelines are expected to skip the datetime featurizer and the
        # delayed-feature transformer.
        if model_family == ModelFamily.ARIMA:
            expected_graph = [Imputer] + estimator_components
        else:
            expected_graph = ([Imputer, DateTimeFeaturizer] +
                              delayed_features + estimator_components)
        assert pipeline.component_graph == expected_graph
예제 #2
0
def test_shap(estimator, problem_type, n_points_to_explain, X_y_binary, X_y_multi, X_y_regression,
              helper_functions):
    """Check the structure of the values returned by ``calculate_shap_for_test``.

    For binary and multiclass problems the result must be a list with one dict
    per class; for regression it must be a single dict. Every dict must have
    ``N_FEATURES`` entries, each a list of length ``n_points_to_explain``.

    Args:
        estimator: Estimator class under test; the test is skipped when it
            does not support ``problem_type``.
        problem_type: Problem type under test.
        n_points_to_explain: Number of data points SHAP values are computed for.
        X_y_binary: ``(X, y)`` pair used for binary problems.
        X_y_multi: ``(X, y)`` pair used for multiclass problems.
        X_y_regression: ``(X, y)`` pair used for every other problem type.
        helper_functions: Fixture not used in the body; kept for interface
            compatibility.
    """
    if problem_type not in estimator.supported_problem_types:
        pytest.skip("Skipping because estimator and pipeline are not compatible.")

    # Select the data set and, for classification, the expected class count.
    if problem_type == ProblemTypes.BINARY:
        training_data, y = X_y_binary
        expected_n_classes = N_CLASSES_BINARY
    elif problem_type == ProblemTypes.MULTICLASS:
        training_data, y = X_y_multi
        expected_n_classes = N_CLASSES_MULTICLASS
    else:
        training_data, y = X_y_regression
        expected_n_classes = None

    # First try to pin the estimator to a single job; on ValueError fall back
    # to default parameters (presumably the estimator has no ``n_jobs``
    # parameter in that case -- TODO confirm).
    try:
        pipeline = make_pipeline(training_data, y, estimator, problem_type, parameters={estimator.name: {'n_jobs': 1}})
    except ValueError:
        pipeline = make_pipeline(training_data, y, estimator, problem_type)

    shap_values = calculate_shap_for_test(training_data, y, pipeline, n_points_to_explain)

    if problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
        assert isinstance(shap_values, list), "For classification, returned values must be a list!"
        assert all(isinstance(class_values, dict) for class_values in shap_values), "Not all list elements are dicts!"
        assert len(shap_values) == expected_n_classes, "A dictionary should be returned for each class!"
        assert all(
            len(values) == N_FEATURES for values in shap_values), "A SHAP value must be computed for every feature!"
        for class_values in shap_values:
            assert all(isinstance(feature, list) for feature in
                       class_values.values()), "Every value in the dict must be a list!"
            assert all(len(v) == n_points_to_explain for v in
                       class_values.values()), "A SHAP value must be computed for every data point to explain!"
    elif problem_type == ProblemTypes.REGRESSION:
        assert isinstance(shap_values, dict), "For regression, returned values must be a dictionary!"
        assert len(shap_values) == N_FEATURES, "A SHAP value should be computed for every feature!"
        assert all(isinstance(feature, list) for feature in shap_values.values()), "Every value in the dict must be a list!"
        assert all(len(v) == n_points_to_explain for v in
                   shap_values.values()), "A SHAP value must be computed for every data point to explain!"
예제 #3
0
def test_automl_pickle_generated_pipeline(mock_regression_fit,
                                          mock_regression_score,
                                          X_y_regression):
    """Check that every pipeline in the AutoML rankings can be pickled.

    The search is run over one generated pipeline per regression estimator
    plus a hand-written ``RegressionPipelineCustom``. Every ranked pipeline
    must have ``GeneratedPipelineRegression`` as its class and survive a
    pickle round trip. The custom pipeline must appear in the rankings with
    its hyperparameters and component graph intact.

    Args:
        mock_regression_fit: Fixture not referenced in the body.
        mock_regression_score: Mock whose return value is set to a fixed score.
        X_y_regression: ``(X, y)`` pair used as training data.
    """
    mock_regression_score.return_value = {"R2": 1.0}

    class RegressionPipelineCustom(RegressionPipeline):
        custom_name = "Custom Regression Name"
        component_graph = ["Imputer", "Linear Regressor"]
        custom_hyperparameters = {
            "Imputer": {"numeric_impute_strategy": "most_frequent"}
        }

    X, y = X_y_regression
    expected_class = GeneratedPipelineRegression

    # One generated pipeline per regression estimator, then the custom one.
    candidate_pipelines = []
    for estimator in get_estimators('regression'):
        generated = make_pipeline(X, y, estimator, problem_type='regression')
        candidate_pipelines.append(generated)
    candidate_pipelines.append(RegressionPipelineCustom)

    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='regression',
        allowed_pipelines=candidate_pipelines,
    )
    automl.search()
    automl.add_to_rankings(RegressionPipelineCustom({}))

    custom_found = False
    for _, ranking in automl.rankings.iterrows():
        ranked_pipeline = automl.get_pipeline(ranking['id'])
        assert ranked_pipeline.__class__ == expected_class

        # Round-trip through pickle; the restored object must be truthy.
        serialized = pickle.dumps(ranked_pipeline)
        assert pickle.loads(serialized)

        if ranked_pipeline.custom_name != RegressionPipelineCustom.custom_name:
            continue
        custom_found = True
        assert ranked_pipeline.custom_hyperparameters == RegressionPipelineCustom.custom_hyperparameters
        assert ranked_pipeline.component_graph == RegressionPipelineCustom.component_graph
    assert custom_found