Example #1
    def test_submit_scoring_job_single(self):
        """ Test that scoring a single pipeline using the parallel engine produces the
        same results as simply running the score_pipeline function. """
        X, y = self.X_y_binary
        pipeline = TestLRCPipeline({"Logistic Regression Classifier": {"n_jobs": 1}})
        engine = DaskEngine(client=self.client)
        objectives = [automl_data.objective]

        pipeline_future = engine.submit_training_job(X=ww.DataTable(X),
                                                     y=ww.DataColumn(y),
                                                     automl_config=automl_data,
                                                     pipeline=pipeline)
        pipeline = pipeline_future.get_result()
        pipeline_score_future = engine.submit_scoring_job(
            X=ww.DataTable(X),
            y=ww.DataColumn(y),
            automl_config=automl_data,
            pipeline=pipeline,
            objectives=objectives)
        assert isinstance(pipeline_score_future, DaskComputation)
        pipeline_score = pipeline_score_future.get_result()

        original_pipeline_score = pipeline.score(X=X,
                                                 y=y,
                                                 objectives=objectives)

        assert not np.isnan(pipeline_score["Log Loss Binary"])
        assert pipeline_score == original_pipeline_score
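
For context, self.client above is a test fixture wrapping a dask.distributed.Client, and the futures returned by the engine are DaskComputation objects exposing get_result(). A minimal sketch of standing the engine up by hand (the DaskEngine import path is an assumption; check your evalml version):

from dask.distributed import Client

from evalml.automl.engine import DaskEngine  # assumed import path

client = Client(processes=False)  # in-process scheduler; enough for local experiments
engine = DaskEngine(client=client)
# ... submit_training_job / submit_scoring_job / get_result as in the test above ...
client.close()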
Example #2
def test_infer_feature_types():
    X_dt = ww.DataTable(pd.DataFrame([[1, 2], [3, 4]]))
    pd.testing.assert_frame_equal(X_dt.to_dataframe(), infer_feature_types(X_dt).to_dataframe())

    X_dc = ww.DataColumn(pd.Series([1, 2, 3, 4]))
    pd.testing.assert_series_equal(X_dc.to_series(), infer_feature_types(X_dc).to_series())

    X_pd = pd.DataFrame({0: pd.Series([1, 2], dtype="Int64"),
                         1: pd.Series([3, 4], dtype="Int64")})
    pd.testing.assert_frame_equal(X_pd, infer_feature_types(X_pd).to_dataframe())

    X_pd = pd.Series([1, 2, 3, 4], dtype="Int64")
    pd.testing.assert_series_equal(X_pd, infer_feature_types(X_pd).to_series())

    X_list = [1, 2, 3, 4]
    X_expected = ww.DataColumn(pd.Series(X_list))
    pd.testing.assert_series_equal(X_expected.to_series(), infer_feature_types(X_list).to_series())
    assert X_list == [1, 2, 3, 4]

    X_np = np.array([1, 2, 3, 4])
    X_expected = ww.DataColumn(pd.Series(X_np))
    pd.testing.assert_series_equal(X_expected.to_series(), infer_feature_types(X_np).to_series())
    assert np.array_equal(X_np, np.array([1, 2, 3, 4]))

    X_np = np.array([[1, 2], [3, 4]])
    X_expected = ww.DataTable(pd.DataFrame(X_np))
    pd.testing.assert_frame_equal(X_expected.to_dataframe(), infer_feature_types(X_np).to_dataframe())
    assert np.array_equal(X_np, np.array([[1, 2], [3, 4]]))
Example #3
def test_class_imbalance_nonnumeric_balanced(input_type):
    X = pd.DataFrame()
    y_bools_balanced = pd.Series([True, True, True, False, False])
    y_binary_balanced = pd.Series(["No", "Yes", "No", "Yes"])
    y_multiclass_balanced = pd.Series([
        "red", "green", "red", "red", "blue", "green", "red", "blue", "green",
        "red"
    ])
    if input_type == "ww":
        X = ww.DataTable(X)
        y_bools_balanced = ww.DataColumn(y_bools_balanced)
        y_binary_balanced = ww.DataColumn(y_binary_balanced)
        y_multiclass_balanced = ww.DataColumn(y_multiclass_balanced)

    class_imbalance_check = ClassImbalanceDataCheck(num_cv_folds=1)
    assert class_imbalance_check.validate(X, y_multiclass_balanced) == {
        "warnings": [],
        "errors": [],
        "actions": []
    }
    assert class_imbalance_check.validate(X, y_binary_balanced) == {
        "warnings": [],
        "errors": [],
        "actions": []
    }
    assert class_imbalance_check.validate(X, y_bools_balanced) == {
        "warnings": [],
        "errors": [],
        "actions": []
    }
Example #4
def test_default_data_checks_regression(input_type):
    X = pd.DataFrame({
        'lots_of_null': [None, None, None, None, "some data"],
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
        'no_null': [1, 2, 3, 5, 5],
        'id': [0, 1, 2, 3, 4],
        'has_label_leakage': [100, 200, 100, 200, 100]
    })
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)

    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_no_variance = ww.DataColumn(y_no_variance)
    null_leakage = [DataCheckWarning(message="Column 'lots_of_null' is 95.0% or more correlated with the target",
                                     data_check_name="TargetLeakageDataCheck",
                                     message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                     details={"column": "lots_of_null"}).to_dict()]
    data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression"))
    assert data_checks.validate(X, y) == {"warnings": messages[:3],
                                          "errors": messages[3:]}

    # Skip Invalid Target
    assert data_checks.validate(X, y_no_variance) == {
        "warnings": messages[:3] + null_leakage,
        "errors": messages[4:] + [DataCheckError(message="Y has 1 unique value.",
                                                 data_check_name="NoVarianceDataCheck",
                                                 message_code=DataCheckMessageCode.NO_VARIANCE,
                                                 details={"column": "Y"}).to_dict()]
    }

    data_checks = DataChecks(DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
                             {"InvalidTargetDataCheck": {"problem_type": "regression",
                                                         "objective": get_default_primary_search_objective("regression")}})
    assert data_checks.validate(X, y) == {"warnings": messages[:3],
                                          "errors": messages[3:]}
Example #5
def test_default_data_checks_regression(input_type):
    X = pd.DataFrame({'lots_of_null': [None, None, None, None, "some data"],
                      'all_null': [None, None, None, None, None],
                      'also_all_null': [None, None, None, None, None],
                      'no_null': [1, 2, 3, 5, 5],
                      'id': [0, 1, 2, 3, 4],
                      'has_label_leakage': [100, 200, 100, 200, 100],
                      'natural_language_nan': [None,
                                               "string_that_is_long_enough_for_natural_language_1",
                                               "string_that_is_long_enough_for_natural_language_2",
                                               "string_that_is_long_enough_for_natural_language_3",
                                               "string_that_is_long_enough_for_natural_language_4"],
                      'nan_dt_col': pd.Series(pd.date_range('20200101', periods=5))})
    X.loc[0, 'nan_dt_col'] = None  # avoid chained assignment
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)

    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_no_variance = ww.DataColumn(y_no_variance)
    null_leakage = [DataCheckWarning(message="Column 'lots_of_null' is 95.0% or more correlated with the target",
                                     data_check_name="TargetLeakageDataCheck",
                                     message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                     details={"column": "lots_of_null"}).to_dict()]
    data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression"))
    id_leakage_warning = [DataCheckWarning(message="Column 'id' is 95.0% or more correlated with the target",
                                           data_check_name="TargetLeakageDataCheck",
                                           message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                           details={"column": "id"}).to_dict()]
    nan_dt_leakage_warning = [DataCheckWarning(message="Column 'nan_dt_col' is 95.0% or more correlated with the target",
                                               data_check_name="TargetLeakageDataCheck",
                                               message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                               details={"column": "nan_dt_col"}).to_dict()]

    impute_action = DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, 'impute_strategy': 'mean'}).to_dict()
    nan_dt_action = DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'nan_dt_col'}).to_dict()
    expected_actions_with_drop_and_impute = expected_actions[:3] + [nan_dt_action, impute_action] + expected_actions[4:]
    assert data_checks.validate(X, y) == {"warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
                                          "errors": messages[3:],
                                          "actions": expected_actions_with_drop_and_impute}

    # Skip Invalid Target
    assert data_checks.validate(X, y_no_variance) == {
        "warnings": messages[:3] + null_leakage,
        "errors": messages[4:7] + [DataCheckError(message="Y has 1 unique value.",
                                                  data_check_name="NoVarianceDataCheck",
                                                  message_code=DataCheckMessageCode.NO_VARIANCE,
                                                  details={"column": "Y"}).to_dict()] + messages[7:],
        "actions": expected_actions[:3] + expected_actions[4:]
    }

    data_checks = DataChecks(DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
                             {"InvalidTargetDataCheck": {"problem_type": "regression",
                                                         "objective": get_default_primary_search_objective("regression")}})
    assert data_checks.validate(X, y) == {"warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
                                          "errors": messages[3:],
                                          "actions": expected_actions_with_drop_and_impute}
Example #6
def score_pipelines(pipelines, engine):
    futures = []
    for pipeline in pipelines:
        futures.append(engine.submit_training_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                  automl_config=automl_data, pipeline=pipeline))
    pipelines = [f.get_result() for f in futures]
    futures = []
    for pipeline in pipelines:
        futures.append(engine.submit_scoring_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                 automl_config=automl_data, pipeline=pipeline,
                                                 objectives=[automl_data.objective]))
    results = [f.get_result() for f in futures]
    return results
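
The helper submits every training job before gathering any result, so the cluster fits all pipelines concurrently, and scoring is then batched the same way. A hedged usage sketch, reusing the client, automl_data and TestLRCPipeline assumed in Example #1:

engine = DaskEngine(client=client)
pipelines = [TestLRCPipeline({"Logistic Regression Classifier": {"n_jobs": 1}})
             for _ in range(3)]
scores = score_pipelines(pipelines, engine)
# each entry maps objective names to values, e.g. {"Log Loss Binary": ...}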
Example #7
def test_compute_final_component_features_nonlinear(mock_en_predict, mock_rf_predict, mock_ohe, mock_imputer, example_graph, X_y_binary):
    X, y = X_y_binary
    mock_imputer.return_value = ww.DataTable(pd.DataFrame(X))
    mock_ohe.return_value = ww.DataTable(pd.DataFrame(X))
    mock_en_predict.return_value = ww.DataColumn(pd.Series(np.ones(X.shape[0])))
    mock_rf_predict.return_value = ww.DataColumn(pd.Series(np.zeros(X.shape[0])))
    X_expected = pd.DataFrame({'Random Forest': np.zeros(X.shape[0]), 'Elastic Net': np.ones(X.shape[0])})
    component_graph = ComponentGraph(example_graph).instantiate({})
    component_graph.fit(X, y)

    X_t = component_graph.compute_final_component_features(X)
    assert_frame_equal(X_expected, X_t.to_dataframe())
    assert mock_imputer.call_count == 2
    assert mock_ohe.call_count == 4
Example #8
def test_make_pipeline_text_columns(input_type, problem_type):
    X = pd.DataFrame({"numerical": [1, 2, 3, 1, 2],
                      "categorical": ["a", "b", "a", "c", "c"],
                      "text": ["string one", "another", "text for a column, this should be a text column!!", "text string", "hello world"]})
    y = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)

    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [OneHotEncoder, StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [OneHotEncoder, estimator_class]
            assert pipeline.component_graph == [Imputer, TextFeaturizer] + delayed_features + estimator_components
Example #9
def __getitem__(self, key):
    selection = self.underlying_data.iloc[key]
    if isinstance(selection, pd.Series) or (ks and isinstance(selection, ks.Series)):
        col_name = selection.name
        if isinstance(self.ww_data, ww.DataTable) and set(selection.index.values) == set(self.ww_data.columns):
            # selection is a single row, so return it as a plain series
            return selection
        if isinstance(self.ww_data, ww.DataTable):
            logical_type = self.ww_data.logical_types.get(col_name, None)
            semantic_tags = self.ww_data.semantic_tags.get(col_name, None)
        else:
            logical_type = self.ww_data.logical_type or None
            semantic_tags = self.ww_data.semantic_tags or None
        if semantic_tags is not None:
            semantic_tags = semantic_tags - {'index'} - {'time_index'}
        name = self.ww_data.name
        return ww.DataColumn(selection,
                             logical_type=logical_type,
                             semantic_tags=semantic_tags,
                             use_standard_tags=self.ww_data.use_standard_tags,
                             name=name)
    elif isinstance(selection, pd.DataFrame) or (ks and isinstance(selection, ks.DataFrame)):
        return _new_dt_including(self.ww_data, selection)
    else:
        # singular value
        return selection
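
A hedged sketch of what the indexer above returns, assuming woodwork's legacy (pre-1.0) DataTable API:

import pandas as pd
import woodwork as ww

dt = ww.DataTable(pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}))
row = dt.iloc[0]     # full row: its index matches the table's columns, so a plain pd.Series is returned
col = dt.iloc[:, 0]  # single column: rewrapped as a DataColumn, keeping logical type and semantic tags
sub = dt.iloc[0:1]   # frame selection: rebuilt into a new DataTable via _new_dt_including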
Example #10
def test_make_pipeline_only_text_columns(input_type, problem_type):
    X = pd.DataFrame({"text": ["string one", "the evalml team is full of wonderful people", "text for a column, this should be a text column!!", "text string", "hello world"],
                      "another text": ["ladidididididida", "cats are great", "text for a column, this should be a text column!!", "text string", "goodbye world"]})
    y = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)

    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            delayed_features = []
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            standard_scaler = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                standard_scaler = [StandardScaler]
            assert pipeline.component_graph == [TextFeaturizer] + delayed_features + standard_scaler + [estimator_class]
Example #11
def test_make_pipeline_all_nan_no_categoricals(input_type, problem_type):
    # testing that all_null column is not considered categorical
    X = pd.DataFrame({"all_null": [np.nan, np.nan, np.nan, np.nan, np.nan],
                      "num": [1, 2, 3, 4, 5]})
    y = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [estimator_class]
            assert pipeline.component_graph == [DropNullColumns, Imputer] + delayed_features + estimator_components
Example #12
    def _imbalanced_data_X_y(problem_type, categorical_columns, size):
        """"Generates a dummy classification dataset with particular amounts of class imbalance and categorical input columns.
        For our targets, we maintain a 1:5, or 0.2, class ratio of minority : majority.
        We only generate minimum amount for X to set the logical_types, so the length of X and y will be different.

        Arguments:
            problem_type (str): Either 'binary' or 'multiclass'
            categorical_columns (str): Determines how many categorical cols to use. Either 'all', 'some', or 'none'.
            size (str): Either 'large' or 'small'. 'large' returns a dataset of size 21,000, while 'small' returns a size of 4200
        """
        multiplier = 5 if size == 'large' else 1
        col_names = [f"col_{i}" for i in range(100)]
        # generate X to be all int values
        X_dict = {col_name: [i % (j + 1) for i in range(1, 100)] for j, col_name in enumerate(col_names)}
        X = pd.DataFrame(X_dict)
        if categorical_columns == 'all':
            X_ww = ww.DataTable(X, logical_types={col_name: "Categorical" for col_name in col_names})
        elif categorical_columns == 'some':
            X_ww = ww.DataTable(X, logical_types={col_name: "Categorical" for col_name in col_names[: len(col_names) // 2]})
        else:
            X_ww = ww.DataTable(X)
        if problem_type == 'binary':
            targets = [0] * 3500 + [1] * 700
        else:
            targets = [0] * 3000 + [1] * 600 + [2] * 600
        targets *= multiplier
        y_ww = ww.DataColumn(pd.Series(targets))
        return X_ww, y_ww
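
A quick sketch exercising the invariants the docstring promises (calling the helper directly like this is hypothetical; in the test suite it is provided through a pytest fixture):

X_ww, y_ww = _imbalanced_data_X_y('binary', 'some', 'large')
targets = y_ww.to_series()
assert len(targets) == 21000                             # 4200 rows * multiplier of 5
assert (targets == 1).sum() * 5 == (targets == 0).sum()  # 1:5 minority:majority ratio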
Example #13
def test_delay_feature_transformer_supports_custom_index(encode_X_as_str, encode_y_as_str, use_woodwork,
                                                         delayed_features_data):
    X, y = delayed_features_data

    X, X_answer, y, y_answer = encode_X_y_as_strings(X, y, encode_X_as_str, encode_y_as_str)

    X.index = pd.RangeIndex(50, 81)
    X_answer.index = pd.RangeIndex(50, 81)
    y.index = pd.RangeIndex(50, 81)
    y_answer.index = pd.RangeIndex(50, 81)

    answer = pd.DataFrame({"feature": X.feature,
                           "feature_delay_1": X_answer.feature.shift(1),
                           "feature_delay_2": X_answer.feature.shift(2),
                           "feature_delay_3": X_answer.feature.shift(3),
                           "target_delay_0": y_answer,
                           "target_delay_1": y_answer.shift(1),
                           "target_delay_2": y_answer.shift(2),
                           "target_delay_3": y_answer.shift(3)}, index=pd.RangeIndex(50, 81))

    if use_woodwork:
        X = ww.DataTable(X)
        y = ww.DataColumn(y)

    pd.testing.assert_frame_equal(DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X, y), answer)

    answer_only_y = pd.DataFrame({"target_delay_0": y_answer,
                                  "target_delay_1": y_answer.shift(1),
                                  "target_delay_2": y_answer.shift(2),
                                  "target_delay_3": y_answer.shift(3)}, index=pd.RangeIndex(50, 81))
    pd.testing.assert_frame_equal(DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X=None, y=y),
                                  answer_only_y)
Example #14
def test_target_imputer_woodwork_custom_overrides_returned_by_components(
        y_pd, has_nan, impute_strategy):
    y_to_use = y_pd.copy()
    if has_nan:
        y_to_use[len(y_pd) - 1] = np.nan
    override_types = [Integer, Double, Categorical, Boolean]
    for logical_type in override_types:
        try:
            y = ww.DataColumn(y_to_use.copy(), logical_type=logical_type)
        except TypeError:
            continue

        impute_strategy_to_use = impute_strategy
        if logical_type in [Categorical, NaturalLanguage]:
            impute_strategy_to_use = "most_frequent"

        imputer = TargetImputer(impute_strategy=impute_strategy_to_use)
        imputer.fit(None, y)
        _, y_t = imputer.transform(None, y)
        assert isinstance(y_t, ww.DataColumn)

        if impute_strategy_to_use == "most_frequent" or not has_nan:
            assert y_t.logical_type == logical_type
        else:
            assert y_t.logical_type == Double
Example #15
def test_make_pipeline_no_column_names(input_type, problem_type):
    X = pd.DataFrame([[1, "a", np.nan], [2, "b", np.nan], [5, "b", np.nan]])
    y = pd.Series([0, 0, 1])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [OneHotEncoder, StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [OneHotEncoder, estimator_class]
            assert pipeline.component_graph == [DropNullColumns, Imputer] + delayed_features + estimator_components
Example #16
def eval_pipelines(pipelines, engine):
    futures = []
    for pipeline in pipelines:
        futures.append(engine.submit_evaluation_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                    automl_config=automl_data, pipeline=pipeline))
    results = [f.get_result() for f in futures]
    return results
Example #17
def test_explain_predictions_best_worst_custom_metric(mock_make_table,
                                                      output_format, answer):

    mock_make_table.return_value = "table goes here" if output_format == "text" else {
        "explanations": ["explanation_dictionary_goes_here"]
    }
    pipeline = MagicMock()
    pipeline.parameters = "Parameters go here"
    input_features = pd.DataFrame({"a": [5, 6]})
    pipeline.problem_type = ProblemTypes.REGRESSION
    pipeline.name = "Test Pipeline Name"
    pipeline.compute_estimator_features.return_value = ww.DataTable(
        input_features)

    pipeline.predict.return_value = ww.DataColumn(pd.Series([2, 1]))
    y_true = pd.Series([3, 2])

    def sum(y_true, y_pred):
        return y_pred + y_true

    best_worst_report = explain_predictions_best_worst(
        pipeline,
        input_features,
        y_true=y_true,
        num_to_explain=1,
        metric=sum,
        output_format=output_format)

    if output_format == "text":
        compare_two_tables(best_worst_report.splitlines(),
                           regression_custom_metric_answer.splitlines())
    else:
        assert best_worst_report == answer
Example #18
def test_make_pipeline_datetime_no_categorical(input_type, problem_type):
    X = pd.DataFrame({"numerical": [1, 2, 3, 1, 2],
                      "some dates": pd.date_range('2000-02-03', periods=5, freq='W')})
    y = pd.Series([0, 1, 1, 0, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [estimator_class]
            assert pipeline.component_graph == [Imputer, DateTimeFeaturizer] + delayed_features + estimator_components
Example #19
def test_predict_repeat_estimator(mock_predict, mock_fit, X_y_binary):
    X, y = X_y_binary
    mock_predict.return_value = ww.DataColumn(pd.Series(y))

    graph = {
        'Imputer': [Imputer],
        'OneHot_RandomForest': [OneHotEncoder, 'Imputer.x'],
        'OneHot_Logistic': [OneHotEncoder, 'Imputer.x'],
        'Random Forest': [RandomForestClassifier, 'OneHot_RandomForest.x'],
        'Logistic Regression': [LogisticRegressionClassifier, 'OneHot_Logistic.x'],
        'Final Estimator': [LogisticRegressionClassifier, 'Random Forest', 'Logistic Regression']
    }
    component_graph = ComponentGraph(graph)
    component_graph.instantiate({})
    component_graph.fit(X, y)

    assert component_graph.get_component('Logistic Regression')._component_obj != \
        component_graph.get_component('Final Estimator')._component_obj

    component_graph.predict(X)
    assert mock_predict.call_count == 5
    assert mock_fit.call_count == 3
Example #20
def test_partial_dependence_multiclass_categorical(class_label,
                                                   logistic_regression_multiclass_pipeline_class):
    pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed')

    X, y = load_wine()
    X['categorical_column'] = ww.DataColumn(pd.Series([i % 3 for i in range(X.shape[0])]).astype(str),
                                            logical_type="Categorical")
    X['categorical_column_2'] = ww.DataColumn(pd.Series([i % 6 for i in range(X.shape[0])]).astype(str),
                                              logical_type="Categorical")

    pipeline = logistic_regression_multiclass_pipeline_class({"Logistic Regression Classifier": {"n_jobs": 1}})

    pipeline.fit(X, y)

    fig = graph_partial_dependence(pipeline, X, features='categorical_column', class_label=class_label,
                                   grid_resolution=5)

    for i, plot_data in enumerate(fig.to_dict()['data']):
        assert plot_data['type'] == 'bar'
        assert plot_data['x'].tolist() == ['0', '1', '2']
        if class_label is None:
            assert plot_data['name'] == f'class_{i}'
        else:
            assert plot_data['name'] == class_label

    fig = graph_partial_dependence(pipeline, X, features=('alcohol', 'categorical_column'), class_label=class_label,
                                   grid_resolution=5)

    for i, plot_data in enumerate(fig.to_dict()['data']):
        assert plot_data['type'] == 'contour'
        assert fig.to_dict()['layout']['yaxis']['ticktext'] == ['0', '1', '2']
        if class_label is None:
            assert plot_data['name'] == f'class_{i}'
        else:
            assert plot_data['name'] == class_label

    fig = graph_partial_dependence(pipeline, X, features=('categorical_column_2', 'categorical_column'),
                                   class_label=class_label, grid_resolution=5)

    for i, plot_data in enumerate(fig.to_dict()['data']):
        assert plot_data['type'] == 'contour'
        assert fig.to_dict()['layout']['xaxis']['ticktext'] == ['0', '1', '2']
        assert fig.to_dict()['layout']['yaxis']['ticktext'] == ['0', '1', '2', '3', '4', '5']
        if class_label is None:
            assert plot_data['name'] == f'class_{i}'
        else:
            assert plot_data['name'] == class_label
Example #21
def test_predict(mock_predict, mock_fit, example_graph, X_y_binary):
    X, y = X_y_binary
    mock_predict.return_value = ww.DataColumn(pd.Series(y))
    component_graph = ComponentGraph(example_graph).instantiate({})
    component_graph.fit(X, y)

    component_graph.predict(X)
    assert mock_predict.call_count == 5  # Called twice when fitting pipeline, thrice when predicting
    assert mock_fit.call_count == 3  # Only called during fit, not predict
Example #22
def test_classification_pipeline_encodes_targets(mock_encode, mock_decode,
                                                 mock_score, mock_predict,
                                                 mock_predict_proba, mock_fit,
                                                 pipeline_class, X_y_binary):
    X, y = X_y_binary
    y_series = pd.Series(y)
    mock_predict.return_value = ww.DataColumn(y_series)
    mock_predict_proba.return_value = ww.DataTable(
        pd.DataFrame({
            "negative": y_series,
            "positive": y_series
        }))
    X = pd.DataFrame({"feature": range(len(y))})
    y_encoded = y_series.map(lambda label: "positive" if label == 1 else "negative")

    mock_encode.return_value = y_series
    mock_decode.return_value = y_encoded

    class MyTsPipeline(pipeline_class):
        component_graph = [
            'Delayed Feature Transformer', 'Logistic Regression Classifier'
        ]

    pl = MyTsPipeline({
        "Delayed Feature Transformer": {
            "gap": 0,
            "max_delay": 1
        },
        "pipeline": {
            "gap": 0,
            "max_delay": 1
        }
    })

    # Check fit encodes target
    pl.fit(X, y_encoded)
    _, target_passed_to_estimator = mock_fit.call_args[0]

    # Check that target is converted to ints. Use .iloc[1:] because the first feature row has NaNs
    assert_series_equal(target_passed_to_estimator, y_series.iloc[1:])

    # Check predict encodes target
    mock_encode.reset_mock()
    pl.predict(X, y_encoded)
    mock_encode.assert_called_once()

    # Check predict proba encodes target
    mock_encode.reset_mock()
    pl.predict_proba(X, y_encoded)
    mock_encode.assert_called_once()

    # Check score encodes target
    mock_encode.reset_mock()
    pl.score(X, y_encoded, objectives=['MCC Binary'])
    mock_encode.assert_called_once()
Example #23
def test_make_pipeline_no_nulls(input_type, problem_type):
    X = pd.DataFrame({"numerical": [1, 2, 3, 1, 2],
                      "categorical": ["a", "b", "a", "c", "c"],
                      "some dates": pd.date_range('2000-02-03', periods=5, freq='W')})
    y = pd.Series([0, 1, 1, 0, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            parameters = {}
            if is_time_series(problem_type):
                parameters = {
                    "pipeline": {
                        "date_index": "some dates",
                        "gap": 1,
                        "max_delay": 1
                    },
                    "Time Series Baseline Estimator": {
                        "date_index": "some dates",
                        "gap": 1,
                        "max_delay": 1
                    }
                }

            pipeline = make_pipeline(X, y, estimator_class, problem_type,
                                     parameters)
            assert isinstance(pipeline, pipeline_class)
            assert pipeline.custom_hyperparameters is None
            delayed_features = []
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [
                    OneHotEncoder, StandardScaler, estimator_class
                ]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [OneHotEncoder, estimator_class]
            if estimator_class.model_family == ModelFamily.ARIMA:
                assert pipeline.component_graph == [Imputer] + estimator_components
            else:
                assert pipeline.component_graph == [Imputer, DateTimeFeaturizer] + delayed_features + estimator_components
Example #24
def test_empty_data_checks(input_type, X_y_binary):
    X, y = X_y_binary
    if input_type != "np":
        X = pd.DataFrame(X)
        y = pd.Series(y)
    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    data_checks = EmptyDataChecks()
    assert data_checks.validate(X, y) == {"warnings": [], "errors": []}
Example #25
def test_iLocIndexer_class_error(sample_df_dask, sample_series_dask):
    dt_dask = ww.DataTable(sample_df_dask)
    with pytest.raises(TypeError,
                       match="iloc is not supported for Dask DataTables"):
        _iLocIndexer(dt_dask)

    dc_dask = ww.DataColumn(sample_series_dask)
    with pytest.raises(TypeError,
                       match="iloc is not supported for Dask DataColumns"):
        _iLocIndexer(dc_dask)
Example #26
def test_class_imbalance_severe(min_samples, input_type):
    X = pd.DataFrame()
    # 0 will be < 10% of the data, but there will be 50 samples of it
    y_values_binary = pd.Series([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] * 50)
    y_values_multiclass = pd.Series([0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2] *
                                    50)
    if input_type == "ww":
        X = ww.DataTable(X)
        y_values_binary = ww.DataColumn(y_values_binary)
        y_values_multiclass = ww.DataColumn(y_values_multiclass)

    class_imbalance_check = ClassImbalanceDataCheck(min_samples=min_samples,
                                                    num_cv_folds=1)
    warnings = [DataCheckWarning(message="The following labels fall below 10% of the target: [0]",
                                 data_check_name=class_imbalance_data_check_name,
                                 message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD,
                                 details={"target_values": [0]}).to_dict()]
    if min_samples > 50:
        warnings.append(DataCheckWarning(message="The following labels in the target have severe class imbalance "
                                                 "because they fall under 10% of the target and have less than "
                                                 f"{min_samples} samples: [0]",
                                         data_check_name=class_imbalance_data_check_name,
                                         message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE,
                                         details={"target_values": [0]}).to_dict())
    assert class_imbalance_check.validate(X, y_values_binary) == {
        "warnings": warnings,
        "errors": [],
        "actions": []
    }

    assert class_imbalance_check.validate(X, y_values_multiclass) == {
        "warnings": warnings,
        "errors": [],
        "actions": []
    }
Example #27
def test_fit(mock_predict, mock_fit, mock_fit_transform, example_graph, X_y_binary):
    X, y = X_y_binary
    mock_fit_transform.return_value = ww.DataTable(X)
    mock_predict.return_value = ww.DataColumn(y)
    component_graph = ComponentGraph(example_graph).instantiate({})
    component_graph.fit(X, y)

    assert mock_fit_transform.call_count == 3
    assert mock_fit.call_count == 3
    assert mock_predict.call_count == 2
Example #28
def test_target_leakage_multi():
    leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8)

    # test empty pd.DataFrame, empty pd.Series
    assert leakage_check.validate(pd.DataFrame(), pd.Series()) == {
        "warnings": [],
        "errors": [],
        "actions": []
    }

    y = pd.Series([1, 0, 2, 1, 2, 0])
    X = pd.DataFrame()
    X["a"] = y * 3
    X["b"] = y - 1
    X["c"] = y / 10
    X["d"] = [0, 0, 0, 0, 0, 0]
    X["e"] = ["a", "b", "c", "a", "b", "c"]

    expected_messages = {
        "warnings": [
            DataCheckWarning(message="Column 'a' is 80.0% or more correlated with the target",
                             data_check_name=target_leakage_data_check_name,
                             message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                             details={"column": "a"}).to_dict(),
            DataCheckWarning(message="Column 'b' is 80.0% or more correlated with the target",
                             data_check_name=target_leakage_data_check_name,
                             message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                             details={"column": "b"}).to_dict(),
            DataCheckWarning(message="Column 'c' is 80.0% or more correlated with the target",
                             data_check_name=target_leakage_data_check_name,
                             message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                             details={"column": "c"}).to_dict()
        ],
        "errors": [],
        "actions": []
    }

    # test X as ww.DataTable, y as ww.DataColumn
    assert leakage_check.validate(ww.DataTable(X),
                                  ww.DataColumn(y)) == expected_messages

    # test y as np.ndarray
    assert leakage_check.validate(X, y.values) == expected_messages
Example #29
def test_component_graph_evaluation_plumbing(mock_transa, mock_transb, mock_transc, mock_preda, mock_predb, mock_predc, dummy_components):
    TransformerA, TransformerB, TransformerC, EstimatorA, EstimatorB, EstimatorC = dummy_components
    mock_transa.return_value = ww.DataTable(pd.DataFrame({'feature trans': [1, 0, 0, 0, 0, 0], 'feature a': np.ones(6)}))
    mock_transb.return_value = ww.DataTable(pd.DataFrame({'feature b': np.ones(6) * 2}))
    mock_transc.return_value = ww.DataTable(pd.DataFrame({'feature c': np.ones(6) * 3}))
    mock_preda.return_value = ww.DataColumn(pd.Series([0, 0, 0, 1, 0, 0]))
    mock_predb.return_value = ww.DataColumn(pd.Series([0, 0, 0, 0, 1, 0]))
    mock_predc.return_value = ww.DataColumn(pd.Series([0, 0, 0, 0, 0, 1]))
    graph = {
        'transformer a': [TransformerA],
        'transformer b': [TransformerB, 'transformer a'],
        'transformer c': [TransformerC, 'transformer a', 'transformer b'],
        'estimator a': [EstimatorA],
        'estimator b': [EstimatorB, 'transformer a'],
        'estimator c': [EstimatorC, 'transformer a', 'estimator a', 'transformer b', 'estimator b', 'transformer c']
    }
    component_graph = ComponentGraph(graph)
    component_graph.instantiate({})
    X = pd.DataFrame({'feature1': np.zeros(6), 'feature2': np.zeros(6)})
    y = pd.Series(np.zeros(6))
    component_graph.fit(X, y)
    predict_out = component_graph.predict(X)

    assert_frame_equal(mock_transa.call_args[0][0].to_dataframe(), X)
    assert_frame_equal(mock_transb.call_args[0][0].to_dataframe(), pd.DataFrame({'feature trans': pd.Series([1, 0, 0, 0, 0, 0], dtype="Int64"),
                                                                                 'feature a': np.ones(6)}, columns=['feature trans', 'feature a']))
    assert_frame_equal(mock_transc.call_args[0][0].to_dataframe(), pd.DataFrame({'feature trans': pd.Series([1, 0, 0, 0, 0, 0], dtype="Int64"),
                                                                                 'feature a': np.ones(6),
                                                                                 'feature b': np.ones(6) * 2},
                                                                                columns=['feature trans', 'feature a', 'feature b']))
    assert_frame_equal(mock_preda.call_args[0][0].to_dataframe(), X)
    assert_frame_equal(mock_predb.call_args[0][0].to_dataframe(), pd.DataFrame({'feature trans': pd.Series([1, 0, 0, 0, 0, 0], dtype="Int64"),
                                                                                'feature a': np.ones(6)},
                                                                               columns=['feature trans', 'feature a']))
    assert_frame_equal(mock_predc.call_args[0][0].to_dataframe(), pd.DataFrame({'feature trans': pd.Series([1, 0, 0, 0, 0, 0], dtype="Int64"),
                                                                                'feature a': np.ones(6),
                                                                                'estimator a': pd.Series([0, 0, 0, 1, 0, 0], dtype="Int64"),
                                                                                'feature b': np.ones(6) * 2,
                                                                                'estimator b': pd.Series([0, 0, 0, 0, 1, 0], dtype="Int64"),
                                                                                'feature c': np.ones(6) * 3},
                                                                               columns=['feature trans', 'feature a', 'estimator a', 'feature b', 'estimator b', 'feature c']))
    assert_series_equal(pd.Series([0, 0, 0, 0, 0, 1], dtype="Int64"), predict_out.to_series())
Example #30
def test_imputer_all_bool_return_original(data_type):
    X = pd.DataFrame([True, True, False, True, True], dtype=bool)
    X_expected_arr = pd.DataFrame([True, True, False, True, True], dtype=bool)
    y = pd.Series([1, 0, 0, 1, 0])
    if data_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    imputer = Imputer()
    imputer.fit(X, y)
    X_t = imputer.transform(X)
    assert_frame_equal(X_expected_arr, X_t.to_dataframe())