示例#1
0
def test_imputer_fill_value(imputer_test_data):
    """Check constant-strategy imputation against a hand-written expected frame.

    The same configuration is exercised twice: once through ``fit`` followed
    by ``transform`` and once through ``fit_transform``.
    """
    selected_columns = [
        "int with nan", "categorical with nan", "float with nan",
        "object with nan", "bool col with nan"
    ]
    X = imputer_test_data[selected_columns]
    y = pd.Series([0, 0, 1, 0, 1])
    constant_config = {
        "categorical_impute_strategy": "constant",
        "numeric_impute_strategy": "constant",
        "categorical_fill_value": "fill",
        "numeric_fill_value": -1,
    }

    def as_category(values):
        # Expected categorical columns are all built the same way.
        return pd.Series(values, dtype='category')

    expected = pd.DataFrame({
        "int with nan": [-1, 1, 0, 0, 1],
        "categorical with nan": as_category(["fill", "1", "fill", "0", "3"]),
        "float with nan": [0.0, 1.0, -1, -1.0, 0.],
        "object with nan": as_category(["b", "b", "fill", "c", "fill"]),
        "bool col with nan": as_category([True, "fill", False, "fill", True]),
    })

    # Two-step path: fit, then transform.
    two_step = Imputer(**constant_config)
    two_step.fit(X, y)
    result = two_step.transform(X, y)
    assert_frame_equal(expected, result.to_dataframe(), check_dtype=False)

    # One-step path on a fresh instance.
    one_step = Imputer(**constant_config)
    result = one_step.fit_transform(X, y)
    assert_frame_equal(expected, result.to_dataframe(), check_dtype=False)
示例#2
0
def test_imputer_with_none():
    """Check that ``None`` entries are imputed by a default-configured ``Imputer``.

    The expected frame has no "all None" column, i.e. the test expects that
    column to be absent from the output. Both the ``fit`` + ``transform`` path
    and the ``fit_transform`` path are compared against the same expectation.
    """
    X = pd.DataFrame({
        "int with None": [1, 0, 5, None],
        "float with None": [0.1, 0.0, 0.5, None],
        "category with None":
        pd.Series(["b", "a", "a", None], dtype='category'),
        "boolean with None": [True, None, False, True],
        "object with None": ["b", "a", "a", None],
        "all None": [None, None, None, None]
    })
    # One target value per row of X (the frame has four rows); the original
    # target had five entries, which did not line up with the features.
    y = pd.Series([0, 0, 1, 0])
    imputer = Imputer()
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    expected = pd.DataFrame({
        "int with None": [1, 0, 5, 2],
        "float with None": [0.1, 0.0, 0.5, 0.2],
        "category with None":
        pd.Series(["b", "a", "a", "a"], dtype='category'),
        "boolean with None": [True, True, False, True],
        "object with None":
        pd.Series(["b", "a", "a", "a"], dtype='category')
    })
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)

    # A fresh instance going through fit_transform must give the same frame.
    imputer = Imputer()
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)
示例#3
0
def test_categorical_only_input(imputer_test_data):
    """Check default imputation on the categorical/object/bool fixture columns.

    The expected frame has no "all nan cat" column, i.e. the test expects that
    column to be absent from the output. Both the ``fit`` + ``transform`` path
    and the ``fit_transform`` path are verified.
    """
    X = imputer_test_data[[
        "categorical col", "object col", "bool col", "categorical with nan",
        "object with nan", "bool col with nan", "all nan cat"
    ]]
    y = pd.Series([0, 0, 1, 0, 1])
    imputer = Imputer()
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    expected = pd.DataFrame({
        "categorical col":
        pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
        "object col":
        pd.Series(["b", "b", "a", "c", "d"], dtype='category'),
        "bool col": [True, False, False, True, True],
        "categorical with nan":
        pd.Series(["0", "1", "0", "0", "3"], dtype='category'),
        "object with nan":
        pd.Series(["b", "b", "b", "c", "b"], dtype='category'),
        "bool col with nan": [True, True, False, True, True]
    })
    # Previously the fit + transform result was computed but never checked.
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)

    imputer = Imputer()
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)
示例#4
0
def test_categorical_and_numeric_input(imputer_test_data):
    """Run a default ``Imputer`` over the entire fixture frame.

    The output of ``fit`` + ``transform`` and of ``fit_transform`` is compared
    with one expected frame covering both categorical and numeric columns.
    """
    X = imputer_test_data
    y = pd.Series([0, 0, 1, 0, 1])

    def as_category(values):
        return pd.Series(values, dtype='category')

    expected = pd.DataFrame({
        "categorical col": as_category(["zero", "one", "two", "zero", "three"]),
        "int col": [0, 1, 2, 0, 3],
        "object col": as_category(["b", "b", "a", "c", "d"]),
        "float col": [0.0, 1.0, 0.0, -2.0, 5.],
        "bool col": [True, False, False, True, True],
        "categorical with nan": as_category(["0", "1", "0", "0", "3"]),
        "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0],
        "float with nan": [0.0, 1.0, 0, -1.0, 0.],
        "object with nan": as_category(["b", "b", "b", "c", "b"]),
        "bool col with nan": [True, True, False, True, True],
    })

    # Two-step path.
    two_step = Imputer()
    two_step.fit(X, y)
    result = two_step.transform(X, y)
    assert_frame_equal(result.to_dataframe(), expected, check_dtype=False)

    # One-step path on a fresh instance.
    result = Imputer().fit_transform(X, y)
    assert_frame_equal(result.to_dataframe(), expected, check_dtype=False)
def test_invalid_init():
    """Check that ``ComponentGraph`` rejects malformed graph definitions."""
    not_a_list_message = 'All component information should be passed in as a list'
    bad_entry_message = 'may only contain str or ComponentBase subclasses'

    # The 'OHE' entry is a bare class rather than a list.
    graph_with_bare_class = {'Imputer': [Imputer], 'OHE': OneHotEncoder}
    with pytest.raises(ValueError, match=not_a_list_message):
        ComponentGraph(graph_with_bare_class)

    # A component instance (instead of a class) appears inside a list.
    with pytest.raises(ValueError, match=bad_entry_message):
        ComponentGraph({
            'Imputer': [Imputer(numeric_impute_strategy="most_frequent")],
            'OneHot': [OneHotEncoder]
        })

    # Same problem with a graph holding a single instance entry.
    configured_imputer = Imputer(numeric_impute_strategy='constant',
                                 numeric_fill_value=0)
    graph_with_instance = {'Imputer': [configured_imputer]}
    with pytest.raises(ValueError, match=bad_entry_message):
        ComponentGraph(graph_with_instance)

    # String entries that are expected to fail with MissingComponentError.
    graph_with_unknown_names = {
        'Imputer': ['Imputer', 'Fake'],
        'Fake': ['Fake Component', 'Estimator'],
        'Estimator': [ElasticNetClassifier]
    }
    with pytest.raises(MissingComponentError):
        ComponentGraph(graph_with_unknown_names)
示例#6
0
def test_drop_all_columns(imputer_test_data):
    """Check the imputer output when the input holds only the "all nan" columns.

    The expectation is the input frame with both columns removed, for the
    ``fit`` + ``transform`` path as well as for ``fit_transform``.
    """
    nan_only_columns = ["all nan cat", "all nan"]
    X = imputer_test_data[nan_only_columns]
    y = pd.Series([0, 0, 1, 0, 1])

    two_step = Imputer()
    two_step.fit(X, y)
    result = two_step.transform(X, y)
    # Same rows as X, but with no columns left.
    expected = X.drop(columns=nan_only_columns)
    assert_frame_equal(result.to_dataframe(), expected, check_dtype=False)

    result = Imputer().fit_transform(X, y)
    assert_frame_equal(result.to_dataframe(), expected, check_dtype=False)
示例#7
0
def test_typed_imputer_numpy_input():
    """Check that a default ``Imputer`` accepts a NumPy array containing NaNs."""
    nan = np.nan
    X = np.array([
        [1, 2, 2, 0],
        [nan, 0, 0, 0],
        [1, nan, nan, nan],
    ])
    y = pd.Series([0, 0, 1])
    expected_values = np.array([[1, 2, 2, 0], [1, 0, 0, 0], [1, 1, 1, 0]])
    expected = pd.DataFrame(expected_values)

    two_step = Imputer()
    two_step.fit(X, y)
    result = two_step.transform(X, y)
    assert_frame_equal(result.to_dataframe(), expected, check_dtype=False)

    result = Imputer().fit_transform(X, y)
    assert_frame_equal(result.to_dataframe(), expected, check_dtype=False)
示例#8
0
def test_imputer_empty_data(data_type, make_data_type):
    """Check that imputing empty input yields an empty frame.

    ``make_data_type`` converts the empty pandas objects to the container
    named by ``data_type`` before they are passed to the imputer.
    """
    X = make_data_type(data_type, pd.DataFrame())
    y = make_data_type(data_type, pd.Series())
    empty_frame = pd.DataFrame(index=pd.Index([]), columns=pd.Index([]))

    two_step = Imputer()
    two_step.fit(X, y)
    result = two_step.transform(X, y)
    assert_frame_equal(result.to_dataframe(), empty_frame, check_dtype=False)

    result = Imputer().fit_transform(X, y)
    assert_frame_equal(result.to_dataframe(), empty_frame, check_dtype=False)
示例#9
0
def test_imputer_datetime_input():
    """Check that datetime columns come back from the imputer equal to the input."""
    raw_dates = {'dates': ['20190902', '20200519', '20190607', np.nan],
                 'more dates': ['20190902', '20201010', '20190921', np.nan]}
    X = pd.DataFrame(raw_dates)
    # Parse both string columns into datetimes.
    for column in ('dates', 'more dates'):
        X[column] = pd.to_datetime(X[column], format='%Y%m%d')
    y = pd.Series()

    two_step = Imputer()
    two_step.fit(X, y)
    result = two_step.transform(X, y)
    assert_frame_equal(result.to_dataframe(), X, check_dtype=False)

    result = Imputer().fit_transform(X, y)
    assert_frame_equal(result.to_dataframe(), X, check_dtype=False)
示例#10
0
def test_imputer_all_bool_return_original(data_type, make_data_type):
    """Check that an all-boolean frame without missing values keeps its values.

    The expected frame uses the nullable ``'boolean'`` dtype.
    """
    flags = [True, True, False, True, True]
    X = make_data_type(data_type, pd.DataFrame(flags, dtype=bool))
    expected = pd.DataFrame(flags, dtype='boolean')
    y = make_data_type(data_type, pd.Series([1, 0, 0, 1, 0]))

    fitted = Imputer()
    fitted.fit(X, y)
    result = fitted.transform(X)
    assert_frame_equal(expected, result.to_dataframe())
示例#11
0
def test_imputer_no_nans(imputer_test_data):
    """Check constant-strategy imputation on the three fixture columns named without "nan".

    The expected frame holds the plain column values; no fill value appears.
    """
    X = imputer_test_data[["categorical col", "object col", "bool col"]]
    y = pd.Series([0, 0, 1, 0, 1])
    constant_config = {
        "categorical_impute_strategy": "constant",
        "numeric_impute_strategy": "constant",
        "categorical_fill_value": "fill",
        "numeric_fill_value": -1,
    }
    expected = pd.DataFrame({
        "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
        "object col": pd.Series(["b", "b", "a", "c", "d"], dtype='category'),
        "bool col": [True, False, False, True, True],
    })

    two_step = Imputer(**constant_config)
    two_step.fit(X, y)
    result = two_step.transform(X, y)
    assert_frame_equal(result.to_dataframe(), expected, check_dtype=False)

    one_step = Imputer(**constant_config)
    result = one_step.fit_transform(X, y)
    assert_frame_equal(result.to_dataframe(), expected, check_dtype=False)
示例#12
0
def test_numeric_only_input(imputer_test_data):
    """Check median imputation on the numeric fixture columns.

    The expected frame has no "all nan" column, i.e. the test expects that
    column to be absent from the output. Both the ``fit`` + ``transform`` path
    and the ``fit_transform`` path use the same median strategy so that they
    verify the same configuration.
    """
    X = imputer_test_data[["int col", "float col",
                           "int with nan", "float with nan", "all nan"]]
    y = pd.Series([0, 0, 1, 0, 1])
    imputer = Imputer(numeric_impute_strategy="median")
    imputer.fit(X, y)
    transformed = imputer.transform(X, y)
    expected = pd.DataFrame({
        "int col": [0, 1, 2, 0, 3],
        "float col": [0.0, 1.0, 0.0, -2.0, 5.],
        "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0],
        "float with nan": [0.0, 1.0, 0, -1.0, 0.]
    })
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)

    # Previously this half silently fell back to the default strategy, so the
    # fit_transform path was never checked with the median configuration.
    imputer = Imputer(numeric_impute_strategy="median")
    transformed = imputer.fit_transform(X, y)
    assert_frame_equal(transformed.to_dataframe(), expected, check_dtype=False)
示例#13
0
def test_imputer_default_parameters():
    """Check the ``parameters`` dict reported by an ``Imputer`` built with no arguments."""
    default_parameters = dict(
        categorical_impute_strategy='most_frequent',
        numeric_impute_strategy='mean',
        categorical_fill_value=None,
        numeric_fill_value=None,
    )
    assert Imputer().parameters == default_parameters
示例#14
0
def test_imputer_bool_dtype_object(data_type, make_data_type):
    """Check imputation of a nullable-boolean column containing missing values.

    The expected frame has ``True`` in both positions that were missing.
    """
    raw_X = pd.DataFrame([True, np.nan, False, np.nan, True], dtype='boolean')
    raw_y = pd.Series([1, 0, 0, 1, 0])
    expected = pd.DataFrame([True, True, False, True, True], dtype='boolean')
    X = make_data_type(data_type, raw_X)
    y = make_data_type(data_type, raw_y)

    fitted = Imputer()
    fitted.fit(X, y)
    result = fitted.transform(X)
    assert_frame_equal(expected, result.to_dataframe())
示例#15
0
def test_iteration(example_graph):
    """Check what iterating a ``ComponentGraph`` yields before and after ``instantiate``.

    Before instantiation the expected items are component classes; afterwards
    they are component instances, with the supplied parameters applied.
    """
    graph = ComponentGraph(example_graph)

    expected_classes = [
        Imputer,
        OneHotEncoder,
        ElasticNetClassifier,
        OneHotEncoder,
        RandomForestClassifier,
        LogisticRegressionClassifier,
    ]
    assert list(graph) == expected_classes

    graph.instantiate({'OneHot_RandomForest': {'top_n': 32}})
    expected_instances = [
        Imputer(),
        OneHotEncoder(),
        ElasticNetClassifier(),
        OneHotEncoder(top_n=32),
        RandomForestClassifier(),
        LogisticRegressionClassifier(),
    ]
    assert list(graph) == expected_instances
示例#16
0
def test_imputer_all_bool_return_original(data_type):
    """Check that an all-boolean frame without missing values is returned unchanged.

    When ``data_type`` is ``'ww'`` the inputs are wrapped in Woodwork
    containers before being handed to the imputer.
    """
    flags = [True, True, False, True, True]
    X = pd.DataFrame(flags, dtype=bool)
    expected = pd.DataFrame(flags, dtype=bool)
    y = pd.Series([1, 0, 0, 1, 0])
    if data_type == 'ww':
        X, y = ww.DataTable(X), ww.DataColumn(y)

    fitted = Imputer()
    fitted.fit(X, y)
    result = fitted.transform(X)
    assert_frame_equal(expected, result)
示例#17
0
def test_imputer_empty_data(data_type):
    """Check that imputing empty input yields an empty frame.

    ``'pd'`` and ``'ww'`` inputs share one expectation; any other
    ``data_type`` uses NumPy arrays, whose expectation has a single row
    index and an integer column index.
    """
    if data_type == 'pd':
        X, y = pd.DataFrame(), pd.Series()
        expected = pd.DataFrame(index=pd.Index([]), columns=pd.Index([]))
    elif data_type == 'ww':
        X, y = ww.DataTable(pd.DataFrame()), ww.DataColumn(pd.Series())
        expected = pd.DataFrame(index=pd.Index([]), columns=pd.Index([]))
    else:
        X, y = np.array([[]]), np.array([])
        expected = pd.DataFrame(index=pd.Index([0]), columns=pd.Int64Index([]))

    two_step = Imputer()
    two_step.fit(X, y)
    result = two_step.transform(X, y)
    assert_frame_equal(result, expected, check_dtype=False)

    result = Imputer().fit_transform(X, y)
    assert_frame_equal(result, expected, check_dtype=False)
示例#18
0
def test_generate_code():
    """Check the source text produced by ``generate_component_code`` for three components."""
    def check(component, import_line, construction_line):
        # The generated code is the import line, a blank line, then the constructor call.
        assert generate_component_code(component) == import_line + "\n\n" + construction_line

    check(
        LogisticRegressionClassifier(),
        "from evalml.pipelines.components.estimators.classifiers.logistic_regression import LogisticRegressionClassifier",
        "logisticRegressionClassifier = LogisticRegressionClassifier(**{'penalty': 'l2', 'C': 1.0, 'n_jobs': -1, 'multi_class': 'auto', 'solver': 'lbfgs'})",
    )
    check(
        ExtraTreesRegressor(n_estimators=50),
        "from evalml.pipelines.components.estimators.regressors.et_regressor import ExtraTreesRegressor",
        "extraTreesRegressor = ExtraTreesRegressor(**{'n_estimators': 50, 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1})",
    )
    check(
        Imputer(),
        "from evalml.pipelines.components.transformers.imputers.imputer import Imputer",
        "imputer = Imputer(**{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None})",
    )
示例#19
0
def test_imputer_does_not_reset_index():
    """Check that the imputer output keeps the input's non-default index.

    Row 0 is dropped before fitting, so the index starts at 1; the expected
    frame is built with that same index.
    """
    X = pd.DataFrame({
        'input_val': np.arange(10),
        'target': np.arange(10),
        'input_cat': ['a'] * 7 + ['b'] * 3,
    })
    # Punch one hole into each feature column.
    X.loc[5, 'input_val'] = np.nan
    X.loc[5, 'input_cat'] = np.nan
    assert X.index.tolist() == list(range(10))

    X = X.drop(0)
    y = X.pop('target')

    fitted = Imputer()
    fitted.fit(X, y=y)
    result = fitted.transform(X)

    expected = pd.DataFrame(
        {'input_val': [1.0, 2, 3, 4, 5, 6, 7, 8, 9],
         'input_cat': pd.Categorical(['a'] * 6 + ['b'] * 3)},
        index=list(range(1, 10)))
    pd.testing.assert_frame_equal(result.to_dataframe(), expected)
示例#20
0
def test_imputer_multitype_with_one_bool(data_type, make_data_type):
    """Check imputation of a frame with one nullable-boolean and one plain-bool column.

    The expected frame fills the missing entries with ``False`` and uses the
    ``'boolean'`` dtype for both columns.
    """
    raw_X = pd.DataFrame({
        "bool with nan": pd.Series([True, np.nan, False, np.nan, False], dtype='boolean'),
        "bool no nan": pd.Series([False, False, False, False, True], dtype=bool),
    })
    raw_y = pd.Series([1, 0, 0, 1, 0])
    expected = pd.DataFrame({
        "bool with nan": pd.Series([True, False, False, False, False], dtype='boolean'),
        "bool no nan": pd.Series([False, False, False, False, True], dtype='boolean'),
    })

    X = make_data_type(data_type, raw_X)
    y = make_data_type(data_type, raw_y)

    fitted = Imputer()
    fitted.fit(X, y)
    result = fitted.transform(X)
    assert_frame_equal(expected, result.to_dataframe())
示例#21
0
def test_imputer_init(categorical_impute_strategy, numeric_impute_strategy):
    """Check the name, parameters and hyperparameter ranges of a configured ``Imputer``.

    Args:
        categorical_impute_strategy: Strategy passed for categorical columns.
        numeric_impute_strategy: Strategy passed for numeric columns.
    """
    init_kwargs = {
        'categorical_impute_strategy': categorical_impute_strategy,
        'numeric_impute_strategy': numeric_impute_strategy,
        'categorical_fill_value': 'str_fill_value',
        'numeric_fill_value': -1
    }
    imputer = Imputer(**init_kwargs)

    # The reported parameters must mirror exactly what was passed in.
    assert imputer.name == "Imputer"
    assert imputer.parameters == init_kwargs
    assert imputer.hyperparameter_ranges == {
        "categorical_impute_strategy": ["most_frequent"],
        "numeric_impute_strategy": ["mean", "median", "most_frequent"]
    }
def test_generate_code_pipeline_json_with_objects():
    """Check ``generate_pipeline_code`` when parameters hold non-JSON objects.

    Two pipelines are generated: one whose custom estimator receives a NumPy
    array parameter and one whose custom estimator receives an ``Imputer``
    instance as a parameter.
    """
    class CustomEstimator(Estimator):
        name = "My Custom Estimator"
        hyperparameter_ranges = {}
        supported_problem_types = [
            ProblemTypes.BINARY, ProblemTypes.MULTICLASS
        ]
        model_family = ModelFamily.NONE

        # NOTE: the ``[]`` default is kept on purpose; the second expected
        # string below contains ``'numpy_arg': []``.
        def __init__(self, random_arg=False, numpy_arg=[], random_seed=0):
            super().__init__(parameters={'random_arg': random_arg,
                                         'numpy_arg': numpy_arg},
                             component_obj=None,
                             random_seed=random_seed)

    component_graph = ['Imputer', CustomEstimator]

    def build_pipeline(estimator_parameters):
        return BinaryClassificationPipeline(
            component_graph,
            custom_name="Mock Binary Pipeline with Transformer",
            parameters={'My Custom Estimator': estimator_parameters})

    # Both expected outputs start with the same import, graph and Imputer parameters.
    shared_prefix = (
        "from evalml.pipelines.binary_classification_pipeline import BinaryClassificationPipeline\n"
        "pipeline = BinaryClassificationPipeline(component_graph=['Imputer', CustomEstimator], "
        "parameters={'Imputer':{'categorical_impute_strategy': 'most_frequent', 'numeric_impute_strategy': 'mean', 'categorical_fill_value': None, 'numeric_fill_value': None}, "
    )

    array_pipeline = build_pipeline({'numpy_arg': np.array([0])})
    array_code = generate_pipeline_code(array_pipeline)
    assert array_code == shared_prefix + (
        "'My Custom Estimator':{'random_arg': False, 'numpy_arg': array([0])}}, custom_name='Mock Binary Pipeline with Transformer', random_seed=0)"
    )

    object_pipeline = build_pipeline({'random_arg': Imputer()})
    object_code = generate_pipeline_code(object_pipeline)
    assert object_code == shared_prefix + (
        "'My Custom Estimator':{'random_arg': Imputer(categorical_impute_strategy='most_frequent', numeric_impute_strategy='mean', categorical_fill_value=None, numeric_fill_value=None), 'numpy_arg': []}}, "
        "custom_name='Mock Binary Pipeline with Transformer', random_seed=0)"
    )
示例#23
0
def test_imputer_multitype_with_one_bool(data_type):
    """Check imputation of a frame with one object-typed and one plain-bool column.

    When ``data_type`` is ``'ww'`` the inputs are wrapped in Woodwork
    containers first. The expected frame fills the missing entries with
    ``False`` and keeps the original dtypes.
    """
    def column(values, dtype):
        return pd.Series(values, dtype=dtype)

    X = pd.DataFrame({
        "bool with nan": column([True, np.nan, False, np.nan, False], object),
        "bool no nan": column([False, False, False, False, True], bool),
    })
    y = pd.Series([1, 0, 0, 1, 0])
    expected = pd.DataFrame({
        "bool with nan": column([True, False, False, False, False], object),
        "bool no nan": column([False, False, False, False, True], bool),
    })
    if data_type == 'ww':
        X, y = ww.DataTable(X), ww.DataColumn(y)

    fitted = Imputer()
    fitted.fit(X, y)
    result = fitted.transform(X)
    assert_frame_equal(expected, result)
示例#24
0
def test_imputer_woodwork_custom_overrides_returned_by_components(X_df, has_nan, numeric_impute_strategy):
    """Check which Woodwork logical type the imputer output reports for column 0.

    Args:
        X_df: Input frame; its last row of column 0 is set to NaN when
            ``has_nan`` is true.
        has_nan: Whether to inject a missing value.
        numeric_impute_strategy: Strategy passed to the ``Imputer``.
    """
    y = pd.Series([1, 2, 1])
    if has_nan:
        X_df.iloc[len(X_df) - 1, 0] = np.nan

    for logical_type in [Integer, Double, Categorical, NaturalLanguage, Boolean]:
        try:
            X = ww.DataTable(X_df, logical_types={0: logical_type})
        except TypeError:
            # Skip overrides for which building the DataTable raises TypeError.
            continue

        fitted = Imputer(numeric_impute_strategy=numeric_impute_strategy)
        fitted.fit(X, y)
        result = fitted.transform(X, y)
        assert isinstance(result, ww.DataTable)

        # The override is expected to be kept in these three situations;
        # otherwise the column is expected to be reported as Double.
        override_kept = (numeric_impute_strategy == "most_frequent"
                         or logical_type in [Categorical, NaturalLanguage]
                         or not has_nan)
        expected_type = logical_type if override_kept else Double
        assert result.logical_types == {0: expected_type}
def test_generate_code_pipeline_errors():
    """Check that ``generate_pipeline_code`` rejects anything but a pipeline instance.

    Pipeline classes, lists of component classes and lists of component
    instances must all raise ``ValueError``.
    """
    class MockBinaryPipeline(BinaryClassificationPipeline):
        name = "Mock Binary Pipeline"
        component_graph = ['Imputer', 'Random Forest Classifier']

    class MockMulticlassPipeline(MulticlassClassificationPipeline):
        name = "Mock Multiclass Pipeline"
        component_graph = ['Imputer', 'Random Forest Classifier']

    class MockRegressionPipeline(RegressionPipeline):
        name = "Mock Regression Pipeline"
        component_graph = ['Imputer', 'Random Forest Regressor']

    def assert_rejected(element):
        with pytest.raises(ValueError,
                           match="Element must be a pipeline instance"):
            generate_pipeline_code(element)

    # Pipeline classes (not instances).
    assert_rejected(MockBinaryPipeline)
    assert_rejected(MockMulticlassPipeline)
    assert_rejected(MockRegressionPipeline)

    # Lists of component classes and of component instances.
    assert_rejected([Imputer])
    assert_rejected([Imputer, LogisticRegressionClassifier])
    assert_rejected([Imputer(), LogisticRegressionClassifier()])
示例#26
0
def test_describe_component():
    """Check ``describe(return_dict=True)`` for a range of components.

    Each component is constructed and its description is compared with a
    literal ``{'name': ..., 'parameters': ...}`` dict. Transformers are
    checked first, then estimators, then three groups of estimators that are
    each wrapped in ``try`` / ``except ImportError``.
    """
    # Transformers under test.
    enc = OneHotEncoder()
    imputer = Imputer()
    simple_imputer = SimpleImputer("mean")
    column_imputer = PerColumnImputer({"a": "mean", "b": ("constant", 100)})
    scaler = StandardScaler()
    feature_selection_clf = RFClassifierSelectFromModel(n_estimators=10, number_features=5, percent_features=0.3, threshold=-np.inf)
    feature_selection_reg = RFRegressorSelectFromModel(n_estimators=10, number_features=5, percent_features=0.3, threshold=-np.inf)
    drop_col_transformer = DropColumns(columns=['col_one', 'col_two'])
    drop_null_transformer = DropNullColumns()
    datetime = DateTimeFeaturizer()
    text_featurizer = TextFeaturizer()
    lsa = LSA()
    pca = PCA()
    lda = LinearDiscriminantAnalysis()
    ft = DFSTransformer()
    us = Undersampler()
    assert enc.describe(return_dict=True) == {'name': 'One Hot Encoder', 'parameters': {'top_n': 10,
                                                                                        'features_to_encode': None,
                                                                                        'categories': None,
                                                                                        'drop': 'if_binary',
                                                                                        'handle_unknown': 'ignore',
                                                                                        'handle_missing': 'error'}}
    assert imputer.describe(return_dict=True) == {'name': 'Imputer', 'parameters': {'categorical_impute_strategy': "most_frequent",
                                                                                    'categorical_fill_value': None,
                                                                                    'numeric_impute_strategy': "mean",
                                                                                    'numeric_fill_value': None}}
    assert simple_imputer.describe(return_dict=True) == {'name': 'Simple Imputer', 'parameters': {'impute_strategy': 'mean', 'fill_value': None}}
    assert column_imputer.describe(return_dict=True) == {'name': 'Per Column Imputer', 'parameters': {'impute_strategies': {'a': 'mean', 'b': ('constant', 100)}, 'default_impute_strategy': 'most_frequent'}}
    assert scaler.describe(return_dict=True) == {'name': 'Standard Scaler', 'parameters': {}}
    assert feature_selection_clf.describe(return_dict=True) == {'name': 'RF Classifier Select From Model', 'parameters': {'number_features': 5, 'n_estimators': 10, 'max_depth': None, 'percent_features': 0.3, 'threshold': -np.inf, 'n_jobs': -1}}
    assert feature_selection_reg.describe(return_dict=True) == {'name': 'RF Regressor Select From Model', 'parameters': {'number_features': 5, 'n_estimators': 10, 'max_depth': None, 'percent_features': 0.3, 'threshold': -np.inf, 'n_jobs': -1}}
    assert drop_col_transformer.describe(return_dict=True) == {'name': 'Drop Columns Transformer', 'parameters': {'columns': ['col_one', 'col_two']}}
    assert drop_null_transformer.describe(return_dict=True) == {'name': 'Drop Null Columns Transformer', 'parameters': {'pct_null_threshold': 1.0}}
    assert datetime.describe(return_dict=True) == {'name': 'DateTime Featurization Component',
                                                   'parameters': {'features_to_extract': ['year', 'month', 'day_of_week', 'hour'],
                                                                  'encode_as_categories': False}}
    assert text_featurizer.describe(return_dict=True) == {'name': 'Text Featurization Component', 'parameters': {}}
    assert lsa.describe(return_dict=True) == {'name': 'LSA Transformer', 'parameters': {}}
    assert pca.describe(return_dict=True) == {'name': 'PCA Transformer', 'parameters': {'n_components': None, 'variance': 0.95}}
    assert lda.describe(return_dict=True) == {'name': 'Linear Discriminant Analysis Transformer', 'parameters': {'n_components': None}}
    assert ft.describe(return_dict=True) == {'name': 'DFS Transformer', 'parameters': {"index": "index"}}
    assert us.describe(return_dict=True) == {'name': 'Undersampler', 'parameters': {"balanced_ratio": 4, "min_samples": 100, "min_percentage": 0.1}}
    # testing estimators
    base_classifier = BaselineClassifier()
    base_regressor = BaselineRegressor()
    lr_classifier = LogisticRegressionClassifier()
    en_classifier = ElasticNetClassifier()
    en_regressor = ElasticNetRegressor()
    et_classifier = ExtraTreesClassifier(n_estimators=10, max_features="auto")
    et_regressor = ExtraTreesRegressor(n_estimators=10, max_features="auto")
    rf_classifier = RandomForestClassifier(n_estimators=10, max_depth=3)
    rf_regressor = RandomForestRegressor(n_estimators=10, max_depth=3)
    linear_regressor = LinearRegressor()
    svm_classifier = SVMClassifier()
    svm_regressor = SVMRegressor()
    assert base_classifier.describe(return_dict=True) == {'name': 'Baseline Classifier', 'parameters': {'strategy': 'mode'}}
    assert base_regressor.describe(return_dict=True) == {'name': 'Baseline Regressor', 'parameters': {'strategy': 'mean'}}
    assert lr_classifier.describe(return_dict=True) == {'name': 'Logistic Regression Classifier', 'parameters': {'penalty': 'l2', 'C': 1.0, 'n_jobs': -1, 'multi_class': 'auto', 'solver': 'lbfgs'}}
    assert en_classifier.describe(return_dict=True) == {'name': 'Elastic Net Classifier', 'parameters': {'alpha': 0.5, 'l1_ratio': 0.5, 'n_jobs': -1, 'max_iter': 1000, "loss": 'log', 'penalty': 'elasticnet'}}
    assert en_regressor.describe(return_dict=True) == {'name': 'Elastic Net Regressor', 'parameters': {'alpha': 0.5, 'l1_ratio': 0.5, 'max_iter': 1000, 'normalize': False}}
    assert et_classifier.describe(return_dict=True) == {'name': 'Extra Trees Classifier', 'parameters': {'n_estimators': 10, 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1}}
    assert et_regressor.describe(return_dict=True) == {'name': 'Extra Trees Regressor', 'parameters': {'n_estimators': 10, 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1}}
    assert rf_classifier.describe(return_dict=True) == {'name': 'Random Forest Classifier', 'parameters': {'n_estimators': 10, 'max_depth': 3, 'n_jobs': -1}}
    assert rf_regressor.describe(return_dict=True) == {'name': 'Random Forest Regressor', 'parameters': {'n_estimators': 10, 'max_depth': 3, 'n_jobs': -1}}
    assert linear_regressor.describe(return_dict=True) == {'name': 'Linear Regressor', 'parameters': {'fit_intercept': True, 'normalize': False, 'n_jobs': -1}}
    assert svm_classifier.describe(return_dict=True) == {'name': 'SVM Classifier', 'parameters': {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale', 'probability': True}}
    assert svm_regressor.describe(return_dict=True) == {'name': 'SVM Regressor', 'parameters': {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale'}}
    # An ImportError raised inside each of the following groups is swallowed so
    # the remaining checks still run (presumably these estimators rely on
    # optional packages -- TODO confirm).
    try:
        xgb_classifier = XGBoostClassifier(eta=0.1, min_child_weight=1, max_depth=3, n_estimators=75)
        xgb_regressor = XGBoostRegressor(eta=0.1, min_child_weight=1, max_depth=3, n_estimators=75)
        assert xgb_classifier.describe(return_dict=True) == {'name': 'XGBoost Classifier', 'parameters': {'eta': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 75}}
        assert xgb_regressor.describe(return_dict=True) == {'name': 'XGBoost Regressor', 'parameters': {'eta': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 75}}
    except ImportError:
        pass
    try:
        cb_classifier = CatBoostClassifier()
        cb_regressor = CatBoostRegressor()
        # NOTE(review): the expected 'silent' value differs between classifier
        # (True) and regressor (False) -- confirm this is intended.
        assert cb_classifier.describe(return_dict=True) == {'name': 'CatBoost Classifier', 'parameters': {'allow_writing_files': False, 'n_estimators': 10, 'eta': 0.03, 'max_depth': 6, 'bootstrap_type': None, 'silent': True}}
        assert cb_regressor.describe(return_dict=True) == {'name': 'CatBoost Regressor', 'parameters': {'allow_writing_files': False, 'n_estimators': 10, 'eta': 0.03, 'max_depth': 6, 'bootstrap_type': None, 'silent': False}}
    except ImportError:
        pass
    try:
        lg_classifier = LightGBMClassifier()
        lg_regressor = LightGBMRegressor()
        assert lg_classifier.describe(return_dict=True) == {'name': 'LightGBM Classifier', 'parameters': {'boosting_type': 'gbdt', 'learning_rate': 0.1, 'n_estimators': 100, 'max_depth': 0, 'num_leaves': 31,
                                                                                                          'min_child_samples': 20, 'n_jobs': -1, 'bagging_fraction': 0.9, 'bagging_freq': 0}}
        assert lg_regressor.describe(return_dict=True) == {'name': 'LightGBM Regressor', 'parameters': {'boosting_type': 'gbdt', 'learning_rate': 0.1, 'n_estimators': 20, 'max_depth': 0, 'num_leaves': 31,
                                                                                                        'min_child_samples': 20, 'n_jobs': -1, 'bagging_fraction': 0.9, 'bagging_freq': 0}}
    except ImportError:
        pass
示例#27
0
def test_make_pipeline_from_components(X_y_binary, logistic_regression_binary_pipeline_class):
    """Check ``make_pipeline_from_components`` for invalid and valid inputs.

    The test covers, in order:
    * the error cases (no final estimator, unknown problem type, non-string
      custom name, component classes or names instead of instances);
    * a two-component pipeline, whose classes, seeds, name and parameters
      are verified;
    * a pipeline built from a single custom estimator;
    * a round trip in which a pipeline rebuilt from another pipeline's
      components must match it on predictions, feature importance,
      parameters and description.

    Args:
        X_y_binary: Fixture unpacked as ``(X, y)`` and used to fit both pipelines.
        logistic_regression_binary_pipeline_class: Fixture providing the
            pipeline class used for the round trip.
    """
    with pytest.raises(ValueError, match="Pipeline needs to have an estimator at the last position of the component list"):
        make_pipeline_from_components([Imputer()], problem_type='binary')

    with pytest.raises(KeyError, match="Problem type 'invalid_type' does not exist"):
        make_pipeline_from_components([RandomForestClassifier()], problem_type='invalid_type')

    with pytest.raises(TypeError, match="Custom pipeline name must be a string"):
        make_pipeline_from_components([RandomForestClassifier()], problem_type='binary', custom_name=True)

    with pytest.raises(TypeError, match="Every element of `component_instances` must be an instance of ComponentBase"):
        make_pipeline_from_components([RandomForestClassifier], problem_type='binary')

    with pytest.raises(TypeError, match="Every element of `component_instances` must be an instance of ComponentBase"):
        make_pipeline_from_components(['RandomForestClassifier'], problem_type='binary')

    imp = Imputer(numeric_impute_strategy='median', random_seed=5)
    est = RandomForestClassifier(random_seed=7)
    pipeline = make_pipeline_from_components([imp, est], ProblemTypes.BINARY, custom_name='My Pipeline',
                                             random_seed=15)
    assert [c.__class__ for c in pipeline] == [Imputer, RandomForestClassifier]
    # Asserting the bare list was always true (a non-empty list is truthy);
    # ``all`` actually checks that every component received the pipeline seed.
    assert all(c.random_seed == 15 for c in pipeline)
    assert pipeline.problem_type == ProblemTypes.BINARY
    assert pipeline.custom_name == 'My Pipeline'
    expected_parameters = {
        'Imputer': {
            'categorical_impute_strategy': 'most_frequent',
            'numeric_impute_strategy': 'median',
            'categorical_fill_value': None,
            'numeric_fill_value': None},
        'Random Forest Classifier': {
            'n_estimators': 100,
            'max_depth': 6,
            'n_jobs': -1}
    }
    assert pipeline.parameters == expected_parameters
    assert pipeline.random_seed == 15

    class DummyEstimator(Estimator):
        name = "Dummy!"
        model_family = "foo"
        supported_problem_types = [ProblemTypes.BINARY]
        parameters = {'bar': 'baz'}
    random_seed = 42
    pipeline = make_pipeline_from_components([DummyEstimator(random_seed=3)], ProblemTypes.BINARY,
                                             random_seed=random_seed)
    components_list = [c for c in pipeline]
    assert len(components_list) == 1
    assert isinstance(components_list[0], DummyEstimator)
    # The pipeline-level seed replaces the seed the component was built with.
    assert components_list[0].random_seed == random_seed
    expected_parameters = {'Dummy!': {'bar': 'baz'}}
    assert pipeline.parameters == expected_parameters
    assert pipeline.random_seed == random_seed

    # Round trip: rebuild a pipeline from an existing pipeline's components.
    X, y = X_y_binary
    pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}},
                                                         random_seed=42)
    component_instances = [c for c in pipeline]
    new_pipeline = make_pipeline_from_components(component_instances, ProblemTypes.BINARY)
    pipeline.fit(X, y)
    predictions = pipeline.predict(X)
    new_pipeline.fit(X, y)
    new_predictions = new_pipeline.predict(X)
    assert np.array_equal(predictions, new_predictions)
    assert np.array_equal(pipeline.feature_importance, new_pipeline.feature_importance)
    assert new_pipeline.name == 'Templated Pipeline'
    assert pipeline.parameters == new_pipeline.parameters
    for component, new_component in zip(pipeline._component_graph, new_pipeline._component_graph):
        assert isinstance(new_component, type(component))
    assert pipeline.describe() == new_pipeline.describe()
示例#28
0
def test_invalid_strategy_parameters():
    """Check that unsupported impute strategies raise ``ValueError`` on construction."""
    numeric_message = "Valid impute strategies are"
    categorical_message = "Valid categorical impute strategies are"

    with pytest.raises(ValueError, match=numeric_message):
        Imputer(numeric_impute_strategy="not a valid strategy")

    # "mean" is passed as the categorical strategy and must be rejected.
    with pytest.raises(ValueError, match=categorical_message):
        Imputer(categorical_impute_strategy="mean")