예제 #1
0
def test_ohe_features_to_encode():
    """Check that only columns listed in ``features_to_encode`` are expanded.

    A listed numeric column is expected to yield one output column per value,
    while a string column that is not listed keeps its original name.
    """
    frame = pd.DataFrame({
        "col_1": [2, 0, 1, 0, 0],
        "col_2": ['a', 'b', 'a', 'c', 'd'],
    })

    # Only col_1 is requested, so col_2 must come through under its own name.
    ohe = OneHotEncoder(top_n=5, features_to_encode=['col_1'])
    ohe.fit(frame)
    encoded = ohe.transform(frame).to_dataframe()
    assert set(encoded.columns) == {'col_1_0', 'col_1_1', 'col_1_2', 'col_2'}
    # NOTE(review): this asserts a non-empty list, which is always truthy, so
    # the dtypes are not actually verified. Confirm the intended dtype before
    # tightening it to ``all(...)``.
    assert [encoded[name].dtype == "uint8" for name in encoded]

    # Both columns requested: each one is expanded per distinct value.
    ohe = OneHotEncoder(top_n=5, features_to_encode=['col_1', 'col_2'])
    ohe.fit(frame)
    encoded = ohe.transform(frame).to_dataframe()
    assert set(encoded.columns) == {
        'col_1_0', 'col_1_1', 'col_1_2',
        'col_2_a', 'col_2_b', 'col_2_c', 'col_2_d',
    }
    # NOTE(review): same always-true list assertion as above.
    assert [encoded[name].dtype == "uint8" for name in encoded]
예제 #2
0
def test_get_component(example_graph):
    """Look up components by name before and after ``instantiate``.

    Before instantiation the lookups are compared against component classes;
    afterwards they are compared against instances built with the parameters
    passed to ``instantiate``. An unknown name must raise ``ValueError``.
    """
    cg = ComponentGraph(example_graph)

    # Un-instantiated graph: lookups are compared with the classes themselves.
    elastic_net_encoder = cg.get_component('OneHot_ElasticNet')
    assert elastic_net_encoder == OneHotEncoder
    final_estimator = cg.get_component('Logistic Regression')
    assert final_estimator == LogisticRegressionClassifier

    with pytest.raises(ValueError, match='not in the graph'):
        cg.get_component('Fake Component')

    overrides = {
        'OneHot_RandomForest': {'top_n': 3},
        'Random Forest': {'max_depth': 4, 'n_estimators': 50},
    }
    cg.instantiate(overrides)

    # Instantiated graph: lookups are compared with equivalent instances.
    assert cg.get_component('OneHot_ElasticNet') == OneHotEncoder()
    assert cg.get_component('OneHot_RandomForest') == OneHotEncoder(top_n=3)
    assert cg.get_component('Random Forest') == RandomForestClassifier(
        n_estimators=50, max_depth=4)
예제 #3
0
def test_categories():
    """Check the ``categories`` argument of the encoder.

    First verifies that explicitly supplied per-column category lists drive
    the output columns (the numeric ``col_4`` keeps its name), then verifies
    that combining ``categories`` with ``top_n`` raises ``ValueError``.
    """
    X = pd.DataFrame({
        "col_1": ["a", "b", "c", "d", "e", "f", "g"],
        "col_2": ["a", "c", "d", "b", "e", "e", "f"],
        "col_3": ["a", "a", "a", "a", "a", "a", "b"],
        "col_4": [2, 0, 1, 3, 0, 1, 2]
    })

    # One category list per string column, in column order.
    categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a", "b"]]

    # test categories value works
    encoder = OneHotEncoder(top_n=None, categories=categories, random_seed=2)
    encoder.fit(X)
    X_t = encoder.transform(X)

    expected_col_names = {
        "col_1_a", "col_1_b", "col_1_c", "col_1_d", "col_2_a", "col_2_b",
        "col_2_c", "col_3_a", "col_3_b", "col_4"
    }
    assert X_t.shape == (7, 10)
    assert set(X_t.columns) == expected_col_names

    # test categories with top_n errors; the constructor is expected to raise,
    # so its result is deliberately not bound to a name.
    with pytest.raises(
            ValueError,
            match="Cannot use categories and top_n arguments simultaneously"):
        OneHotEncoder(top_n=10, categories=categories, random_seed=2)
예제 #4
0
def test_ohe_column_names_unique():
    """Check that colliding ``<column>_<category>`` names are de-duplicated.

    Each scenario builds columns whose naive encoded names would clash and
    asserts the exact set of output column names.
    """
    # "A" + "x_y" and "A_x" + "y" both want the name "A_x_y".
    frame = pd.DataFrame({"A": ["x_y"], "A_x": ["y"]})
    encoded = OneHotEncoder().fit_transform(frame)
    assert set(encoded.columns) == {"A_x_y", "A_x_y_1"}

    frame = pd.DataFrame({
        "A": ["x_y", "z", "z"],
        "A_x": ["y", "a", "a"],
        "A_x_y": ["1", "y", "y"],
    })
    encoded = OneHotEncoder().fit_transform(frame)
    # A_x / "y" becomes A_x_y_1 since A_x_y is already taken;
    # A_x_y / "1" becomes A_x_y_1_1 since A_x_y_1 is already taken.
    assert set(encoded.columns) == {"A_x_y", "A_x_y_1", "A_x_y_1_1"}

    frame = pd.DataFrame({
        "A": ["x_y", "z", "a"],
        "A_x": ["y_1", "y", "b"],
        "A_x_y": ["1", "y", "c"],
    })
    encoded = OneHotEncoder().fit_transform(frame)
    # A_x / "y" becomes A_x_y_1 since A_x_y is already taken;
    # A_x / "y_1" becomes A_x_y_1_1 since A_x_y_1 is already taken;
    # A_x_y / "1" becomes A_x_y_1_2 since A_x_y_1_1 is already taken.
    expected_names = {
        "A_x_y", "A_z", "A_a",
        "A_x_y_1", "A_x_y_1_1", "A_x_b",
        "A_x_y_1_2", "A_x_y_y", "A_x_y_c",
    }
    assert set(encoded.columns) == expected_names
예제 #5
0
def test_iteration(example_graph):
    """Iterate over a component graph before and after ``instantiate``."""
    graph = ComponentGraph(example_graph)

    # Before instantiation the iteration is compared with component classes.
    expected_classes = [
        Imputer,
        OneHotEncoder,
        ElasticNetClassifier,
        OneHotEncoder,
        RandomForestClassifier,
        LogisticRegressionClassifier,
    ]
    visited = [node for node in graph]
    assert visited == expected_classes

    graph.instantiate({'OneHot_RandomForest': {'top_n': 32}})

    # After instantiation it is compared with equivalent component instances.
    expected_instances = [
        Imputer(),
        OneHotEncoder(),
        ElasticNetClassifier(),
        OneHotEncoder(top_n=32),
        RandomForestClassifier(),
        LogisticRegressionClassifier(),
    ]
    visited = [node for node in graph]
    assert visited == expected_instances
def test_null_values_in_dataframe():
    """Exercise the encoder's ``handle_missing`` modes on data with NaN."""
    # 'as_category': NaN is expected to get its own output column.
    frame = pd.DataFrame({
        'col_1': ["a", "b", "c", "d", np.nan],
        'col_2': ["a", "b", "a", "c", "b"],
        'col_3': ["a", "a", "a", "a", "a"],
    })
    ohe = OneHotEncoder(handle_missing='as_category')
    ohe.fit(frame)
    encoded = ohe.transform(frame)

    assert set(encoded.columns) == {
        "col_1_a", "col_1_b", "col_1_c", "col_1_d", "col_1_nan",
        "col_2_a", "col_2_b", "col_2_c", "col_3_a",
    }
    assert encoded.shape == (5, 9)

    # 'as_category' with top_n=2: NaN is not expected among the kept columns.
    frame = pd.DataFrame({
        'col_1': ["a", "a", "c", "c", np.nan],
        'col_2': ["a", "b", "a", "c", "b"],
        'col_3': ["a", "a", "a", "a", "a"],
        'col_4': [2, 0, 1, np.nan, 0],
    })
    ohe = OneHotEncoder(top_n=2, handle_missing='as_category')
    ohe.fit(frame)
    encoded = ohe.transform(frame)

    assert set(encoded.columns) == {
        "col_1_a", "col_1_c", "col_2_a", "col_2_b", "col_3_a", "col_4",
    }
    assert encoded.shape == (5, 6)

    # 'error': fitting on data that contains NaN must raise.
    ohe = OneHotEncoder(handle_missing='error')
    frame = pd.DataFrame({
        "col_1": [np.nan, "b", "c", "d", "e", "f", "g"],
        "col_2": ["a", "c", "d", "b", "e", "e", "f"],
        "col_3": ["a", "a", "a", "a", "a", "a", "b"],
        "col_4": [2, 0, 1, 3, 0, 1, 2],
    })
    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit(frame)

    # 'error': fit on clean data, then transforming data with NaN must raise.
    frame = pd.DataFrame({
        'col_1': ["a", "b", "c", "d", "d"],
        'col_2': ["a", "b", "a", "c", "b"],
        'col_3': ["a", "a", "a", "a", "a"],
    })
    ohe = OneHotEncoder(handle_missing='error')
    ohe.fit(frame)
    frame_with_nan = pd.DataFrame({
        'col_1': ["a", "b", "c", "d", "d"],
        'col_2': ["a", "b", np.nan, "c", "b"],
        'col_3': ["a", "a", "a", "a", "a"],
    })
    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.transform(frame_with_nan)
예제 #7
0
def test_no_top_n():
    """Check the encoded columns when ``top_n`` is None.

    Also checks that, with ``handle_unknown="error"``, transforming a value
    that was not seen during fit raises ``ValueError``.
    """
    frame = pd.DataFrame({
        "col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
        "col_2": ["a", "c", "d", "b", "e", "e", "f", "a", "b", "c", "d"],
        "col_3": ["a", "a", "a", "a", "a", "a", "b", "a", "a", "b", "b"],
        "col_4": [2, 0, 1, 3, 0, 1, 2, 0, 2, 1, 2],
    })
    # Every value of col_1 and col_2 is expected to get its own column.
    wanted = {"col_3_b", "col_4"}
    wanted.update("col_1_" + value for value in frame["col_1"])
    wanted.update("col_2_" + value for value in frame["col_2"])

    ohe = OneHotEncoder(top_n=None, handle_unknown="error", random_seed=2)
    ohe.fit(frame)
    encoded = ohe.transform(frame)

    produced = set(encoded.columns)
    assert encoded.shape == (11, 19)
    assert produced == wanted

    # "x" in col_1 was never seen during fit, so transform must raise.
    unseen = pd.DataFrame({
        "col_1": ["a", "b", "c", "x"],
        "col_2": ["a", "c", "d", "b"],
        "col_3": ["a", "a", "a", "a"],
        "col_4": [2, 0, 1, 3],
    })

    with pytest.raises(ValueError) as exec_info:
        ohe.transform(unseen)
    assert "Found unknown categories" in exec_info.value.args[0]
예제 #8
0
def test_more_top_n_unique_values_large():
    """Check which ``col_1`` categories survive when ``top_n`` is below the number of unique values.

    Every value in ``col_1`` occurs exactly once, so the expected ``top_n``
    selection is recomputed here by shuffling the value counts with the same
    seed and then stable-sorting them.
    NOTE(review): this presumably mirrors the encoder's own tie-breaking;
    confirm against the implementation before changing the order of any step.
    """
    X = pd.DataFrame({
        "col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
        "col_2": ["a", "a", "a", "b", "b", "c", "c", "d", "e"],
        "col_3": ["a", "a", "a", "b", "b", "b", "c", "c", "d"],
        "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1]
    })

    random_seed = 2

    encoder = OneHotEncoder(top_n=3, random_seed=random_seed)
    encoder.fit(X)
    X_t = encoder.transform(X)

    # The conversion changes the resulting dataframe dtype, which leads to a
    # different random state, so we need to make the same conversion here too.
    X = infer_feature_types(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    # Count occurrences per value, keeping NaN as its own entry.
    col_1_counts = X["col_1"].value_counts(dropna=False).to_frame()
    # Shuffle all rows reproducibly with the same seed given to the encoder.
    col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed)
    # mergesort is stable, so rows with equal counts keep their shuffled order.
    col_1_counts = col_1_counts.sort_values(["col_1"],
                                            ascending=False,
                                            kind='mergesort')
    # Keep the first top_n values after the shuffle + stable sort.
    col_1_samples = col_1_counts.head(
        encoder.parameters['top_n']).index.tolist()
    expected_col_names = set([
        "col_2_a", "col_2_b", "col_2_c", "col_3_a", "col_3_b", "col_3_c",
        "col_4"
    ])
    for val in col_1_samples:
        expected_col_names.add("col_1_" + val)

    col_names = set(X_t.columns)
    assert (col_names == expected_col_names)
def test_ohe_features_to_encode_col_missing():
    """Fitting must raise when ``features_to_encode`` names absent columns."""
    frame = pd.DataFrame({
        "col_1": [2, 0, 1, 0, 0],
        "col_2": ['a', 'b', 'a', 'c', 'd'],
    })
    # Neither col_3 nor col_4 exists in the frame.
    ohe = OneHotEncoder(top_n=5, features_to_encode=['col_3', 'col_4'])

    with pytest.raises(ValueError, match="Could not find and encode"):
        ohe.fit(frame)
예제 #10
0
def test_ohe_get_feature_names():
    """``get_feature_names`` must raise before fit and list the kept columns after."""
    frame = pd.DataFrame({
        'col_1': ['a'] * 10,
        'col_2': ['a'] * 3 + ['b'] * 3 + ['c'] * 2 + ['d'] * 2,
    })
    encoder = OneHotEncoder(top_n=2)

    not_fitted_msg = ('This OneHotEncoder is not fitted yet. '
                      'You must fit OneHotEncoder before calling get_feature_names.')
    with pytest.raises(ComponentNotYetFittedError, match=not_fitted_msg):
        encoder.get_feature_names()

    encoder.fit(frame)
    expected_names = np.array(['col_1_a', 'col_2_a', 'col_2_b'])
    np.testing.assert_array_equal(encoder.get_feature_names(), expected_names)
예제 #11
0
def test_ohe_preserves_custom_index(index):
    """The encoded frame must keep the input index and contain no missing values."""
    labels = [f"cat_{i}" for i in range(5)]
    frame = pd.DataFrame(
        {"categories": labels, "numbers": np.arange(5)},
        index=index,
    )
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(frame)
    pd.testing.assert_index_equal(encoded.index, frame.index)
    assert not encoded.isna().any(axis=None)
예제 #12
0
def test_ohe_features_to_encode_no_col_names():
    """Check ``features_to_encode`` with a frame that has default integer column labels."""
    frame = pd.DataFrame([["b", 0], ["a", 1], ["b", 1]])
    ohe = OneHotEncoder(top_n=5, features_to_encode=[0])
    ohe.fit(frame)
    encoded = ohe.transform(frame).to_dataframe()
    # Column 1 keeps its integer label; column 0 is expected to yield "0_a".
    assert set(encoded.columns) == {1, "0_a"}
    # NOTE(review): this asserts a non-empty list, which is always truthy, so
    # the dtypes are not actually verified here.
    assert [encoded[name].dtype == "uint8" for name in encoded]
예제 #13
0
def test_init():
    """A default-constructed encoder must report the expected parameters."""
    default_encoder = OneHotEncoder()
    assert default_encoder.parameters == {
        'top_n': 10,
        'features_to_encode': None,
        'categories': None,
        'drop': None,
        'handle_unknown': 'ignore',
        'handle_missing': 'error',
    }
예제 #14
0
def test_drop_parameter_is_array():
    """Check the output columns when ``drop`` is given as a list with one entry per column."""
    frame = pd.DataFrame({
        'col_1': ["a", "b", "b", "a", "b"],
        'col_2': ["a", "b", "a", "c", "b"],
        'col_3': ["a", "a", "a", "a", "a"],
    })
    ohe = OneHotEncoder(top_n=None, drop=["b", "c", "a"], handle_unknown='error')
    ohe.fit(frame)
    encoded = ohe.transform(frame)
    # "b" from col_1, "c" from col_2 and "a" from col_3 must be absent.
    assert set(encoded.columns) == {"col_1_a", "col_2_a", "col_2_b"}
예제 #15
0
def test_all_numerical_dtype():
    """An all-numeric frame must come back from the encoder equal to the input."""
    numeric_frame = pd.DataFrame({
        "col_1": [2, 0, 1, 0, 0],
        "col_2": [3, 2, 5, 1, 3],
        "col_3": [0, 0, 1, 3, 2],
        "col_4": [2, 4, 1, 4, 0],
    })

    ohe = OneHotEncoder(top_n=5)
    ohe.fit(numeric_frame)
    result = ohe.transform(numeric_frame)
    assert numeric_frame.equals(result)
예제 #16
0
def test_drop_binary():
    """Check the output columns produced with ``drop='if_binary'``."""
    frame = pd.DataFrame({
        'col_1': ["a", "b", "b", "a", "b"],
        'col_2': ["a", "b", "a", "c", "b"],
        'col_3': ["a", "a", "a", "a", "a"],
    })
    ohe = OneHotEncoder(top_n=None, drop='if_binary', handle_unknown='error')
    ohe.fit(frame)
    encoded = ohe.transform(frame)
    # Only the two-valued col_1 is expected to lose a column.
    assert set(encoded.columns) == {
        "col_1_b", "col_2_a", "col_2_b", "col_2_c", "col_3_a",
    }
예제 #17
0
def test_parameters():
    """An explicit ``top_n`` must be reflected in ``parameters`` alongside the defaults."""
    configured = OneHotEncoder(top_n=123)
    assert configured.parameters == {
        'top_n': 123,
        'features_to_encode': None,
        'categories': None,
        'drop': 'if_binary',
        'handle_unknown': 'ignore',
        'handle_missing': 'error',
    }
예제 #18
0
def test_invalid_inputs():
    """Check that invalid constructor and ``categories`` arguments raise ``ValueError``.

    Covers an unknown ``handle_missing`` value, an unknown ``handle_unknown``
    value, and two malformed ``categories`` arguments that are rejected at
    fit time.
    """
    # The constructors below are expected to raise, so their results are
    # deliberately not bound to a name.
    error_msg = "Invalid input {} for handle_missing".format("peanut butter")
    with pytest.raises(ValueError, match=error_msg):
        OneHotEncoder(handle_missing="peanut butter")

    error_msg = "Invalid input {} for handle_unknown".format("bananas")
    with pytest.raises(ValueError, match=error_msg):
        OneHotEncoder(handle_unknown="bananas")

    X = pd.DataFrame({'col_1': ["a", "b", "c", "d", "a"],
                      'col_2': ["a", "b", "a", "c", "b"],
                      'col_3': ["a", "a", "a", "a", "a"]})
    # Both malformed ``categories`` cases share the same error message.
    categories_msg = "Categories argument must contain a list of categories for each categorical feature"

    # Two category lists for three string columns.
    encoder = OneHotEncoder(top_n=None, categories=[["a", "b"], ["a", "c"]])
    with pytest.raises(ValueError, match=categories_msg):
        encoder.fit(X)

    # A flat list instead of a list of per-column lists.
    encoder = OneHotEncoder(top_n=None, categories=["a", "b", "c"])
    with pytest.raises(ValueError, match=categories_msg):
        encoder.fit(X)
예제 #19
0
def test_ohe_categories():
    """``categories`` must raise before fit, list kept values after, and reject unknown features."""
    frame = pd.DataFrame({
        'col_1': ['a'] * 10,
        'col_2': ['a'] * 3 + ['b'] * 3 + ['c'] * 2 + ['d'] * 2,
    })
    encoder = OneHotEncoder(top_n=2)

    not_fitted_msg = ('This OneHotEncoder is not fitted yet. '
                      'You must fit OneHotEncoder before calling categories.')
    with pytest.raises(ComponentNotYetFittedError, match=not_fitted_msg):
        encoder.categories('col_1')

    encoder.fit(frame)
    np.testing.assert_array_equal(encoder.categories('col_1'), np.array(['a']))
    np.testing.assert_array_equal(encoder.categories('col_2'), np.array(['a', 'b']))

    unknown_feature_msg = 'Feature "col_12345" was not provided to one-hot encoder as a training feature'
    with pytest.raises(ValueError, match=unknown_feature_msg):
        encoder.categories('col_12345')
예제 #20
0
def test_all_numerical_dtype():
    """An all-numeric frame must come back with the same values, compared as ``Int64``."""
    numeric_frame = pd.DataFrame({
        "col_1": [2, 0, 1, 0, 0],
        "col_2": [3, 2, 5, 1, 3],
        "col_3": [0, 0, 1, 3, 2],
        "col_4": [2, 4, 1, 4, 0],
    })
    # The comparison target uses the nullable integer dtype.
    expected_frame = numeric_frame.astype("Int64")

    ohe = OneHotEncoder(top_n=5)
    ohe.fit(numeric_frame)
    result = ohe.transform(numeric_frame)
    assert_frame_equal(expected_frame, result.to_dataframe())
예제 #21
0
def test_less_than_top_n_unique_values():
    """Columns with fewer than ``top_n`` unique values must encode every value."""
    frame = pd.DataFrame({
        "col_1": ["a", "b", "c", "d", "a"],
        "col_2": ["a", "b", "a", "c", "b"],
        "col_3": ["a", "a", "a", "a", "a"],
        "col_4": [2, 0, 1, 0, 0],
    })

    ohe = OneHotEncoder(top_n=5)
    ohe.fit(frame)
    encoded = ohe.transform(frame)
    # The numeric col_4 is expected to keep its original name.
    assert set(encoded.columns) == {
        "col_1_a", "col_1_b", "col_1_c", "col_1_d",
        "col_2_a", "col_2_b", "col_2_c",
        "col_3_a",
        "col_4",
    }
예제 #22
0
def test_large_number_of_categories():
    """Check ``top_n`` selection on a categorical column with very many categories.

    The column holds ``n_categories`` values that each occur
    ``frequency_per_category`` times, plus ``n_frequent`` extra values that
    each occur ``frequent_count`` times. With ``top_n=n_frequent`` only the
    extra, more frequent values are expected to be encoded.
    """
    n_categories = 200000
    frequency_per_category = 5
    n_frequent = 10
    frequent_count = 10

    rare = np.repeat(np.arange(n_categories), frequency_per_category).reshape((-1, 1))
    # The frequent values are numbered directly after the rare ones.
    frequent = np.repeat(np.arange(n_frequent) + n_categories, frequent_count).reshape((-1, 1))
    X = pd.DataFrame(np.concatenate([rare, frequent]), columns=['cat'])
    X['cat'] = X['cat'].astype('category')

    encoder = OneHotEncoder(top_n=n_frequent)
    encoder.fit(X)
    X_t = encoder.transform(X)

    # Derive the expectations from the setup values so they cannot drift apart.
    expected_col_names = {'cat_' + str(n_categories + i) for i in range(n_frequent)}
    expected_rows = n_categories * frequency_per_category + n_frequent * frequent_count
    assert X_t.shape == (expected_rows, n_frequent)
    assert set(X_t.columns) == expected_col_names
예제 #23
0
def test_drop_binary_and_top_n_2():
    """Check ``drop='if_binary'`` combined with ``top_n=2``.

    A column that originally had two values loses one output column, while a
    column that only ends up with two values keeps both.
    """
    frame = pd.DataFrame({
        'col_1': ["a", "b", "b", "a", "b"],
        'col_2': ["a", "b", "a", "c", "b"],
        'col_3': ["a", "a", "a", "a", "a"],
    })
    ohe = OneHotEncoder(top_n=2, drop='if_binary')
    ohe.fit(frame)
    encoded = ohe.transform(frame)
    assert set(encoded.columns) == {"col_1_a", "col_2_a", "col_2_b", "col_3_a"}
예제 #24
0
def test_data_types(data_type):
    """Check that the encoder accepts several input container types.

    Args:
        data_type: One of ``'list'``, ``'np'``, ``'pd_no_index'``,
            ``'pd_index'`` or ``'ww'``, selecting how the single-column input
            is built.

    Raises:
        ValueError: If ``data_type`` is not one of the supported names.
    """
    if data_type == 'list':
        X = [["a"], ["b"], ["c"]]
    elif data_type == 'np':
        X = np.array([["a"], ["b"], ["c"]])
    elif data_type == 'pd_no_index':
        X = pd.DataFrame(["a", "b", "c"])
    elif data_type == 'pd_index':
        X = pd.DataFrame(["a", "b", "c"], columns=['0'])
    elif data_type == 'ww':
        X = ww.DataTable(pd.DataFrame(["a", "b", "c"]))
    else:
        # Fail with a clear message instead of an UnboundLocalError on ``X``.
        raise ValueError(f"Unsupported data_type: {data_type!r}")
    encoder = OneHotEncoder()
    encoder.fit(X)
    X_t = encoder.transform(X).to_dataframe()
    assert list(X_t.columns) == ['0_a', '0_b', '0_c']
    # Three distinct values in three rows encode to the identity matrix.
    np.testing.assert_array_equal(X_t.to_numpy(), np.identity(3))
예제 #25
0
def test_ohe_output_bools():
    """Check the logical types of the encoder output.

    The ``integers`` column must be typed ``Integer``; every other output
    column must be typed ``Boolean``.
    """
    source = pd.DataFrame({
        "bool": [bool(i % 2) for i in range(100)],
        "categorical": ["dog"] * 20 + ["cat"] * 40 + ["fish"] * 40,
        "integers": list(range(100)),
    })
    X = ww.DataTable(source)
    y = ww.DataColumn(pd.Series([i % 2 for i in range(100)]))

    encoder = OneHotEncoder()
    output = encoder.fit_transform(X, y)

    for column_name, logical_type in output.types["Logical Type"].items():
        wanted = "Integer" if column_name == 'integers' else "Boolean"
        assert str(logical_type) == wanted
    assert len(output.columns) == 5
예제 #26
0
def test_ohe_woodwork_custom_overrides_returned_by_components(X_df):
    """Check that logical-type overrides on the input survive the encoder.

    For every override that the DataTable accepts, the transform output must
    be a ``ww.DataTable``; unless the override is ``Categorical``, column 0
    must still carry the overridden logical type.
    """
    target = pd.Series([1, 2, 1])
    candidates = [Integer, Double, Categorical, NaturalLanguage, Datetime, Boolean]

    for override in candidates:
        try:
            table = ww.DataTable(X_df, logical_types={0: override})
        except TypeError:
            # This override cannot be applied to the fixture data; skip it.
            continue

        encoder = OneHotEncoder()
        encoder.fit(table, target)
        result = encoder.transform(table, target)
        assert isinstance(result, ww.DataTable)
        if override != Categorical:
            assert result.logical_types == {0: override}
예제 #27
0
def test_handle_unknown():
    """With ``handle_unknown='error'``, unseen values must raise at transform time."""
    seen = pd.DataFrame({
        "col_1": ["a", "b", "c", "d", "e", "f", "g"],
        "col_2": ["a", "c", "d", "b", "e", "e", "f"],
        "col_3": ["a", "a", "a", "a", "a", "a", "b"],
        "col_4": [2, 0, 1, 3, 0, 1, 2],
    })

    ohe = OneHotEncoder(handle_unknown='error')
    ohe.fit(seen)
    # Transforming the training data itself must succeed.
    assert isinstance(ohe.transform(seen), pd.DataFrame)

    # Same data except that col_1 now starts with the unseen value "x".
    unseen = pd.DataFrame({
        "col_1": ["x", "b", "c", "d", "e", "f", "g"],
        "col_2": ["a", "c", "d", "b", "e", "e", "f"],
        "col_3": ["a", "a", "a", "a", "a", "a", "b"],
        "col_4": [2, 0, 1, 3, 0, 1, 2],
    })
    with pytest.raises(ValueError) as exec_info:
        ohe.transform(unseen)
    assert "Found unknown categories" in exec_info.value.args[0]
예제 #28
0
def test_categorical_dtype():
    """Columns of pandas ``category`` dtype must be encoded like string columns."""
    frame = pd.DataFrame({
        "col_1": ["f", "b", "c", "d", "e"],
        "col_2": ["a", "e", "d", "d", "e"],
        "col_3": ["a", "a", "a", "a", "a"],
        "col_4": [3, 3, 2, 2, 1],
    })
    # Mark the integer column as categorical so that it gets encoded too.
    frame["col_4"] = frame["col_4"].astype('category')

    ohe = OneHotEncoder(top_n=5)
    ohe.fit(frame)
    encoded = ohe.transform(frame)

    assert set(encoded.columns) == {
        "col_1_f", "col_1_b", "col_1_c", "col_1_d", "col_1_e",
        "col_2_d", "col_2_e", "col_2_a",
        "col_3_a",
        "col_4_1", "col_4_2", "col_4_3",
    }
    # NOTE(review): this asserts a non-empty list, which is always truthy, so
    # the dtypes are not actually verified here.
    assert [encoded[name].dtype == "uint8" for name in encoded]
예제 #29
0
 def check_df_equality(random_seed):
     """Assert that two ``fit_transform`` passes of one seeded encoder agree.

     ``df`` is not defined in this helper; it is taken from the surrounding
     scope.
     """
     encoder = OneHotEncoder(top_n=4, random_seed=random_seed)
     first_pass = encoder.fit_transform(df).to_dataframe()
     second_pass = encoder.fit_transform(df).to_dataframe()
     assert_frame_equal(first_pass, second_pass)
예제 #30
0
def test_numpy_input():
    """A numeric NumPy array must come back with the same values (dtype not compared)."""
    raw = np.array([[2, 0, 1, 0, 0], [3, 2, 5, 1, 3]])
    ohe = OneHotEncoder()
    ohe.fit(raw)
    result = ohe.transform(raw)
    assert_frame_equal(pd.DataFrame(raw), result.to_dataframe(), check_dtype=False)