예제 #1
0
def test_ohe_features_to_encode():
    """Only the columns named in ``features_to_encode`` get one-hot encoded.

    A column that is left out of ``features_to_encode`` must keep its
    original name in the output, whether it is numeric or holds strings.
    """
    X = pd.DataFrame({
        "col_1": [2, 0, 1, 0, 0],
        "col_2": ['a', 'b', 'a', 'c', 'd']
    })

    # Each scenario pairs a ``features_to_encode`` value with the column
    # names the transformed frame is expected to have.
    scenarios = (
        (['col_1'],
         {'col_1_0', 'col_1_1', 'col_1_2', 'col_2'}),
        (['col_1', 'col_2'],
         {'col_1_0', 'col_1_1', 'col_1_2',
          'col_2_a', 'col_2_b', 'col_2_c', 'col_2_d'}),
    )
    for to_encode, expected_col_names in scenarios:
        encoder = OneHotEncoder(top_n=5, features_to_encode=to_encode)
        encoder.fit(X)
        X_t = encoder.transform(X).to_dataframe()
        assert set(X_t.columns) == expected_col_names
        # NOTE(review): this asserts a non-empty list, which is always
        # truthy, so the dtypes are never actually checked. Tightening it
        # to ``all(...)`` needs confirmation of the dtypes that
        # ``to_dataframe()`` really returns for encoded and untouched columns.
        assert [X_t[col].dtype == "uint8" for col in X_t]
예제 #2
0
def test_no_top_n():
    """With ``top_n=None`` no category is cut off by a frequency limit.

    Every distinct value of ``col_1`` and ``col_2`` is expected to produce a
    column, ``col_3`` (two distinct values) is expected to produce only
    ``col_3_b``, and the numeric ``col_4`` passes through. Afterwards,
    transforming data with an unseen category must raise because the encoder
    was built with ``handle_unknown="error"``.
    """
    X = pd.DataFrame({
        "col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
        "col_2": ["a", "c", "d", "b", "e", "e", "f", "a", "b", "c", "d"],
        "col_3": ["a", "a", "a", "a", "a", "a", "b", "a", "a", "b", "b"],
        "col_4": [2, 0, 1, 3, 0, 1, 2, 0, 2, 1, 2]
    })
    expected_col_names = {"col_3_b", "col_4"}
    expected_col_names.update("col_1_" + val for val in X["col_1"])
    expected_col_names.update("col_2_" + val for val in X["col_2"])

    encoder = OneHotEncoder(top_n=None, handle_unknown="error", random_seed=2)
    encoder.fit(X)
    X_t = encoder.transform(X)

    assert X_t.shape == (11, 19)
    assert set(X_t.columns) == expected_col_names

    # "x" in col_1 was never seen during fit, so transform has to fail.
    X_new = pd.DataFrame({
        "col_1": ["a", "b", "c", "x"],
        "col_2": ["a", "c", "d", "b"],
        "col_3": ["a", "a", "a", "a"],
        "col_4": [2, 0, 1, 3]
    })
    with pytest.raises(ValueError) as exec_info:
        encoder.transform(X_new)
    error_message = exec_info.value.args[0]
    assert "Found unknown categories" in error_message
def test_null_values_in_dataframe():
    """Exercise the ``handle_missing`` modes on data that contains NaN.

    Covers four situations: NaN encoded as its own category, NaN left out
    when ``top_n`` is too small to include it, and ``handle_missing='error'``
    raising during ``fit`` as well as during ``transform``.
    """
    # 1) NaN becomes the "col_1_nan" column when treated as a category.
    frame = pd.DataFrame({'col_1': ["a", "b", "c", "d", np.nan],
                          'col_2': ["a", "b", "a", "c", "b"],
                          'col_3': ["a", "a", "a", "a", "a"]})
    ohe = OneHotEncoder(handle_missing='as_category')
    ohe.fit(frame)
    encoded = ohe.transform(frame)
    assert set(encoded.columns) == {
        "col_1_a", "col_1_b", "col_1_c", "col_1_d", "col_1_nan",
        "col_2_a", "col_2_b", "col_2_c", "col_3_a",
    }
    assert encoded.shape == (5, 9)

    # 2) With top_n=2 the single NaN in col_1 is outnumbered by "a" and "c",
    #    so no NaN column is expected; numeric col_4 is passed through.
    frame = pd.DataFrame({'col_1': ["a", "a", "c", "c", np.nan],
                          'col_2': ["a", "b", "a", "c", "b"],
                          'col_3': ["a", "a", "a", "a", "a"],
                          'col_4': [2, 0, 1, np.nan, 0]})
    ohe = OneHotEncoder(top_n=2, handle_missing='as_category')
    ohe.fit(frame)
    encoded = ohe.transform(frame)
    assert set(encoded.columns) == {
        "col_1_a", "col_1_c", "col_2_a", "col_2_b", "col_3_a", "col_4",
    }
    assert encoded.shape == (5, 6)

    # 3) handle_missing='error' rejects NaN at fit time.
    frame_with_nan = pd.DataFrame({
        "col_1": [np.nan, "b", "c", "d", "e", "f", "g"],
        "col_2": ["a", "c", "d", "b", "e", "e", "f"],
        "col_3": ["a", "a", "a", "a", "a", "a", "b"],
        "col_4": [2, 0, 1, 3, 0, 1, 2]})
    ohe = OneHotEncoder(handle_missing='error')
    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit(frame_with_nan)

    # 4) handle_missing='error' also rejects NaN that first shows up in the
    #    data passed to transform after a clean fit.
    clean = pd.DataFrame({'col_1': ["a", "b", "c", "d", "d"],
                          'col_2': ["a", "b", "a", "c", "b"],
                          'col_3': ["a", "a", "a", "a", "a"]})
    ohe = OneHotEncoder(handle_missing='error')
    ohe.fit(clean)
    X_missing = pd.DataFrame({'col_1': ["a", "b", "c", "d", "d"],
                              'col_2': ["a", "b", np.nan, "c", "b"],
                              'col_3': ["a", "a", "a", "a", "a"]})
    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.transform(X_missing)
예제 #4
0
def test_categories():
    """An explicit ``categories`` list decides which values are encoded.

    Also checks that combining ``categories`` with a ``top_n`` value is
    rejected when the encoder is constructed.
    """
    X = pd.DataFrame({
        "col_1": ["a", "b", "c", "d", "e", "f", "g"],
        "col_2": ["a", "c", "d", "b", "e", "e", "f"],
        "col_3": ["a", "a", "a", "a", "a", "a", "b"],
        "col_4": [2, 0, 1, 3, 0, 1, 2]
    })
    # One list of allowed values for each of col_1, col_2 and col_3.
    categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a", "b"]]

    encoder = OneHotEncoder(top_n=None, categories=categories, random_seed=2)
    encoder.fit(X)
    X_t = encoder.transform(X)

    expected_col_names = {
        "col_1_a", "col_1_b", "col_1_c", "col_1_d",
        "col_2_a", "col_2_b", "col_2_c",
        "col_3_a", "col_3_b",
        "col_4",
    }
    assert X_t.shape == (7, 10)
    assert set(X_t.columns) == expected_col_names

    # Passing both arguments must fail inside the constructor itself.
    conflict_message = "Cannot use categories and top_n arguments simultaneously"
    with pytest.raises(ValueError, match=conflict_message):
        OneHotEncoder(top_n=10, categories=categories, random_seed=2)
예제 #5
0
def test_more_top_n_unique_values_large():
    """Columns with more than ``top_n`` distinct values keep only ``top_n``.

    Every value of ``col_1`` occurs exactly once, so which three survive
    depends on the seeded shuffle. The test rebuilds that selection by hand
    (shuffle with the same seed, then a stable sort by count) and compares
    the resulting names with the encoder's output columns.
    """
    X = pd.DataFrame({
        "col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
        "col_2": ["a", "a", "a", "b", "b", "c", "c", "d", "e"],
        "col_3": ["a", "a", "a", "b", "b", "b", "c", "c", "d"],
        "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1]
    })
    random_seed = 2

    encoder = OneHotEncoder(top_n=3, random_seed=random_seed)
    encoder.fit(X)
    X_t = encoder.transform(X)

    # The type conversion changes the frame's dtype, which changes the
    # outcome of the seeded sampling, so the same conversion has to be
    # applied here before the expected selection is reproduced.
    X = infer_feature_types(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    value_counts = X["col_1"].value_counts(dropna=False).to_frame()
    shuffled = value_counts.sample(frac=1, random_state=random_seed)
    # mergesort is stable, so ties keep the order produced by the shuffle.
    ranked = shuffled.sort_values(["col_1"], ascending=False, kind='mergesort')
    top_n = encoder.parameters['top_n']
    col_1_samples = ranked.head(top_n).index.tolist()

    expected_col_names = {
        "col_2_a", "col_2_b", "col_2_c",
        "col_3_a", "col_3_b", "col_3_c",
        "col_4",
    }
    expected_col_names.update("col_1_" + val for val in col_1_samples)

    assert set(X_t.columns) == expected_col_names
def test_handle_unknown():
    """``handle_unknown='error'`` raises on categories not seen during fit.

    Transforming the training data itself must succeed and return a pandas
    DataFrame; replacing one ``col_1`` value with the unseen ``"x"`` must
    raise a ``ValueError`` that mentions unknown categories.
    """
    training = pd.DataFrame({"col_1": ["a", "b", "c", "d", "e", "f", "g"],
                             "col_2": ["a", "c", "d", "b", "e", "e", "f"],
                             "col_3": ["a", "a", "a", "a", "a", "a", "b"],
                             "col_4": [2, 0, 1, 3, 0, 1, 2]})

    encoder = OneHotEncoder(handle_unknown='error')
    encoder.fit(training)
    transformed = encoder.transform(training)
    assert isinstance(transformed, pd.DataFrame)

    # Same frame as above except that col_1 starts with the unseen "x".
    unseen = pd.DataFrame({"col_1": ["x", "b", "c", "d", "e", "f", "g"],
                           "col_2": ["a", "c", "d", "b", "e", "e", "f"],
                           "col_3": ["a", "a", "a", "a", "a", "a", "b"],
                           "col_4": [2, 0, 1, 3, 0, 1, 2]})
    with pytest.raises(ValueError) as exec_info:
        encoder.transform(unseen)
    error_message = exec_info.value.args[0]
    assert "Found unknown categories" in error_message
예제 #7
0
def test_ohe_features_to_encode_no_col_names():
    """``features_to_encode`` accepts integer labels of an unnamed frame.

    The frame is built without column names, so its columns are labelled
    ``0`` and ``1``. Only column ``0`` is encoded; column ``1`` must keep its
    integer label in the output.
    """
    rows = [["b", 0], ["a", 1], ["b", 1]]
    X = pd.DataFrame(rows)

    encoder = OneHotEncoder(top_n=5, features_to_encode=[0])
    encoder.fit(X)
    X_t = encoder.transform(X).to_dataframe()

    assert set(X_t.columns) == {1, "0_a"}
    # NOTE(review): this asserts a non-empty list, which is always truthy,
    # so the dtypes are never actually checked. Confirm the dtypes that
    # ``to_dataframe()`` returns before turning this into ``all(...)``.
    assert [X_t[col].dtype == "uint8" for col in X_t]
def test_drop_parameter_is_array():
    """``drop`` may be a list naming one category to drop per column.

    With ``drop=["b", "c", "a"]`` the expected output has no ``col_1_b``, no
    ``col_2_c`` and no ``col_3_a`` column.
    """
    X = pd.DataFrame({'col_1': ["a", "b", "b", "a", "b"],
                      'col_2': ["a", "b", "a", "c", "b"],
                      'col_3': ["a", "a", "a", "a", "a"]})
    dropped_per_column = ["b", "c", "a"]

    encoder = OneHotEncoder(top_n=None,
                            drop=dropped_per_column,
                            handle_unknown='error')
    encoder.fit(X)
    X_t = encoder.transform(X)

    remaining = {"col_1_a", "col_2_a", "col_2_b"}
    assert set(X_t.columns) == remaining
def test_all_numerical_dtype():
    """A frame made up only of numeric columns comes back unchanged."""
    numeric_frame = pd.DataFrame({"col_1": [2, 0, 1, 0, 0],
                                  "col_2": [3, 2, 5, 1, 3],
                                  "col_3": [0, 0, 1, 3, 2],
                                  "col_4": [2, 4, 1, 4, 0]})

    encoder = OneHotEncoder(top_n=5)
    encoder.fit(numeric_frame)
    transformed = encoder.transform(numeric_frame)

    # ``DataFrame.equals`` compares both values and dtypes.
    assert numeric_frame.equals(transformed)
예제 #10
0
def test_drop_binary():
    """``drop='if_binary'`` drops one category only for two-valued columns.

    ``col_1`` has exactly two values and is expected to keep only
    ``col_1_b``; ``col_2`` (three values) and ``col_3`` (one value) are
    expected to keep all of their categories.
    """
    X = pd.DataFrame({'col_1': ["a", "b", "b", "a", "b"],
                      'col_2': ["a", "b", "a", "c", "b"],
                      'col_3': ["a", "a", "a", "a", "a"]})

    encoder = OneHotEncoder(top_n=None, drop='if_binary', handle_unknown='error')
    encoder.fit(X)
    X_t = encoder.transform(X)

    expected_col_names = {"col_1_b", "col_2_a", "col_2_b", "col_2_c", "col_3_a"}
    assert set(X_t.columns) == expected_col_names
예제 #11
0
def test_all_numerical_dtype():
    """Numeric-only input keeps its values; the result is compared as Int64.

    The encoder output is converted with ``to_dataframe()`` and compared
    against the input cast to the nullable ``"Int64"`` dtype.
    """
    numeric_columns = {
        "col_1": [2, 0, 1, 0, 0],
        "col_2": [3, 2, 5, 1, 3],
        "col_3": [0, 0, 1, 3, 2],
        "col_4": [2, 4, 1, 4, 0]
    }
    X = pd.DataFrame(numeric_columns)
    X_expected = X.astype("Int64")

    encoder = OneHotEncoder(top_n=5)
    encoder.fit(X)
    actual = encoder.transform(X).to_dataframe()

    assert_frame_equal(X_expected, actual)
예제 #12
0
def test_less_than_top_n_unique_values():
    """Columns with fewer distinct values than ``top_n`` are fully encoded.

    Each string column has at most four distinct values while ``top_n`` is 5,
    so every value is expected to get its own column; the numeric ``col_4``
    is expected to pass through under its own name.
    """
    X = pd.DataFrame({"col_1": ["a", "b", "c", "d", "a"],
                      "col_2": ["a", "b", "a", "c", "b"],
                      "col_3": ["a", "a", "a", "a", "a"],
                      "col_4": [2, 0, 1, 0, 0]})

    encoder = OneHotEncoder(top_n=5)
    encoder.fit(X)
    X_t = encoder.transform(X)

    expected_col_names = {
        "col_1_a", "col_1_b", "col_1_c", "col_1_d",
        "col_2_a", "col_2_b", "col_2_c",
        "col_3_a",
        "col_4",
    }
    assert set(X_t.columns) == expected_col_names
예제 #13
0
def test_large_number_of_categories():
    """``top_n`` picks the most frequent categories of a very wide column.

    The single ``cat`` column holds 200000 categories that each occur 5
    times plus 10 further categories (200000..200009) that each occur 10
    times. With ``top_n=10`` only those 10 more frequent categories are
    expected to be encoded.
    """
    n_categories = 200000
    frequency_per_category = 5

    common = np.repeat(np.arange(n_categories), frequency_per_category)
    frequent = np.repeat(np.arange(10) + n_categories, 10)
    # Stack both groups into a single column vector.
    values = np.concatenate([common.reshape((-1, 1)),
                             frequent.reshape((-1, 1))])

    X = pd.DataFrame(values, columns=['cat'])
    X['cat'] = X['cat'].astype('category')

    encoder = OneHotEncoder(top_n=10)
    encoder.fit(X)
    X_t = encoder.transform(X)

    expected_col_names = {'cat_' + str(200000 + i) for i in range(10)}
    # 200000 * 5 + 10 * 10 = 1000100 rows.
    assert X_t.shape == (1000100, 10)
    assert expected_col_names == set(X_t.columns)
예제 #14
0
def test_drop_binary_and_top_n_2():
    """``drop='if_binary'`` looks at the original number of categories.

    ``col_1`` has two values in the data and is expected to lose one of its
    columns. ``col_2`` has three values and is only reduced to two by
    ``top_n=2``, so both of its remaining columns are expected to be kept.
    """
    X = pd.DataFrame({
        'col_1': ["a", "b", "b", "a", "b"],
        'col_2': ["a", "b", "a", "c", "b"],
        'col_3': ["a", "a", "a", "a", "a"]
    })

    encoder = OneHotEncoder(top_n=2, drop='if_binary')
    encoder.fit(X)
    X_t = encoder.transform(X)

    expected_col_names = {"col_1_a", "col_2_a", "col_2_b", "col_3_a"}
    assert set(X_t.columns) == expected_col_names
예제 #15
0
def test_data_types(data_type):
    """The encoder accepts several container types for the same data.

    Args:
        data_type: one of ``'list'``, ``'np'``, ``'pd_no_index'``,
            ``'pd_index'`` or ``'ww'``; selects how the three-row,
            single-column input is wrapped. Presumably supplied by a
            parametrize decorator or fixture outside this view.
    """
    letters = ["a", "b", "c"]
    if data_type == 'list':
        X = [[letter] for letter in letters]
    elif data_type == 'np':
        X = np.array([[letter] for letter in letters])
    elif data_type == 'pd_no_index':
        X = pd.DataFrame(letters)
    elif data_type == 'pd_index':
        X = pd.DataFrame(letters, columns=['0'])
    elif data_type == 'ww':
        X = ww.DataTable(pd.DataFrame(letters))

    encoder = OneHotEncoder()
    encoder.fit(X)
    X_t = encoder.transform(X).to_dataframe()

    # Three distinct values in three rows give a 3x3 identity matrix.
    assert list(X_t.columns) == ['0_a', '0_b', '0_c']
    np.testing.assert_array_equal(X_t.to_numpy(), np.eye(3))
예제 #16
0
def test_ohe_woodwork_custom_overrides_returned_by_components(X_df):
    """User-set logical types survive the encoder unless they get encoded.

    For each candidate logical type, column ``0`` of ``X_df`` is overridden
    with that type. The transform result must be a ``ww.DataTable`` and, for
    every type other than ``Categorical``, must still report the overridden
    logical type.

    Args:
        X_df: input frame with a column labelled ``0``; presumably a pytest
            fixture defined outside this view.
    """
    y = pd.Series([1, 2, 1])
    candidate_types = [
        Integer, Double, Categorical, NaturalLanguage, Datetime, Boolean
    ]
    for logical_type in candidate_types:
        # Skip overrides that woodwork rejects for this column's data.
        try:
            X = ww.DataTable(X_df, logical_types={0: logical_type})
        except TypeError:
            continue

        ohe = OneHotEncoder()
        ohe.fit(X, y)
        transformed = ohe.transform(X, y)
        assert isinstance(transformed, ww.DataTable)
        if logical_type == Categorical:
            continue
        assert transformed.logical_types == {0: logical_type}
예제 #17
0
def test_categorical_dtype():
    """Columns with pandas ``category`` dtype are encoded like string columns.

    ``col_4`` holds integers but is cast to ``category`` before fitting, so
    it is expected to be expanded into ``col_4_1``, ``col_4_2`` and
    ``col_4_3`` rather than passed through.
    """
    X = pd.DataFrame({"col_1": ["f", "b", "c", "d", "e"],
                      "col_2": ["a", "e", "d", "d", "e"],
                      "col_3": ["a", "a", "a", "a", "a"],
                      "col_4": [3, 3, 2, 2, 1]})
    X["col_4"] = X["col_4"].astype('category')

    encoder = OneHotEncoder(top_n=5)
    encoder.fit(X)
    X_t = encoder.transform(X)

    expected_col_names = {
        "col_1_f", "col_1_b", "col_1_c", "col_1_d", "col_1_e",
        "col_2_d", "col_2_e", "col_2_a",
        "col_3_a",
        "col_4_1", "col_4_2", "col_4_3",
    }
    assert set(X_t.columns) == expected_col_names
    # NOTE(review): this asserts a non-empty list, which is always truthy,
    # so the dtypes are never actually checked. Confirm the real output
    # dtypes before turning this into ``all(...)``.
    assert [X_t[col].dtype == "uint8" for col in X_t]
예제 #18
0
def test_numpy_input():
    """A numeric numpy array goes through the encoder with values unchanged.

    Dtypes are deliberately ignored in the comparison
    (``check_dtype=False``); only labels and values have to match.
    """
    X = np.array([[2, 0, 1, 0, 0], [3, 2, 5, 1, 3]])

    encoder = OneHotEncoder()
    encoder.fit(X)
    actual = encoder.transform(X).to_dataframe()

    expected = pd.DataFrame(X)
    assert_frame_equal(expected, actual, check_dtype=False)