Пример #1
0
def test_invalid_inputs():
    """Check that TargetEncoder rejects unsupported constructor arguments.

    Each case pairs the keyword arguments passed to the constructor with the
    message pattern expected on the resulting ``ValueError``.
    """
    rejected_cases = (
        ({'handle_unknown': 'test'},
         "Invalid input 'test' for handle_unknown"),
        ({'handle_missing': 'test2'},
         "Invalid input 'test2' for handle_missing"),
        ({'smoothing': 0},
         "Smoothing value needs to be strictly larger than 0"),
    )
    for kwargs, message in rejected_cases:
        with pytest.raises(ValueError, match=message):
            TargetEncoder(**kwargs)
Пример #2
0
def test_transform():
    """Fit and transform a small frame and compare against fixed encodings.

    The expected frame keeps the integer column ``col_1`` unchanged and holds
    floating-point values for the two string columns.
    """
    features = pd.DataFrame({
        'col_1': [1, 2, 1, 1, 2],
        'col_2': ["r", "t", "s", "t", "t"],
        'col_3': ["a", "a", "a", "b", "a"]
    })
    target = pd.Series([0, 1, 1, 1, 0])

    encoder = TargetEncoder()
    encoder.fit(features, target)
    encoded = encoder.transform(features)

    expected = pd.DataFrame({
        'col_1': [1, 2, 1, 1, 2],
        'col_2': [0.6, 0.65872, 0.6, 0.65872, 0.65872],
        'col_3': [0.504743, 0.504743, 0.504743, 0.6, 0.504743]
    })
    pd.testing.assert_frame_equal(encoded, expected)
Пример #3
0
def test_parameters():
    """Check ``parameters`` when ``cols`` is passed explicitly.

    The remaining entries are the values reported when only ``cols`` is given.
    """
    encoder = TargetEncoder(cols=['a'])
    assert encoder.parameters == dict(
        cols=['a'],
        smoothing=1.0,
        handle_unknown="value",
        handle_missing="value",
    )
Пример #4
0
def test_init():
    """Check the ``parameters`` reported by an encoder built with no arguments."""
    encoder = TargetEncoder()
    assert encoder.parameters == dict(
        cols=None,
        smoothing=1.0,
        handle_unknown="value",
        handle_missing="value",
    )
Пример #5
0
def test_target_encoder_woodwork_custom_overrides_returned_by_components(X_df):
    """Check which logical type column 0 carries after the encoder runs.

    For every logical type that can be applied to column 0 of ``X_df``, the
    transformed output must be a ``ww.DataTable``. A ``Categorical`` override
    is expected to come back as ``Double``; every other override is expected
    to be returned unchanged.
    """
    y = pd.Series([1, 2, 1])
    candidate_types = (
        Integer, Double, Categorical, NaturalLanguage, Boolean, Datetime
    )
    for logical_type in candidate_types:
        try:
            X = ww.DataTable(X_df, logical_types={0: logical_type})
        except TypeError:
            # This override cannot be applied to the fixture data; skip it.
            continue

        encoder = TargetEncoder()
        encoder.fit(X, y)
        transformed = encoder.transform(X, y)
        assert isinstance(transformed, ww.DataTable)

        expected_type = Double if logical_type == Categorical else logical_type
        assert transformed.logical_types == {0: expected_type}
Пример #6
0
def test_pandas_numpy(mock_fit, X_y_binary):
    """Check the frame handed to the mocked ``fit`` and that numpy input fits.

    NOTE(review): ``mock_fit`` is presumably injected by a patch decorator
    that is not visible in this snippet -- confirm what it replaces.
    """
    features, y = X_y_binary
    # sample(frac=1) shuffles the rows, so the index is no longer sorted.
    shuffled = pd.DataFrame(features).sample(frac=1)
    encoder = TargetEncoder()

    encoder.fit(shuffled, y)
    positional_args = mock_fit.call_args[0]
    assert_frame_equal(positional_args[0], shuffled)

    # Fitting on the raw numpy array only needs to complete without raising.
    encoder.fit(shuffled.to_numpy(), y)
Пример #7
0
def test_get_feature_names():
    """Check ``get_feature_names`` before and after fitting.

    Before ``fit`` the call must raise ``ComponentNotYetFittedError``; after
    ``fit`` it must return the input column names in order.
    """
    features = pd.DataFrame({
        'col_1': [1, 2, 1, 1, 2],
        'col_2': ["r", "t", "s", "t", "t"],
        'col_3': ["a", "a", "a", "b", "a"]
    })
    target = pd.Series([0, 1, 1, 1, 0])
    encoder = TargetEncoder()

    not_fitted_message = 'This TargetEncoder is not fitted yet. You must fit'
    with pytest.raises(ComponentNotYetFittedError, match=not_fitted_message):
        encoder.get_feature_names()

    encoder.fit(features, target)
    expected_names = np.array(['col_1', 'col_2', 'col_3'])
    np.testing.assert_array_equal(encoder.get_feature_names(), expected_names)
Пример #8
0
def test_pandas_numpy(mock_fit, X_y_binary):
    """Check the mocked ``fit`` receives the frame with a reset index.

    NOTE(review): ``mock_fit`` is presumably injected by a patch decorator
    that is not visible in this snippet -- confirm what it replaces.
    """
    features, y = X_y_binary
    # sample(frac=1) shuffles the rows, so the index is no longer sorted.
    shuffled = pd.DataFrame(features).sample(frac=1)
    # Same rows in the same shuffled order, but with a fresh 0..n-1 index.
    reindexed = shuffled.reset_index(drop=True)
    encoder = TargetEncoder()

    encoder.fit(shuffled, y)
    assert_frame_equal(mock_fit.call_args[0][0], reindexed)

    # Fitting on the raw numpy array only needs to complete without raising.
    encoder.fit(shuffled.to_numpy(), y)
Пример #9
0
def test_null_values_in_dataframe():
    """Exercise the three ``handle_missing`` modes on a column with a NaN.

    With ``'value'`` the missing entry is expected to get an encoding, with
    ``'return_nan'`` it is expected to stay NaN, and with ``'error'`` the
    call to ``fit`` must raise ``ValueError``.
    """
    X = pd.DataFrame({
        'col_1': ["a", "b", "c", "d", np.nan],
        'col_2': ["a", "b", "a", "c", "b"],
        'col_3': ["a", "a", "a", "a", "a"]
    })
    y = pd.Series([0, 1, 1, 1, 0])

    # col_2 and col_3 contain no nulls, so their expected encodings are the
    # same for both non-raising modes; only the last entry of col_1 differs.
    col_2_expected = [0.526894, 0.526894, 0.526894, 0.6, 0.526894]
    col_3_expected = [0.6, 0.6, 0.6, 0.6, 0.6]
    col_1_expected_by_mode = (
        ('value', [0.6, 0.6, 0.6, 0.6, 0.6]),
        ('return_nan', [0.6, 0.6, 0.6, 0.6, np.nan]),
    )
    for mode, col_1_expected in col_1_expected_by_mode:
        encoder = TargetEncoder(handle_missing=mode)
        encoder.fit(X, y)
        X_t = encoder.transform(X)
        X_expected = pd.DataFrame({
            'col_1': col_1_expected,
            'col_2': col_2_expected,
            'col_3': col_3_expected
        })
        assert_frame_equal(X_expected, X_t.to_dataframe())

    strict_encoder = TargetEncoder(handle_missing='error')
    with pytest.raises(ValueError,
                       match='Columns to be encoded can not contain null'):
        strict_encoder.fit(X, y)
Пример #10
0
def test_categories():
    """Check that reading ``categories`` on the encoder raises AttributeError."""
    encoder = TargetEncoder()
    missing_attribute = "'TargetEncoder' object has no attribute"
    with pytest.raises(AttributeError, match=missing_attribute):
        encoder.categories
Пример #11
0
def test_smoothing():
    """Check the encoded values for several ``smoothing`` settings.

    Larger smoothing values should bring the values closer to the global
    mean (0.6 for this target).
    """
    X = pd.DataFrame({
        'col_1': [1, 2, 1, 1, 2],
        'col_2': [2, 1, 1, 1, 1],
        'col_3': ["a", "a", "a", "a", "b"]
    })
    y = pd.Series([0, 1, 1, 1, 0])

    # (smoothing, expected encoding of category "a"); the single "b" row is
    # expected to be 0.6 in every case.
    cases = (
        (1, 0.742886),
        (10, 0.686166),
        (100, 0.676125),
    )
    for smoothing, encoded_a in cases:
        encoder = TargetEncoder(smoothing=smoothing)
        encoder.fit(X, y)
        X_t = encoder.transform(X)
        X_expected = pd.DataFrame({
            'col_1': pd.Series([1, 2, 1, 1, 2], dtype="Int64"),
            'col_2': pd.Series([2, 1, 1, 1, 1], dtype="Int64"),
            'col_3': [encoded_a, encoded_a, encoded_a, encoded_a, 0.6]
        })
        assert_frame_equal(X_expected, X_t.to_dataframe())
Пример #12
0
def test_cols():
    """Check that the ``cols`` parameter controls which columns get encoded.

    Three scenarios are covered:

    * ``cols=[]``: no column is encoded; the output only reflects the dtype
      conversion described by ``X_expected`` (``Int64`` / ``category``).
    * ``cols=['col_2']``: only ``col_2`` is replaced by encoded values.
    * ``cols=['col_2', 'col_3']``: the output matches what an encoder built
      with no arguments produces for this frame.
    """
    X = pd.DataFrame({
        'col_1': [1, 2, 1, 1, 2],
        'col_2': ['2', '1', '1', '1', '1'],
        'col_3': ["a", "a", "a", "a", "a"]
    })
    X_expected = X.astype({
        'col_1': 'Int64',
        'col_2': 'category',
        'col_3': 'category'
    })
    y = pd.Series([0, 1, 1, 1, 0])
    encoder = TargetEncoder(cols=[])
    encoder.fit(X, y)
    X_t = encoder.transform(X)
    assert_frame_equal(X_expected, X_t.to_dataframe())

    encoder = TargetEncoder(cols=['col_2'])
    encoder.fit(X, y)
    X_t = encoder.transform(X)
    X_expected = pd.DataFrame({
        'col_1':
        pd.Series([1, 2, 1, 1, 2], dtype="Int64"),
        'col_2': [0.60000, 0.742886, 0.742886, 0.742886, 0.742886],
        'col_3':
        pd.Series(["a", "a", "a", "a", "a"], dtype="category")
    })
    # `check_less_precise=True` is deprecated since pandas 1.1 and removed in
    # pandas 2.0; rtol/atol of 0.5e-3 is the tolerance pandas mapped it to.
    assert_frame_equal(X_expected, X_t.to_dataframe(),
                       rtol=0.5e-3, atol=0.5e-3)

    encoder = TargetEncoder(cols=['col_2', 'col_3'])
    encoder.fit(X, y)
    X_t = encoder.transform(X)
    encoder2 = TargetEncoder()
    encoder2.fit(X, y)
    X_t2 = encoder2.transform(X)
    assert_frame_equal(X_t.to_dataframe(), X_t2.to_dataframe())
Пример #13
0
def test_cols():
    """Check that the ``cols`` parameter controls which columns get encoded.

    Three scenarios are covered:

    * ``cols=[]``: the transformed frame must equal the input frame.
    * ``cols=['col_2']``: only ``col_2`` is replaced by encoded values.
    * ``cols=['col_2', 'col_3']``: the output matches what an encoder built
      with no arguments produces for this frame.
    """
    X = pd.DataFrame({
        'col_1': [1, 2, 1, 1, 2],
        'col_2': ['2', '1', '1', '1', '1'],
        'col_3': ["a", "a", "a", "a", "a"]
    })
    y = pd.Series([0, 1, 1, 1, 0])
    encoder = TargetEncoder(cols=[])
    encoder.fit(X, y)
    X_t = encoder.transform(X)
    pd.testing.assert_frame_equal(X, X_t)

    encoder = TargetEncoder(cols=['col_2'])
    encoder.fit(X, y)
    X_t = encoder.transform(X)
    X_expected = pd.DataFrame({
        'col_1': [1, 2, 1, 1, 2],
        'col_2': [0.60000, 0.742886, 0.742886, 0.742886, 0.742886],
        'col_3': ["a", "a", "a", "a", "a"]
    })
    # `check_less_precise=True` is deprecated since pandas 1.1 and removed in
    # pandas 2.0; rtol/atol of 0.5e-3 is the tolerance pandas mapped it to.
    pd.testing.assert_frame_equal(X_t, X_expected, rtol=0.5e-3, atol=0.5e-3)

    encoder = TargetEncoder(cols=['col_2', 'col_3'])
    encoder.fit(X, y)
    X_t = encoder.transform(X)
    encoder2 = TargetEncoder()
    encoder2.fit(X, y)
    X_t2 = encoder2.transform(X)
    pd.testing.assert_frame_equal(X_t, X_t2)