def test_transform_unseen(self, X, y, handle_unseen, expected):
    # Fit the encoder on the 'cat' column, then overwrite the first cell
    # with 'foo' and check the transformed output against `expected`
    # (compared to three decimals).
    # NOTE(review): 'foo' is presumably a category absent from the fitted
    # data -- confirm against the parametrized inputs.
    encoder = TargetEncoder(cols=['cat'], handle_unseen=handle_unseen)
    frame = pd.DataFrame(X, columns=['cat'])
    target = pd.Series(y)
    encoder.fit(frame, target)

    # The frame is modified only after fitting, so the encoder's learned
    # state comes from the original values.
    frame.iloc[0, 0] = 'foo'
    transformed = encoder.transform(frame)

    assert_array_almost_equal(
        transformed, pd.DataFrame(expected), decimal=3)
def target_encoding(cls, X, Y=None, encoder=None):
    """Target-encode the ``some_id`` and ``other_id`` columns of ``X``.

    If ``encoder`` is ``None``, a ``TargetEncoder`` is built for those two
    columns with ``min_samples=5`` and ``smoothing=5`` and fitted on
    ``(X, Y)``. Otherwise the supplied encoder is used without fitting.

    Returns a ``(frame, encoder)`` tuple: ``frame`` is the two original
    columns of ``X`` concatenated column-wise with the encoder's output,
    in which the encoded columns are renamed to ``tgt_enc_<column>``;
    ``encoder`` is the encoder that performed the transform.
    """
    id_columns = ['some_id', 'other_id']

    if encoder is None:
        encoder = TargetEncoder(cols=id_columns, min_samples=5, smoothing=5)
        encoder.fit(X, Y)
    # NOTE(review): a caller-supplied encoder is assumed to be fitted
    # already -- confirm with the call sites.

    transformed = encoder.transform(X)

    # Prefix the encoded columns so they do not clash with the raw id
    # columns once both are placed side by side.
    renamed = {}
    for name in id_columns:
        renamed[name] = 'tgt_enc_{}'.format(name)
    transformed = transformed.rename(columns=renamed)

    combined = pd.concat([X[id_columns], transformed], axis=1)
    return combined, encoder
def test_encode_nans(self, X, y, expected, columns):
    # fit_transform on the 'cat' column must match `expected` to three
    # decimals, and the mapping stored for 'cat' must be a DataFrame
    # whose index starts with -99999 followed by `columns`.
    # NOTE(review): -99999 is presumably the placeholder the encoder
    # substitutes for NaN -- confirm against TargetEncoder.
    encoder = TargetEncoder(cols=['cat'])
    frame = pd.DataFrame(X, columns=['cat'])
    encoded = encoder.fit_transform(frame, pd.Series(y))
    assert_array_almost_equal(encoded, pd.DataFrame(expected), decimal=3)

    # Check membership before looking the mapping up, so that a missing
    # key is reported by ok_ rather than by a KeyError.
    ok_('cat' in encoder._mapping)
    cat_mapping = encoder._mapping['cat']
    ok_(isinstance(cat_mapping, pd.DataFrame))

    mapping_index = cat_mapping.index
    eq_(mapping_index[0], -99999)
    assert_array_equal(mapping_index[1:], columns)
    assert_array_equal(cat_mapping.columns, ['mean', 'count', 'value'])
def test_encode_col(self, X, y, min_samples, smoothing, expected, imputed,
                    columns):
    # fit_transform on the 'cat' column with the given min_samples and
    # smoothing must match `expected` to three decimals; the encoder's
    # `_imputed` value and the stored mapping for 'cat' are checked too.
    encoder = TargetEncoder(
        cols=['cat'], min_samples=min_samples, smoothing=smoothing)
    frame = pd.DataFrame(X, columns=['cat'])
    encoded = encoder.fit_transform(frame, pd.Series(y))

    assert_array_almost_equal(encoded, pd.DataFrame(expected), decimal=3)
    eq_(encoder._imputed, imputed)

    # Check membership before looking the mapping up, so that a missing
    # key is reported by ok_ rather than by a KeyError.
    ok_('cat' in encoder._mapping)
    cat_mapping = encoder._mapping['cat']
    ok_(isinstance(cat_mapping, pd.DataFrame))
    assert_array_equal(cat_mapping.index, columns)
    assert_array_equal(cat_mapping.columns, ['mean', 'count', 'value'])
def test_encode_multiple_cols(self, X, y, expected):
    # fit_transform on two columns must match `expected` to two decimals,
    # and a DataFrame mapping with the expected index and columns must be
    # stored for each encoded column.
    names = ['cat1', 'cat2']
    encoder = TargetEncoder(cols=['cat1', 'cat2'])
    frame = pd.DataFrame(X, columns=['cat1', 'cat2'])
    encoded = encoder.fit_transform(frame, pd.Series(y))
    assert_array_almost_equal(encoded, pd.DataFrame(expected), decimal=2)

    # Each kind of check runs over both columns before the next kind
    # starts, so the order in which a failure would surface is fixed.
    for name in names:
        ok_(name in encoder._mapping)
    for name in names:
        ok_(isinstance(encoder._mapping[name], pd.DataFrame))

    expected_index = (('cat1', ['a', 'b']), ('cat2', ['bar', 'foo']))
    for name, categories in expected_index:
        assert_array_equal(encoder._mapping[name].index, categories)

    for name in names:
        assert_array_equal(
            encoder._mapping[name].columns, ['mean', 'count', 'value'])
def test_init(self, kwargs, cols, handle_unseen, min_samples, smoothing):
    # Build an encoder from `kwargs` and verify that the constructor
    # stored the expected public settings and left the fitted state
    # (`_imputed`, `_mapping`) empty.
    encoder = TargetEncoder(**kwargs)

    expected_settings = (
        ('cols', cols),
        ('handle_unseen', handle_unseen),
        ('min_samples', min_samples),
        ('smoothing', smoothing),
    )
    for attribute, value in expected_settings:
        eq_(getattr(encoder, attribute), value)

    eq_(encoder._imputed, None)
    eq_(encoder._mapping, {})
def test_transform_before_fit(self):
    # Calling transform on a freshly constructed encoder (never fitted)
    # must raise ValueError; the argument 1 is just a placeholder input.
    unfitted = TargetEncoder()
    assert_raises(ValueError, unfitted.transform, 1)
def test_transform_error(self, X, y, expected):
    # With handle_unseen='error', transforming a frame whose first cell
    # was overwritten with 'foo' after fitting must raise ValueError.
    # `expected` is accepted but not used by this test.
    encoder = TargetEncoder(cols=['cat'], handle_unseen='error')
    frame = pd.DataFrame(X, columns=['cat'])
    target = pd.Series(y)
    encoder.fit(frame, target)

    # The frame is modified only after fitting.
    frame.iloc[0, 0] = 'foo'
    assert_raises(ValueError, encoder.transform, frame)