def test_target_encoder_fit_column_global_mean_linear_regression(self):
    """Check ``_fit_column`` on a numeric target with ``weight=1``.

    The global mean is preset on the encoder instead of being fitted.
    The expected encoding of each category is::

        (count * category_mean + weight * global_mean) / (count + weight)
    """
    weight = 1
    global_mean = 0.454545

    categories = [
        "positive", "positive", "negative", "neutral", "negative",
        "positive", "negative", "neutral", "neutral", "neutral",
        "positive",
    ]
    targets = [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]
    frame = pd.DataFrame({"variable": categories, "target": targets})

    encoder = TargetEncoder(weight=weight)
    encoder._global_mean = global_mean
    result = encoder._fit_column(X=frame["variable"], y=frame["target"])

    # (count, rounded mean target) per category, listed in the order of
    # the expected index.
    stats = {
        "negative": (3, -4.666667),
        "neutral": (4, 0.250000),
        "positive": (4, 4.500000),
    }
    smoothed = [
        (count * mean + weight * global_mean) / (count + weight)
        for count, mean in stats.values()
    ]
    wanted = pd.Series(
        data=smoothed,
        index=pd.Index(list(stats), name="variable"),
    )

    pd.testing.assert_series_equal(result, wanted)
def test_target_encoder_fit_column_binary_classification(self):
    """Check ``_fit_column`` on a 0/1 target with a default encoder.

    The expected values are the per-category target means rounded to six
    decimals (1/3, 2/4 and 2/3 for this fixture).
    NOTE(review): this presumably relies on the default ``weight``
    applying no smoothing towards the global mean; confirm against
    ``TargetEncoder``.
    """
    categories = [
        "positive", "positive", "negative", "neutral", "negative",
        "positive", "negative", "neutral", "neutral", "neutral",
    ]
    targets = [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]
    frame = pd.DataFrame({"variable": categories, "target": targets})

    encoder = TargetEncoder()
    encoder._global_mean = 0.5
    result = encoder._fit_column(X=frame["variable"], y=frame["target"])

    # Mapping keys become the index, in insertion order.
    wanted = pd.Series({
        "negative": 0.333333,
        "neutral": 0.50000,
        "positive": 0.666667,
    })
    wanted.index.name = "variable"

    pd.testing.assert_series_equal(result, wanted)
def test_target_encoder_fit_column_linear_regression(self):
    """Check ``_fit_column`` on a numeric target with a default encoder.

    The expected values are the per-category target means rounded to six
    decimals (-14/3, 1/4 and 18/4 for this fixture). A global mean is
    preset on the encoder before the call.
    NOTE(review): this presumably relies on the default ``weight``
    applying no smoothing towards the global mean; confirm against
    ``TargetEncoder``.
    """
    categories = [
        "positive", "positive", "negative", "neutral", "negative",
        "positive", "negative", "neutral", "neutral", "neutral",
        "positive",
    ]
    targets = [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]
    frame = pd.DataFrame({"variable": categories, "target": targets})

    encoder = TargetEncoder()
    encoder._global_mean = 0.454545
    result = encoder._fit_column(X=frame["variable"], y=frame["target"])

    labels = pd.Index(["negative", "neutral", "positive"], name="variable")
    wanted = pd.Series(data=[-4.666667, 0.250000, 4.500000], index=labels)

    pd.testing.assert_series_equal(result, wanted)
def test_target_encoder_fit_column_global_mean(self):
    """Check ``_fit_column`` on a 0/1 target with ``weight=1``.

    The global mean is computed from the fixture itself (5 positives out
    of 10 rows, i.e. 0.5) and preset on the encoder. The expected values
    equal ``(category_sum + 0.5) / (category_count + 1)``.
    """
    categories = [
        "positive", "positive", "negative", "neutral", "negative",
        "positive", "negative", "neutral", "neutral", "neutral",
    ]
    targets = [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]
    frame = pd.DataFrame({"variable": categories, "target": targets})

    target_total = frame["target"].sum()
    target_rows = frame["target"].count()

    encoder = TargetEncoder(weight=1)
    encoder._global_mean = target_total / target_rows  # is 0.5
    result = encoder._fit_column(X=frame["variable"], y=frame["target"])

    labels = pd.Index(["negative", "neutral", "positive"], name="variable")
    wanted = pd.Series(data=[0.375, 0.500, 0.625], index=labels)

    pd.testing.assert_series_equal(result, wanted)