def test_zero_encoding_for_unseen_categories_if_errors_is_encode():
    """Unseen categories are encoded as 0 when ``errors="encode"``.

    The check is performed twice: once with the encoder's default encoding
    method (counts) and once with ``encoding_method="frequency"``.
    """
    train = pd.DataFrame(
        {"col1": ["a", "a", "b", "a", "c"], "col2": ["1", "2", "3", "1", "2"]}
    )
    # "d" in col1 and "4" in col2 never occur in the training frame.
    unseen = pd.DataFrame(
        {"col1": ["a", "d", "b", "a", "c"], "col2": ["1", "2", "3", "1", "4"]}
    )

    # --- count encoding ---
    expected_counts = pd.DataFrame(
        {"col1": [3, 0, 1, 3, 1], "col2": [2, 2, 1, 2, 0]}
    )
    counts = CountFrequencyEncoder(errors="encode").fit(train).transform(unseen)
    # the unseen categories must not surface as NaN ...
    assert pd.isnull(counts).sum().sum() == 0
    # ... and every value must match the expected count (0 for unseen ones)
    pd.testing.assert_frame_equal(counts, expected_counts)

    # --- frequency encoding ---
    expected_frequencies = pd.DataFrame(
        {"col1": [0.6, 0, 0.2, 0.6, 0.2], "col2": [0.4, 0.4, 0.2, 0.4, 0]}
    )
    frequencies = (
        CountFrequencyEncoder(encoding_method="frequency", errors="encode")
        .fit(train)
        .transform(unseen)
    )
    # again: no NaN, and the frequencies (0 for unseen ones) are as expected
    assert pd.isnull(frequencies).sum().sum() == 0
    pd.testing.assert_frame_equal(frequencies, expected_frequencies)
def test_error_if_input_df_contains_categories_not_present_in_fit_df(
    df_enc, df_enc_rare
):
    """Transforming data with unseen categories emits a ``UserWarning``.

    The encoder is created with its default arguments and fitted on
    ``df_enc``. ``df_enc_rare`` is the fixture that is expected to hold
    categories not present in the training dataset.
    """
    encoder = CountFrequencyEncoder()
    encoder.fit(df_enc)

    # Only transform() is placed inside the context: a warning emitted while
    # constructing or fitting the encoder must not be able to satisfy the
    # check that is meant for the transform step.
    with pytest.warns(UserWarning):
        encoder.transform(df_enc_rare)
def test_no_error_triggered_when_df_contains_unseen_categories_and_errors_is_encode(
    df_enc, df_enc_rare
):
    """No error and no warning is produced when ``errors="encode"``.

    The dataset to be transformed (``df_enc_rare``) is expected to contain
    categories that are not present in the training dataset (``df_enc``).
    """
    # The "error" filter turns any warning into an exception, so the test
    # fails if the encoder warns. The filter is installed *inside*
    # catch_warnings() so that the previous filter state is restored on exit
    # instead of being changed for whatever runs afterwards.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        encoder = CountFrequencyEncoder(errors="encode")
        encoder.fit(df_enc)
        encoder.transform(df_enc_rare)
def test_warning_when_df_contains_unseen_categories(df_enc, df_enc_rare):
    """With ``errors="ignore"`` unseen categories trigger exactly one warning.

    ``df_enc_rare`` is expected to contain categories that are not present
    in the training dataset ``df_enc``.
    """
    expected_message = (
        "During the encoding, NaN values were introduced in the feature(s) var_A."
    )

    encoder = CountFrequencyEncoder(errors="ignore")
    encoder.fit(df_enc)

    with pytest.warns(UserWarning) as caught:
        encoder.transform(df_enc_rare)

    # a single warning, carrying the expected text as its first argument
    assert len(caught) == 1
    assert caught[0].message.args[0] == expected_message
def test_error_when_df_contains_unseen_categories(df_enc, df_enc_rare):
    """With ``errors="raise"`` unseen categories raise a ``ValueError``.

    ``df_enc_rare`` is expected to contain categories that are not present
    in the training dataset ``df_enc``. The test additionally confirms that
    ``errors="encode"`` produces neither an error nor a warning on the same
    data.
    """
    expected_message = (
        "During the encoding, NaN values were introduced in the feature(s) var_A."
    )

    # errors="raise": transform must fail with the expected message
    raising_encoder = CountFrequencyEncoder(errors="raise")
    raising_encoder.fit(df_enc)
    with pytest.raises(ValueError) as exc_info:
        raising_encoder.transform(df_enc_rare)
    assert str(exc_info.value) == expected_message

    # errors="encode": with warnings escalated to exceptions, the whole
    # fit/transform cycle must still run through cleanly
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        zero_encoder = CountFrequencyEncoder(errors="encode")
        zero_encoder.fit(df_enc)
        zero_encoder.transform(df_enc_rare)
def test_nan_encoding_for_new_categories_if_errors_is_ignore():
    """Unseen categories are encoded as NaN when ``errors="ignore"``."""
    train = pd.DataFrame(
        {"col1": ["a", "a", "b", "a", "c"], "col2": ["1", "2", "3", "1", "2"]}
    )
    # "d" in col1 and "4" in col2 never occur in the training frame.
    unseen = pd.DataFrame(
        {"col1": ["a", "d", "b", "a", "c"], "col2": ["1", "2", "3", "1", "4"]}
    )
    expected = pd.DataFrame(
        {"col1": [3, nan, 1, 3, 1], "col2": [2, 2, 1, 2, nan]}
    )

    encoded = CountFrequencyEncoder(errors="ignore").fit(train).transform(unseen)

    # exactly two NaN values are introduced: one per unseen category
    assert pd.isnull(encoded).sum().sum() == 2
    # counts are correct for known categories, NaN for the new ones
    pd.testing.assert_frame_equal(encoded, expected)
def test_raises_non_fitted_error(df_enc):
    """Calling ``transform`` before ``fit`` raises ``NotFittedError``."""
    encoder = CountFrequencyEncoder()

    # The constructor is kept outside the context so that only the
    # transform() call is checked for the exception.
    with pytest.raises(NotFittedError):
        encoder.transform(df_enc)
def test_transform_raises_error_if_df_contains_na(df_enc, df_enc_na):
    """``transform`` raises ``ValueError`` when the input contains NA.

    The encoder is fitted on ``df_enc``; ``df_enc_na`` is the fixture that is
    expected to contain missing values.
    """
    encoder = CountFrequencyEncoder()
    encoder.fit(df_enc)

    # Construction and fit() stay outside the context: a ValueError raised
    # there would otherwise make this test pass for the wrong reason.
    with pytest.raises(ValueError):
        encoder.transform(df_enc_na)
def convertFromField36ToField40WithCountFrequencyEncoder(array):
    """Frequency-encode columns 44 to 48 of ``array`` in place.

    The column slice ``44:49`` is copied into a DataFrame, a
    ``CountFrequencyEncoder`` with ``encoding_method='frequency'`` is fitted
    on it, and the encoded values are written back into the same slice.

    Parameters
    ----------
    array
        Two-dimensional container indexable as ``array[:, 44:49]``
        (presumably a NumPy array - TODO confirm against callers). It is
        modified in place.
        NOTE(review): judging by the function name these five columns hold
        "Field36" to "Field40" - confirm.

    Returns
    -------
    None
    """
    field_columns = slice(44, 49)

    frame = pd.DataFrame(array[:, field_columns])
    frequency_encoder = CountFrequencyEncoder(encoding_method='frequency')
    frequency_encoder.fit(frame)

    # overwrite the original columns with their encoded counterpart
    array[:, field_columns] = frequency_encoder.transform(frame)