def test_invariance_to_data_types(self):
    """PolynomialWrapper output must not depend on the input container type.

    The same six categorical rows are encoded twice with a fresh
    ``PolynomialWrapper(TargetEncoder())``: once from a NumPy array with an
    integer target and once from a pandas DataFrame with a string target.
    The two encoded results must be equal element-wise.
    """
    rows = [
        ['a', 'b', 'c'],
        ['a', 'b', 'c'],
        ['b', 'b', 'c'],
        ['b', 'b', 'b'],
        ['b', 'b', 'b'],
        ['a', 'b', 'a'],
    ]

    # NumPy input with integer labels.
    x = np.array(rows)
    y = [1, 2, 3, 3, 3, 3]
    numpy_wrapper = PolynomialWrapper(encoders.TargetEncoder())
    result = numpy_wrapper.fit_transform(x, y)
    th.verify_numeric(result)

    # pandas input with string labels.
    x = pd.DataFrame(rows, columns=['f1', 'f2', 'f3'])
    y = ['bee', 'cat', 'dog', 'dog', 'dog', 'dog']
    pandas_wrapper = PolynomialWrapper(encoders.TargetEncoder())
    result2 = pandas_wrapper.fit_transform(x, y)

    same_content = (result.values == result2.values).all()
    self.assertTrue(
        same_content,
        'The content should be the same regardless whether we pass Numpy or Pandas data type.',
    )
def test_np(self):
    """Fit each encoder named in ``encoders.__all__`` on the NumPy fixtures.

    Every encoder is built with default arguments, fitted on
    ``np_X`` / ``np_y`` and its transform of ``np_X_t`` is checked with
    ``th.verify_numeric``. Each encoder runs in its own sub-test.
    """
    for name in encoders.__all__:
        with self.subTest(encoder_name=name):
            encoder_cls = getattr(encoders, name)
            enc = encoder_cls()
            enc.fit(np_X, np_y)
            transformed = enc.transform(np_X_t)
            th.verify_numeric(transformed)
def test_binary(self):
    """GLMMEncoder with ``binomial_target=True`` must produce numeric output.

    The encoder is restricted to an explicit column list, fitted on
    ``X`` / ``np_y`` and then applied to the same ``X``.
    """
    cols = [
        'unique_str', 'underscore', 'extra', 'none', 'invariant', 321,
        'categorical', 'na_categorical', 'categorical_int',
    ]
    glmm = encoders.GLMMEncoder(cols=cols, binomial_target=True)
    glmm.fit(X, np_y)
    encoded = glmm.transform(X)
    th.verify_numeric(encoded)
def test_target_encoder(self):
    """TargetEncoder output is numeric both without and with the target passed to ``transform``."""
    target_encoder = encoders.TargetEncoder(verbose=1, smoothing=2, min_samples_leaf=2)
    target_encoder.fit(X, y)

    without_target = target_encoder.transform(X_t)
    th.verify_numeric(without_target)

    with_target = target_encoder.transform(X_t, y_t)
    th.verify_numeric(with_target)
def test_pandas_categorical(self):
    """OrdinalEncoder must encode an ordered pandas Categorical to the asserted codes.

    The 'Categorical' column holds ``b, b, e, a`` with declared category
    order ``e, a, b``; the test expects the encoded values 3, 3, 1, 2.
    """
    ordered_column = pd.Categorical(list('bbea'), categories=['e', 'a', 'b'], ordered=True)
    X = pd.DataFrame({
        'Str': ['a', 'c', 'c', 'd'],
        'Categorical': ordered_column,
    })

    out = encoders.OrdinalEncoder().fit_transform(X)
    th.verify_numeric(out)

    # Expected code for each row, in row order.
    for row, expected in enumerate([3, 3, 1, 2]):
        self.assertEqual(expected, out['Categorical'][row])
def test_impact_encoders(self):
    """Check the listed target-based encoders when ``transform`` receives a target.

    For every encoder name in the list, in its own sub-test:

    * fit on the NumPy fixtures and verify ``transform(np_X_t, np_y_t)`` is numeric;
    * fit on the ``X`` / ``y`` fixtures and verify ``transform(X_t, y_t)`` is numeric;
    * with ``handle_unknown='error'`` on column ``'extra'``, expect
      ``transform(X_t, y_t)`` to raise ``ValueError``.
    """
    impact_encoder_names = [
        'LeaveOneOutEncoder', 'TargetEncoder', 'WOEEncoder', 'MEstimateEncoder',
        'JamesSteinEncoder', 'CatBoostEncoder', 'GLMMEncoder',
    ]
    for encoder_name in impact_encoder_names:
        with self.subTest(encoder_name=encoder_name):
            encoder_cls = getattr(encoders, encoder_name)

            # encode a numpy array and transform with the help of the target
            enc = encoder_cls()
            enc.fit(np_X, np_y)
            th.verify_numeric(enc.transform(np_X_t, np_y_t))

            # same check with the X / y fixtures (described upstream as
            # "target is a DataFrame"; the fixtures are defined outside this test)
            enc = encoder_cls()
            enc.fit(X, y)
            th.verify_numeric(enc.transform(X_t, y_t))

            # when we run transform(X, y) and there is a new value in X,
            # something is wrong and we raise an error
            enc = encoder_cls(handle_unknown='error', cols=['extra'])
            enc.fit(X, y)
            # X_t and y_t must be passed as two separate arguments. Passing the
            # single tuple (X_t, y_t) would hand transform a tuple as its X and
            # would not exercise the handle_unknown='error' path.
            with self.assertRaises(ValueError):
                enc.transform(X_t, y_t)
def test_classification(self):
    """Run each encoder in ``encoders.__all__`` through several configurations.

    Per encoder (one sub-test each): explicit ``cols``, ``verbose=1`` and
    ``drop_invariant=True`` must all yield numeric output on ``X_t``;
    ``return_df=False`` must yield a NumPy array with as many rows as ``X_t``.
    """
    for name in encoders.__all__:
        with self.subTest(encoder_name=name):
            encoder_cls = getattr(encoders, name)
            cols = [
                'unique_str', 'underscore', 'extra', 'none', 'invariant', 321,
                'categorical', 'na_categorical', 'categorical_int',
            ]

            # Configurations that only need the "output is numeric" check.
            numeric_configs = (
                {'cols': cols},
                {'verbose': 1},
                {'drop_invariant': True},
            )
            for kwargs in numeric_configs:
                enc = encoder_cls(**kwargs)
                enc.fit(X, np_y)
                th.verify_numeric(enc.transform(X_t))

            # With return_df=False the result must be an ndarray of unchanged height.
            enc = encoder_cls(return_df=False)
            enc.fit(X, np_y)
            self.assertTrue(isinstance(enc.transform(X_t), np.ndarray))
            row_count = enc.transform(X_t).shape[0]
            self.assertEqual(row_count, X_t.shape[0], 'Row count must not change')
def test_is_numeric_numpy(self):
    """``verify_numeric`` must accept frames built from integer and float NumPy arrays."""
    # Whole numbers (regardless of the byte length) first, then floats;
    # none of them should raise AssertionError.
    for dtype in ('int32', 'int64', 'float32', 'float64'):
        X = np.ones([5, 5], dtype=dtype)
        verify_numeric(pd.DataFrame(X))
def test_is_numeric_pandas(self):
    """``verify_numeric`` must accept integer DataFrames and reject string DataFrames."""
    # Whole numbers, regardless of the byte length, should not raise AssertionError
    for int_dtype in ('int32', 'int64'):
        X = pd.DataFrame(np.ones([5, 5]), dtype=int_dtype)
        verify_numeric(pd.DataFrame(X))

    # Strings should raise AssertionError
    X = pd.DataFrame([['a', 'b', 'c'], ['d', 'e', 'f']])
    string_frame = pd.DataFrame(X)
    with self.assertRaises(Exception):
        verify_numeric(string_frame)
def test_woe(self):
    """Exercise WOEEncoder across its main configurations.

    Covered scenarios, in order: balanced label/feature data, explicit
    ``cols`` (transform with and without target, on test and training data,
    and via ``fit_transform``), default columns, seeded randomization,
    invariant target, empty ``cols``, missing target values, and
    ``handle_missing='return_nan'``.
    """
    cols = [
        'unique_str', 'underscore', 'extra', 'none', 'invariant', 321,
        'categorical', 'na_categorical', 'categorical_int',
    ]

    def assert_finite(frame):
        # The encoded columns may not contain NaN or +/- infinity.
        self.assertTrue(
            np.isfinite(frame[cols].values).all(),
            'There must not be any NaN, inf or -inf in the transformed columns',
        )

    def assert_same_shape(reference, transformed):
        # Encoding must keep both the number of columns and the number of rows.
        self.assertEqual(len(list(reference)), len(list(transformed)),
                         'The count of attributes must not change')
        self.assertEqual(len(reference), len(transformed),
                         'The count of rows must not change')

    def assert_unique_str_not_predictive(frame):
        self.assertTrue(
            frame['unique_str'].var() < 0.001,
            'The unique string column must not be predictive of the label',
        )

    # balanced label with balanced features
    X_balanced = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'], columns=['col1'])
    y_balanced = [True, False, True, False, True, False]
    enc = encoders.WOEEncoder()
    enc.fit(X_balanced, y_balanced)
    X1 = enc.transform(X_balanced)
    # Compare the absolute value: a one-sided "< 0.001" check would also
    # accept a large negative sum, which is not "sums to 0".
    self.assertTrue(
        all(X1.sum().abs() < 0.001),
        "When the class label is balanced, WoE should sum to 0 in each transformed column",
    )

    # explicit columns: transform of the test data without the target
    enc = encoders.WOEEncoder(cols=cols)
    enc.fit(X, np_y)
    X1 = enc.transform(X_t)
    th.verify_numeric(X1[cols])
    assert_finite(X1)
    assert_same_shape(X_t, X1)

    # explicit columns: transform of the test data with the target
    X2 = enc.transform(X_t, np_y_t)
    th.verify_numeric(X2)
    assert_finite(X2)
    assert_same_shape(X_t, X2)

    # explicit columns: transform of the training data with the target
    X3 = enc.transform(X, np_y)
    th.verify_numeric(X3)
    assert_finite(X3)
    assert_same_shape(X, X3)
    assert_unique_str_not_predictive(X3)

    # explicit columns: fit_transform on the training data
    X4 = enc.fit_transform(X, np_y)
    th.verify_numeric(X4)
    assert_finite(X4)
    assert_same_shape(X, X4)
    assert_unique_str_not_predictive(X4)

    # default column selection
    enc = encoders.WOEEncoder()
    enc.fit(X, np_y)
    X1 = enc.transform(X_t)
    assert_same_shape(X_t, X1)
    th.verify_numeric(X1)

    X2 = enc.transform(X_t, np_y_t)
    th.verify_numeric(X2)
    assert_same_shape(X_t, X2)

    # seed
    enc = encoders.WOEEncoder(cols=cols, random_state=2001, randomized=True)
    enc.fit(X, np_y)
    X1 = enc.transform(X_t, np_y_t)
    X2 = enc.transform(X_t, np_y_t)
    self.assertTrue(
        X1.equals(X2),
        "When the seed is given, the results must be identical",
    )
    th.verify_numeric(X1)
    th.verify_numeric(X2)

    # invariant target
    y_invariant = [True, True, True, True, True, True]
    enc = encoders.WOEEncoder()
    with self.assertRaises(ValueError):
        enc.fit(X_balanced, y_invariant)

    # branch coverage unit tests - no cols
    enc = encoders.WOEEncoder(cols=[])
    enc.fit(X, np_y)
    self.assertTrue(enc.transform(X_t).equals(X_t))

    # missing values in the target
    y_missing = [True, True, None, True, True, True]
    enc = encoders.WOEEncoder()
    with self.assertRaises(ValueError):
        enc.fit(X_balanced, y_missing)

    # impute missing
    enc = encoders.WOEEncoder(handle_missing='return_nan')
    enc.fit(X, np_y)
    X1 = enc.transform(X_t)
    th.verify_numeric(X1)
    self.assertTrue(X1.isnull().values.any())
    assert_same_shape(X_t, X1)

    X2 = enc.transform(X_t, np_y_t)
    th.verify_numeric(X2)
    # Check X2 itself; re-checking X1 here would leave the
    # transform-with-target result unverified.
    self.assertTrue(X2.isnull().values.any())
    assert_same_shape(X_t, X2)
def test_verify_raises_AssertionError_on_categories(self):
    """``verify_numeric`` must reject a DataFrame whose columns have the 'category' dtype."""
    # Categories should raise AssertionError
    raw_values = [['a', 'b', 'c'], ['d', 'e', 'f']]
    X = pd.DataFrame(raw_values, dtype='category')
    categorical_frame = pd.DataFrame(X)
    with self.assertRaises(Exception):
        verify_numeric(categorical_frame)
# Benchmark loop: for each run index, build the encoder, time fit + transform
# `benchmark_repeat` times and append one result row to `results`.
# NOTE(review): encoder_name, cols, num, benchmark_repeat, results,
# cpu_utilization and get_cpu_utilization are defined outside this view.
for index in range(num):
    # Row layout: encoder name, 1-based run index, shape of X, then the
    # four statistics appended below.
    rsl = [encoder_name, index + 1, X.shape]
    if encoder_name == 'HashingEncoder':
        # Only HashingEncoder receives max_process, which grows with the run index.
        enc = encoders.HashingEncoder(max_process=index + 1, cols=cols)
    else:
        enc = getattr(encoders, encoder_name)(cols=cols)
    t = []  # elapsed wall-clock time of each repeat
    c = []  # mean of the values drained from cpu_utilization for each repeat
    for _ in range(benchmark_repeat):
        start = time.time()
        # A separate process runs get_cpu_utilization while fit/transform
        # executes; presumably it feeds the cpu_utilization queue — TODO confirm.
        proc = multiprocessing.Process(target=get_cpu_utilization, args=())
        proc.start()
        enc.fit(X, np_y)
        th.verify_numeric(enc.transform(X_t))
        end = time.time()
        proc.terminate()
        proc.join()
        # Drain everything currently queued in cpu_utilization.
        cost = []
        while not cpu_utilization.empty():
            cost.append(cpu_utilization.get())
        t.append(end - start)
        # NOTE(review): if nothing was queued, np.mean([]) evaluates to nan.
        c.append(np.mean(cost))
    # Statistics appended in order: min time, mean time, max of c, mean of c.
    rsl.append(min(t))
    rsl.append(np.mean(t))
    rsl.append(max(c))
    rsl.append(np.mean(c))
    results.append(rsl)
    print(rsl)
def test_leave_one_out(self): enc = encoders.LeaveOneOutEncoder(verbose=1, sigma=0.1) enc.fit(X, y) th.verify_numeric(enc.transform(X_t)) th.verify_numeric(enc.transform(X_t, y_t))