def test_transform_can_be_applied_to_training_frame_with_special_flag():
    """With default params, transform(as_training=True) equals a plain transform;
    with LOO + blending + noise the two results must differ."""
    ds = load_dataset()
    te = H2OTargetEncoderEstimator()
    te.train(y=ds.target, training_frame=ds.train)
    transformed_as_training = te.transform(ds.train, as_training=True)
    transformed = te.transform(ds.train)
    assert pu.compare_frames(transformed, transformed_as_training, 0, tol_numeric=1e-5)

    # now with non default params
    te_nd = H2OTargetEncoderEstimator(data_leakage_handling="leave_one_out",
                                      blending=True,
                                      inflection_point=5,
                                      smoothing=17,
                                      seed=seed,
                                      noise=0.01)
    te_nd.train(y=ds.target, training_frame=ds.train)
    transformed_as_training = te_nd.transform(ds.train, as_training=True)
    transformed = te_nd.transform(ds.train)
    # frames are expected NOT to match: compare_frames must raise internally
    try:
        assert pu.compare_frames(transformed, transformed_as_training, 0, tol_numeric=1e-5)
        assert False, "should have raised"
    except AssertionError as ae:
        assert "should have raised" not in str(ae)
def test_deprecated_k_param_is_alias_for_inflection_point():
    """``k`` triggers a deprecation warning and behaves exactly like ``inflection_point``."""
    ds = load_dataset(incl_test=True)
    baseline = H2OTargetEncoderEstimator(noise=0)
    baseline.train(y=ds.target, training_frame=ds.train)
    encoded = baseline.predict(ds.test)

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        te_k = H2OTargetEncoderEstimator(noise=0, k=5, blending=True)
        assert len(w) == 1
        assert issubclass(w[0].category, H2ODeprecationWarning)
        assert "``k`` param of ``{}`` is deprecated".format(te_init_name) in str(w[0].message)
    te_k.train(y=ds.target, training_frame=ds.train)
    encoded_k = te_k.predict(ds.test)

    te_ip = H2OTargetEncoderEstimator(noise=0, inflection_point=5, blending=True)
    te_ip.train(y=ds.target, training_frame=ds.train)
    encoded_ip = te_ip.predict(ds.test)

    # blending makes the result differ from the no-blending baseline...
    try:
        pu.compare_frames(encoded_k, encoded, 0, tol_numeric=1e-5)
        assert False, "should have raised"
    except AssertionError as ae:
        assert "should have raised" not in str(ae)
    # ...but k=5 and inflection_point=5 must produce identical encodings
    assert pu.compare_frames(encoded_k, encoded_ip, 0, tol_numeric=1e-5)
def test_deprecated_noise_level_param_is_alias_for_noise():
    """``noise_level`` triggers a deprecation warning and behaves exactly like ``noise``."""
    ds = load_dataset(incl_test=True)
    baseline = H2OTargetEncoderEstimator()
    baseline.train(y=ds.target, training_frame=ds.train)
    encoded = baseline.predict(ds.test)

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        te_nl = H2OTargetEncoderEstimator(noise_level=0)
        assert len(w) == 1
        assert issubclass(w[0].category, H2ODeprecationWarning)
        assert "``noise_level`` param of ``{}`` is deprecated".format(te_init_name) in str(w[0].message)
    te_nl.train(y=ds.target, training_frame=ds.train)
    encoded_nl = te_nl.predict(ds.test)

    te_n = H2OTargetEncoderEstimator(noise=0)
    te_n.train(y=ds.target, training_frame=ds.train)
    encoded_n = te_n.predict(ds.test)

    # zero noise differs from the default (random) noise of the baseline...
    try:
        pu.compare_frames(encoded_nl, encoded, 0, tol_numeric=1e-5)
        assert False, "should have raised"
    except AssertionError as ae:
        assert "should have raised" not in str(ae)
    # ...but noise_level=0 and noise=0 must produce identical encodings
    assert pu.compare_frames(encoded_nl, encoded_n, 0, tol_numeric=1e-5)
def test_target_encoding_fit_method():
    """Exercise train/transform, fold-column handling, MOJO download and the x/y aliases."""
    print("Check fit method of the TargetEncoder class")
    targetColumnName = "survived"
    foldColumnName = "kfold_column"
    teColumns = ["home.dest", "cabin", "embarked"]
    trainingFrame = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    trainingFrame[foldColumnName] = trainingFrame.kfold_column(n_folds=5, seed=1234)

    # NOTE(review): k/f are the deprecated aliases of inflection_point/smoothing — kept on purpose here
    te = H2OTargetEncoderEstimator(k=0.7, f=0.3, data_leakage_handling="none")
    te.train(training_frame=trainingFrame, encoded_columns=teColumns, target_column=targetColumnName)
    print(te)
    transformed = te.transform(frame=trainingFrame)
    assert transformed is not None
    print(transformed.names)
    assert transformed.ncols == trainingFrame.ncols + len(teColumns)
    for te_col in teColumns:
        assert te_col + "_te" in transformed.names
    assert transformed.nrows == 1309

    # Test fold_column proper handling + kfold data leakage strategy defined
    te = H2OTargetEncoderEstimator(k=0.7, f=0.3)
    te.train(training_frame=trainingFrame, fold_column="pclass",
             target_column=targetColumnName, encoded_columns=teColumns)
    transformed = te.transform(trainingFrame, data_leakage_handling="kfold", seed=1234)
    te.train(training_frame=trainingFrame, fold_column="pclass",
             target_column=targetColumnName, encoded_columns=teColumns)
    assert transformed is not None
    assert transformed.nrows == 1309

    # Test MOJO download
    mojo_file = te.download_mojo(tempfile.mkdtemp())
    assert os.path.isfile(mojo_file)
    assert os.path.getsize(mojo_file) > 0

    # Argument check
    te.train(training_frame=trainingFrame, fold_column="pclass", y=targetColumnName, x=teColumns)
def test_default_strategy_is_none():
    """Omitting data_leakage_handling must be equivalent to the explicit "none" strategy."""
    ds = load_dataset(incl_test=True)
    te_default = H2OTargetEncoderEstimator(noise=0)
    te_default.train(y=ds.target, training_frame=ds.train)
    encoded_default = te_default.predict(ds.test)

    te_none = H2OTargetEncoderEstimator(data_leakage_handling="none", noise=0)
    te_none.train(y=ds.target, training_frame=ds.train)
    encoded_none = te_none.predict(ds.test)

    assert pu.compare_frames(encoded_default, encoded_none, 0, tol_numeric=1e-5)
def test_target_encoded_frame_does_not_contain_fold_column():
    """The fold column must not appear among the encoded (``*_te``) columns."""
    print("Check that attached TargetEncoderModel is being used during training and scoring")
    targetColumnName = "survived"
    foldColumnName = "kfold_column"
    teColumns = ["cabin", "embarked"]
    trainingFrame = h2o.import_file(pu.locate("smalldata/gbm_test/titanic.csv"), header=1)
    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    trainingFrame[foldColumnName] = trainingFrame.kfold_column(n_folds=5, seed=1234)

    te = H2OTargetEncoderEstimator(inflection_point=0.7, smoothing=0.3,
                                   data_leakage_handling="KFold",
                                   fold_column=foldColumnName,
                                   seed=1234)
    te.train(training_frame=trainingFrame, x=teColumns, y=targetColumnName)

    model_summary = te._model_json['output']['model_summary'].as_data_frame()
    encoded_column_names = model_summary['encoded_column_name']
    # Checking that we don't have empty entries in TwoDim table
    assert len(model_summary) == 2
    encoded_columns_with_te_suffix = model_summary[encoded_column_names.str.contains('_te')]
    assert len(encoded_columns_with_te_suffix) == 2

    transformed = te.transform(trainingFrame, as_training=True)
    # Checking that fold column is not being encoded.
    assert foldColumnName + "_te" not in transformed.col_names
def test_transform_can_override_blending_parameters():
    """Blending flags/params passed to transform must change the encoded result."""
    ds = load_dataset(incl_test=True)
    te = H2OTargetEncoderEstimator(noise=0)
    te.train(y=ds.target, training_frame=ds.train)

    def assert_frames_differ(left, right):
        # frames are expected NOT to match: compare_frames must raise internally
        try:
            assert pu.compare_frames(left, right, 0, tol_numeric=1e-5)
            assert False, "should have raised"
        except AssertionError as ae:
            assert "should have raised" not in str(ae)

    transformed = te.transform(ds.test)
    transformed_blending = te.transform(ds.test, blending=True)
    assert_frames_differ(transformed, transformed_blending)

    transformed_blending_custom = te.transform(ds.test, blending=True, inflection_point=3, smoothing=17)
    assert_frames_differ(transformed_blending_custom, transformed_blending)
def test_strategies_produce_same_results_when_applied_on_new_data():
    """On unseen test data all leakage strategies should encode (almost) identically."""
    ds = load_dataset(incl_test=True, incl_foldc=True)

    te_none = H2OTargetEncoderEstimator(noise=0)
    te_none.train(y=ds.target, training_frame=ds.train)
    encoded_none = te_none.transform(ds.test)

    te_loo = H2OTargetEncoderEstimator(data_leakage_handling="leave_one_out", noise=0)
    te_loo.train(y=ds.target, training_frame=ds.train)
    encoded_loo = te_loo.transform(ds.test)

    te_kfold = H2OTargetEncoderEstimator(data_leakage_handling="kfold", noise=0)
    te_kfold.train(y=ds.target, training_frame=ds.train, fold_column='foldc')
    encoded_kfold = te_kfold.transform(ds.test)

    for left, right in itertools.combinations([encoded_none, encoded_loo, encoded_kfold], 2):
        assert pu.compare_frames(left, right, 0, tol_numeric=1e-2)
def test_fold_column_is_not_encoded():
    """Whatever the leakage strategy, the fold column itself must never get a ``_te`` twin."""
    ds = load_dataset(incl_foldc=True)
    # same checks for each strategy, in the original order
    for strategy in ("none", "kfold", "leave_one_out"):
        te = H2OTargetEncoderEstimator(data_leakage_handling=strategy)
        te.train(y=ds.target, training_frame=ds.train, fold_column="foldc")
        encoded = te.predict(ds.train)
        assert "foldc" in encoded.names
        assert "foldc_te" not in encoded.names
def test_transform_produces_the_same_result_as_predict_by_default():
    """predict and plain transform must agree, for default and non-default params alike."""
    ds = load_dataset(incl_test=True)
    te = H2OTargetEncoderEstimator()
    te.train(y=ds.target, training_frame=ds.train)
    encoded = te.predict(ds.test)
    transformed = te.transform(ds.test)
    assert pu.compare_frames(encoded, transformed, 0, tol_numeric=1e-5)

    # now with non default params
    te_nd = H2OTargetEncoderEstimator(data_leakage_handling="leave_one_out",
                                      blending=True,
                                      inflection_point=5,
                                      smoothing=17,
                                      seed=seed,
                                      noise=0.01)
    te_nd.train(y=ds.target, training_frame=ds.train)
    encoded = te_nd.predict(ds.test)
    transformed = te_nd.transform(ds.test)
    assert pu.compare_frames(encoded, transformed, 0, tol_numeric=1e-5)
def test_encoding_fails_if_there_is_no_categorical_column_to_encode():
    """Training on numeric-only features must raise an H2OResponseError."""
    ds = load_dataset()
    numeric_cols = {n for n, t in ds.train.types.items() if t in ['int', 'real']}
    assert len(numeric_cols) > 0
    te = H2OTargetEncoderEstimator()
    try:
        te.train(x=numeric_cols, y=ds.target, training_frame=ds.train)
        assert False, "should have raised error"
    except H2OResponseError as e:
        assert "Training data must have at least 2 features (incl. response)" in str(e)
def test_strategies_produce_different_results_for_training():
    """When encoding the training frame itself, each leakage strategy must give a different result."""
    ds = load_dataset(incl_foldc=True)

    te_none = H2OTargetEncoderEstimator(noise=0)
    te_none.train(y=ds.target, training_frame=ds.train)
    encoded_none = te_none.transform(ds.train, as_training=True)

    te_loo = H2OTargetEncoderEstimator(data_leakage_handling="leave_one_out", noise=0)
    te_loo.train(y=ds.target, training_frame=ds.train)
    encoded_loo = te_loo.transform(ds.train, as_training=True)

    te_kfold = H2OTargetEncoderEstimator(data_leakage_handling="kfold", noise=0)
    te_kfold.train(y=ds.target, training_frame=ds.train, fold_column='foldc')
    encoded_kfold = te_kfold.transform(ds.train, as_training=True)

    for left, right in itertools.combinations([encoded_none, encoded_loo, encoded_kfold], 2):
        # every pair is expected NOT to match: compare_frames must raise internally
        try:
            assert pu.compare_frames(left, right, 0, tol_numeric=1e-2)
            assert False, "should have raised"
        except AssertionError as ae:
            assert "should have raised" not in str(ae)
def test_kfold_requires_fold_column():
    """KFold strategy without a fold column must fail; with one, training succeeds."""
    ds = load_dataset(incl_foldc=True)
    te = H2OTargetEncoderEstimator(data_leakage_handling="kfold")
    try:
        te.train(y=ds.target, training_frame=ds.train)
        assert False, "should have raised"
    except Exception as e:
        assert "Fold column is required when using KFold leakage handling strategy" in str(e)
    te.train(y=ds.target, training_frame=ds.train, fold_column="foldc")
    assert te.predict(ds.train) is not None
def test_columns_to_encode_can_be_listed_in_dedicated_param():
    """Only the columns passed to ``columns_to_encode`` get a ``_te`` counterpart."""
    ds = load_dataset(incl_test=True)
    categoricals = {n for n, t in ds.train.types.items() if t == 'enum'} - {ds.target}
    # pick an arbitrary (non-empty) subset of the categoricals
    to_encode = {c for i, c in enumerate(categoricals) if i % 2}
    assert len(to_encode) > 0

    te = H2OTargetEncoderEstimator(columns_to_encode=list(to_encode))
    te.train(y=ds.target, training_frame=ds.train)
    encoded = te.predict(ds.test)

    te_cols = [c for c in encoded.names if c.endswith("_te")]
    assert len(te_cols) == len(to_encode)
    assert {"{}_te".format(n) for n in to_encode} == set(te_cols)
def test_regression_with_kfold():
    """Regression target + kfold strategy: compare against golden values and golden frame."""
    ds = load_dataset(incl_foldc=True)
    te = H2OTargetEncoderEstimator(noise=0, data_leakage_handling="kfold")
    te.train(y=ds.target, training_frame=ds.train, fold_column="foldc")
    encoded = te.transform(ds.train, as_training=True)
    print(encoded)
    col_te_golden = [45.05575, 24.68343, 45.00326, 27.65044, 45.00326]
    col_te = encoded['sex_te'].head(5).as_data_frame().values.reshape(-1).tolist()
    assert_allclose(col_te, col_te_golden, atol=1e-5)
    # uncomment to regenerate the golden file:
    # with open("./golden/regression_kfold.csv", "w") as f:
    #     f.write(encoded.get_frame_data())
    golden = h2o.import_file("./golden/regression_kfold.csv")
    assert pu.compare_frames(golden, encoded, 0, tol_numeric=1e-5)
def test_original_features_can_be_automatically_removed_from_result_frame():
    """keep_original_categorical_columns=False drops the source columns from the result."""
    target = "survived"
    teColumns = ["cabin", "embarked"]
    trainingFrame = h2o.import_file(pu.locate("smalldata/gbm_test/titanic.csv"), header=1)
    trainingFrame[target] = trainingFrame[target].asfactor()

    te = H2OTargetEncoderEstimator(keep_original_categorical_columns=False)
    te.train(training_frame=trainingFrame, x=teColumns, y=target)
    transformed = te.transform(trainingFrame)

    for col in teColumns:
        assert "{}_te".format(col) in transformed.names
        assert col not in transformed.names
def test_original_features_are_kept_by_default():
    """By default the encoded frame keeps the original categorical columns."""
    target = "survived"
    teColumns = ["cabin", "embarked"]
    trainingFrame = h2o.import_file(pu.locate("smalldata/gbm_test/titanic.csv"), header=1)
    trainingFrame[target] = trainingFrame[target].asfactor()

    te = H2OTargetEncoderEstimator()
    te.train(training_frame=trainingFrame, x=teColumns, y=target)
    transformed = te.transform(trainingFrame)

    for col in teColumns:
        assert "{}_te".format(col) in transformed.names
        assert col in transformed.names
def test_loo_requires_target_to_encode_training_frame():
    """LOO transform(as_training=True) needs the response column; predict does not."""
    ds = load_dataset()
    te = H2OTargetEncoderEstimator(data_leakage_handling="leave_one_out")
    te.train(y=ds.target, training_frame=ds.train)

    train_no_target = h2o.assign(ds.train.drop(ds.target), "train_no_target")
    assert train_no_target is not None
    try:
        te.transform(train_no_target, as_training=True)
        assert False, "should have raised"
    except Exception as e:
        assert "LeaveOneOut strategy requires a response column" in str(e)
    # scoring (predict) works fine without the response
    assert te.predict(train_no_target) is not None
def test_regression_with_loo():
    """Regression target + leave-one-out strategy: compare against golden values and golden frame."""
    ds = load_dataset()
    te = H2OTargetEncoderEstimator(noise=0, data_leakage_handling="leave_one_out")
    te.train(y=ds.target, training_frame=ds.train)
    encoded = te.transform(ds.train, as_training=True)
    print(encoded)
    col_te_golden = [45.84229, 25.99816, 45.97086, 25.99816, 45.97086]
    col_te = encoded['sex_te'].head(5).as_data_frame().values.reshape(-1).tolist()
    assert_allclose(col_te, col_te_golden, atol=1e-5)
    # uncomment to regenerate the golden file:
    # with open("./golden/regression_loo.csv", "w") as f:
    #     f.write(encoded.get_frame_data())
    golden = h2o.import_file("./golden/regression_loo.csv")
    assert pu.compare_frames(golden, encoded, 0, tol_numeric=1e-5)
def test_te_model_does_nothing_if_there_is_no_categorical_column_to_encode():
    """With numeric-only features, transform is a no-op and predict returns a copy."""
    ds = load_dataset()
    numeric_cols = {n for n, t in ds.train.types.items() if t in ['int', 'real']}
    assert len(numeric_cols) > 0
    te = H2OTargetEncoderEstimator()
    te.train(x=numeric_cols, y=ds.target, training_frame=ds.train)

    transformed = te.transform(ds.train)
    assert transformed.names == ds.train.names
    assert transformed.key == ds.train.key  # transform returns the very same frame

    encoded = te.predict(ds.train)
    assert encoded.names == ds.train.names
    assert encoded.key != ds.train.key  # predict always produces a new frame
def columns_listed_in_columns_to_encode_should_not_be_ignored_in_x():
    """An interaction referencing a column excluded from ``x`` must make training fail.

    Fix: the original test had no ``assert False`` sentinel after ``train``, so it
    silently passed whenever training unexpectedly succeeded; every sibling negative
    test in this file uses the sentinel pattern.
    """
    ds = load_dataset(incl_test=True)
    categoricals = list({n for n, t in ds.train.types.items() if t == 'enum'} - {ds.target})
    assert len(categoricals) > 3
    ignored = categoricals[0]
    two_inter = [ignored, categoricals[1]]
    te = H2OTargetEncoderEstimator(columns_to_encode=[two_inter])
    x = list(set(ds.train.names) - {ignored})
    try:
        te.train(x=x, y=ds.target, training_frame=ds.train)
        assert False, "should have raised"
    except Exception as e:
        # make sure we caught the expected training error, not the sentinel
        assert "should have raised" not in str(e)
        assert "Column `{}` from interaction [{}] is not categorical or is missing from the training frame".format(
            ignored, ', '.join(two_inter)) in str(e)
def test_all_categoricals_are_encoded_by_default():
    """Without columns_to_encode, every categorical feature gets a ``_te`` column."""
    ds = load_dataset(incl_test=True)
    categoricals = {n for n, t in ds.train.types.items() if t == 'enum'} - {ds.target}
    assert len(categoricals) > 0
    te = H2OTargetEncoderEstimator()
    te.train(y=ds.target, training_frame=ds.train)
    encoded = te.predict(ds.test)
    expected_te_cols = {"{}_te".format(n) for n in categoricals}
    assert expected_te_cols < set(encoded.names), "some categoricals haven't been encoded"
    assert set(ds.train.names) < set(encoded.names), "some original columns have been removed from predictions"
def test_multinomial_with_loo():
    """Multinomial target + leave-one-out strategy: compare against golden values and golden frame."""
    ds = load_dataset()
    te = H2OTargetEncoderEstimator(noise=0, data_leakage_handling="leave_one_out")
    te.train(y=ds.target, training_frame=ds.train)
    encoded = te.transform(ds.train, as_training=True)
    print(encoded)
    col_te_golden = [0.22796, 0.20309, 0.22796, 0.20309, 0.22796]
    col_te = encoded['sex_Class_2_te'].head(5).as_data_frame().values.reshape(-1).tolist()
    assert_allclose(col_te, col_te_golden, atol=1e-5)
    # uncomment to regenerate the golden file:
    # with open("{}/golden/multinomial_loo.csv".format(here), "w") as f:
    #     f.write(encoded.get_frame_data())
    golden = h2o.import_file("{}/golden/multinomial_loo.csv".format(here))
    assert golden.names == encoded.names
    assert pu.compare_frames(golden, encoded, 0, tol_numeric=1e-5)
def test_multinomial_with_kfold():
    """Multinomial target + kfold strategy: compare against golden values and golden frame."""
    ds = load_dataset(incl_foldc=True)
    te = H2OTargetEncoderEstimator(noise=0, data_leakage_handling="kfold")
    te.train(y=ds.target, training_frame=ds.train, fold_column="foldc")
    encoded = te.transform(ds.train, as_training=True)
    print(encoded)
    col_te_golden = [0.22300, 0.20857, 0.23127, 0.19478, 0.23127]
    col_te = encoded['sex_Class_2_te'].head(5).as_data_frame().values.reshape(-1).tolist()
    assert_allclose(col_te, col_te_golden, atol=1e-5)
    # uncomment to regenerate the golden file:
    # with open("{}/golden/multinomial_kfold.csv".format(here), "w") as f:
    #     f.write(encoded.get_frame_data())
    golden = h2o.import_file("{}/golden/multinomial_kfold.csv".format(here))
    assert golden.names == encoded.names
    assert pu.compare_frames(golden, encoded, 0, tol_numeric=1e-5)
def test_binomial_with_kfold():
    """Binomial target + kfold strategy: compare against golden values and golden frame."""
    ds = load_dataset(incl_foldc=True)
    te = H2OTargetEncoderEstimator(noise=0, data_leakage_handling="kfold")
    te.train(y=ds.target, training_frame=ds.train, fold_column="foldc")
    encoded = te.transform(ds.train, as_training=True)
    print(encoded)
    col_te_golden = [0.714286, 0.178771, 0.729642, 0.208696, 0.729642]
    col_te = encoded['sex_te'].head(5).as_data_frame().values.reshape(-1).tolist()
    assert_allclose(col_te, col_te_golden, atol=1e-5)
    # uncomment to regenerate the golden file:
    # with open("./golden/binomial_kfold.csv", "w") as f:
    #     f.write(encoded.get_frame_data())
    golden = h2o.import_file("./golden/binomial_kfold.csv")
    assert pu.compare_frames(golden, encoded, 0, tol_numeric=1e-5)
def test_binomial_with_none():
    """Binomial target + "none" strategy: compare against golden values and golden frame."""
    ds = load_dataset()
    te = H2OTargetEncoderEstimator(noise=0, data_leakage_handling="none")
    te.train(y=ds.target, training_frame=ds.train)
    encoded = te.transform(ds.train, as_training=True)
    print(encoded)
    col_te_golden = [0.72747, 0.19099, 0.72747, 0.19099, 0.72747]
    col_te = encoded['sex_te'].head(5).as_data_frame().values.reshape(-1).tolist()
    assert_allclose(col_te, col_te_golden, atol=1e-5)
    # uncomment to regenerate the golden file:
    # with open("./golden/binomial_none.csv", "w") as f:
    #     f.write(encoded.get_frame_data())
    golden = h2o.import_file("./golden/binomial_none.csv")
    assert pu.compare_frames(golden, encoded, 0, tol_numeric=1e-5)
def test_regression_with_none():
    """Regression target + "none" strategy: compare against golden values and golden frame."""
    ds = load_dataset()
    te = H2OTargetEncoderEstimator(noise=0, data_leakage_handling="none")
    te.train(y=ds.target, training_frame=ds.train)
    encoded = te.transform(ds.train, as_training=True)
    print(encoded)
    col_te_golden = [46.19810, 26.14816, 46.19810, 26.14816, 46.19809]
    col_te = encoded['sex_te'].head(5).as_data_frame().values.reshape(-1).tolist()
    assert_allclose(col_te, col_te_golden, atol=1e-5)
    # uncomment to regenerate the golden file:
    # with open("{}/golden/regression_none.csv".format(here), "w") as f:
    #     f.write(encoded.get_frame_data())
    golden = h2o.import_file("{}/golden/regression_none.csv".format(here))
    assert golden.names == encoded.names
    assert pu.compare_frames(golden, encoded, 0, tol_numeric=1e-5)
def test_use_kfold_strategy_to_train_a_model_with_cv():
    """A CV model trained on kfold-encoded features should beat one trained on plain-encoded ones."""
    #XXX: TE KFold strategy allows TE to be trained only once in a context of model building with CV,
    # but it can't be applied just once on the training data,
    # otherwise this is what's happening when training CV1 for example (fold1 = cv_holdout, f2-n = cv_train):
    # column `cat_te` for cv_holdout is obtained using fold_1 so, only with information collected from folds_2-n, which is what we want.
    # column `cat_te` for cv_train however is obtained using fold_i, and each of those contains information about fold_1: this is a data leakage from cv_holdout into cv_train.
    # on top of this, current version of transform is using a global priorMean for NAs, creating an additional data leakage in CV context.
    # The priorMean issue can be fixed internally in the implementation of KFold strategy.
    # However, for proper CCV, we need a deep integration with CV logic in ModelBuilder (translate to Java of course..):
    # train TE using KFold strategy on the entire train set.
    # then during CV, for each fold:
    #    train_cv_i = te.transform(train_cv, fold=fold_i)  # so that train_cv_i is not encoded at all with encodings from other folds (they include info about current fold)
    #    test_cv_i = te.transform(test_cv, fold=fold_i)  # same
    # finally, the final model is trained with TE applied on the whole training frame:
    #    train = te.transform(train, as_training=True)  # still using the fold column, this ensures that the final feature is equivalent to the one used in all the test_cv_i
    # or
    #    train = te.transform(train)  # ignoring the fold column, this way the final te feature uses the entire train set.
    ds = load_dataset(incl_test=True, incl_foldc=True)
    te = H2OTargetEncoderEstimator(noise=0, data_leakage_handling="kfold")
    te.train(y=ds.target, training_frame=ds.train, fold_column="foldc")

    train_enc_cv = te.transform(ds.train, as_training=True)
    # drop the original (unencoded) columns, keeping only the *_te features
    cols_to_remove = [n[:-3] for n in train_enc_cv.names if n.endswith("_te")]
    train_enc_cv = h2o.assign(train_enc_cv.drop(cols_to_remove), "train_enc_cv")
    train_enc_no_cv = te.transform(ds.train)
    train_enc_no_cv = h2o.assign(train_enc_no_cv.drop(cols_to_remove), "train_enc_no_cv")
    test_enc = te.transform(ds.test)
    test_enc = h2o.assign(test_enc.drop(cols_to_remove), "test_enc")
    print(train_enc_cv)
    print(train_enc_no_cv)

    gbm = H2OGradientBoostingEstimator(seed=seed)
    gbm.train(y=ds.target, training_frame=train_enc_cv, fold_column="foldc")
    auc_with_ccv = gbm.model_performance(test_enc).auc()
    print("AUC with CCV : %s" % auc_with_ccv)

    gbm.train(y=ds.target, training_frame=train_enc_no_cv, fold_column="foldc")
    auc_no_ccv = gbm.model_performance(test_enc).auc()
    print("AUC without CCV : %s" % auc_no_ccv)

    assert auc_with_ccv > auc_no_ccv
def test_transform_seed_param_raise_warning():
    """Passing ``seed`` to transform warns about deprecation and is ignored."""
    ds = load_dataset(incl_test=True)
    te = H2OTargetEncoderEstimator(seed=42)
    te.train(y=ds.target, training_frame=ds.train)
    encoded = te.predict(ds.test)
    transformed_1 = te.transform(ds.test)
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        transformed_2 = te.transform(ds.test, seed=24)
        assert len(w) == 1
        assert issubclass(w[0].category, H2ODeprecationWarning)
        assert "`seed` is deprecated in `transform` method and will be ignored" in str(w[0].message)
    assert pu.compare_frames(encoded, transformed_1, 0, tol_numeric=1e-5)
    # the overriding seed was ignored, so the result is unchanged
    assert pu.compare_frames(encoded, transformed_2, 0, tol_numeric=1e-5)
def test_columns_groups_are_encoded_as_a_single_interaction():
    """A list entry in ``columns_to_encode`` yields one ``col1:col2:…_te`` interaction column."""
    ds = load_dataset(incl_test=True)
    categoricals = list({n for n, t in ds.train.types.items() if t == 'enum'} - {ds.target})
    assert len(categoricals) > 3
    no_inter = categoricals[0]
    two_inter = [categoricals[0], categoricals[1]]
    three_inter = [categoricals[0], categoricals[1], categoricals[2]]

    te = H2OTargetEncoderEstimator(columns_to_encode=[no_inter, two_inter, three_inter])
    te.train(y=ds.target, training_frame=ds.train)
    encoded = te.predict(ds.test)

    te_cols = [c for c in encoded.names if c.endswith("_te")]
    assert len(te_cols) == 3
    assert "{}_te".format(no_inter) in te_cols
    assert "{}:{}_te".format(*two_inter) in te_cols
    assert "{}:{}:{}_te".format(*three_inter) in te_cols