def test_targetencoder_multi_column():
    """
    Encode the column pair ('cat_1', 'cat_2') jointly.

    Both usage patterns are exercised -- `fit_transform` followed by
    `transform`, and `fit` followed by `transform` -- and each must
    reproduce the same expected train/test encodings.
    """
    feature_cols = ['cat_1', 'cat_2']
    train = cudf.DataFrame({
        'cat_1': ['a', 'b', 'b', 'a', 'a', 'b'],
        'cat_2': [1, 1, 2, 2, 1, 2],
        'label': [1, 0, 1, 1, 0, 1]
    })
    test = cudf.DataFrame({
        'cat_1': ['b', 'b', 'a', 'b'],
        'cat_2': [1, 2, 1, 2]
    })
    expected_train = np.array([2. / 3, 2. / 3, 1., 2. / 3, 2. / 3, 1.])
    expected_test = np.array([0., 1., 0.5, 1.])

    # Pattern 1: fit_transform on train, transform on test.
    enc = TargetEncoder()
    got_train = enc.fit_transform(train[feature_cols], train.label)
    got_test = enc.transform(test[feature_cols])
    assert array_equal(got_train, expected_train)
    assert array_equal(got_test, expected_test)

    # Pattern 2: fit on train, then transform train and test separately.
    enc = TargetEncoder()
    enc.fit(train[feature_cols], train.label)
    got_train = enc.transform(train[feature_cols])
    got_test = enc.transform(test[feature_cols])
    assert array_equal(got_train, expected_train)
    assert array_equal(got_test, expected_test)
def test_transform_with_index():
    """
    Transform data whose frame carries a non-default index ([9, 4, 5, 3]).

    The same expected encoding must come back whether the input is passed
    as a single column (`df.a`) or as a one-column frame (`df[["a"]]`).
    """
    frame = cudf.DataFrame(
        {"a": [1, 1, 2, 3], "b": [True, False, False, True]},
        index=[9, 4, 5, 3],
    )
    expected = cp.asarray([0, 1, 0.5, 0.5])

    enc = TargetEncoder()
    enc.fit(frame.a, y=frame.b)

    # Series input.
    from_series = enc.transform(frame.a)
    assert array_equal(from_series, expected)

    # Single-column DataFrame input.
    from_frame = enc.transform(frame[["a"]])
    assert array_equal(from_frame, expected)
def test_targetencoder_transform():
    """
    Check `transform` on a test column whose categories all occur in train.

    The expected test encoding must be identical after `fit_transform`
    and after plain `fit`.
    """
    train = cudf.DataFrame({
        'category': ['a', 'b', 'b', 'a'],
        'label': [1, 0, 1, 1]
    })
    test = cudf.DataFrame({'category': ['b', 'b', 'a', 'b']})
    expected = np.array([0.5, 0.5, 1., 0.5])

    # Encoder prepared through fit_transform (its output is not checked here).
    enc = TargetEncoder()
    enc.fit_transform(train.category, train.label)
    got = enc.transform(test.category)
    assert array_equal(got, expected)

    # Encoder prepared through fit only.
    enc = TargetEncoder()
    enc.fit(train.category, train.label)
    got = enc.transform(test.category)
    assert array_equal(got, expected)
def test_targetencoder_newly_encountered():
    """
    The test column holds 'c' and 'd', which never appear in train.

    Both unseen values are expected to encode to 0.75 (numerically the
    mean of the training labels used here), while 'b' and 'a' keep their
    own encodings.
    """
    train = cudf.DataFrame({
        'category': ['a', 'b', 'b', 'a'],
        'label': [1, 0, 1, 1]
    })
    test = cudf.DataFrame({'category': ['c', 'b', 'a', 'd']})
    expected = np.array([0.75, 0.5, 1., 0.75])

    # Encoder prepared through fit_transform.
    enc = TargetEncoder()
    enc.fit_transform(train.category, train.label)
    got = enc.transform(test.category)
    assert array_equal(got, expected)

    # Encoder prepared through fit only.
    enc = TargetEncoder()
    enc.fit(train.category, train.label)
    got = enc.transform(test.category)
    assert array_equal(got, expected)
def test_targetencoder_cupy():
    """
    Feed cupy arrays to the encoder; 3 and 4 in the test array are values
    that never occur in the training array.

    Besides the encoded values, the result type is checked to be a
    `cp.ndarray`.
    """
    train_x = cp.array([1, 2, 2, 1])
    train_y = cp.array([1, 0, 1, 1])
    test_x = cp.array([1, 2, 3, 4])
    expected = np.array([1., 0.5, 0.75, 0.75])

    enc = TargetEncoder()
    enc.fit_transform(train_x, train_y)
    encoded = enc.transform(test_x)

    assert array_equal(encoded, expected)
    print(type(encoded))
    assert isinstance(encoded, cp.ndarray)
def test_one_category():
    """
    Train on a column that contains the single category 'a'.

    Checks the `fit_transform` output on the training rows, then the
    `transform` output on a test column of 'c', 'b', 'a', 'd', for which
    every row is expected to encode to 1.5.
    """
    train = cudf.DataFrame({
        'category': ['a', 'a', 'a', 'a'],
        'label': [3, 0, 0, 3]
    })
    test = cudf.DataFrame({'category': ['c', 'b', 'a', 'd']})

    enc = TargetEncoder()

    expected_train = np.array([1., 2., 2., 1.])
    got_train = enc.fit_transform(train.category, train.label)
    assert array_equal(got_train, expected_train)

    expected_test = np.array([1.5, 1.5, 1.5, 1.5])
    got_test = enc.transform(test.category)
    assert array_equal(got_test, expected_test)
def test_targetencoder_var():
    """
    Run the encoder with `stat='var'`.

    The training encoding must match the expected values both via
    `fit_transform` and via `fit` followed by `transform`.
    """
    train = cudf.DataFrame({
        'category': ['a', 'b', 'b', 'b'],
        'label': [1, 0, 1, 1]
    })
    expected = np.array([.25, 0., .5, .5])

    # fit_transform in one step.
    enc = TargetEncoder(stat='var')
    got = enc.fit_transform(train.category, train.label)
    assert array_equal(got, expected)

    # fit, then transform the training column.
    enc = TargetEncoder(stat='var')
    enc.fit(train.category, train.label)
    got = enc.transform(train.category)
    assert array_equal(got, expected)
def test_targetencoder_random(n_samples, dtype):
    """
    Compare the encoder against a groupby-mean reference on random data.

    `n_samples` is the length of each random array and `dtype` the type
    they are cast to. The reference maps every test value to the mean
    label of its training group; test values with no matching training
    group are filled with the overall mean of `y`.
    """
    # Draw order (x, y, xt) is kept so the random stream is consumed
    # in the same sequence.
    x = cp.random.randint(0, 1000, n_samples).astype(dtype)
    y = cp.random.randint(0, 2, n_samples).astype(dtype)
    xt = cp.random.randint(0, 1000, n_samples).astype(dtype)

    enc = TargetEncoder()
    enc.fit_transform(x, y)
    encoded = enc.transform(xt)

    # Reference: per-category mean of y over the training data.
    group_means = (
        cudf.DataFrame({'x': x, 'y': y})
        .groupby('x', as_index=False)
        .agg({'y': 'mean'})
    )

    # row_id is attached before the merge and used afterwards to put the
    # rows back into their original positions.
    reference = cudf.DataFrame({'x': xt})
    reference['row_id'] = cp.arange(len(reference))
    reference = reference.merge(group_means, on='x', how='left')
    reference = reference.sort_values('row_id')

    overall_mean = cp.mean(y).item()
    expected = reference['y'].fillna(overall_mean).values
    assert array_equal(encoded, expected)
def test_targetencoder_smooth():
    """
    Sweep the `smooth` parameter over 0, 1, 2 and 10000.

    For each setting the training encoding must equal the matching row of
    the expected table, both via `fit_transform` and via `fit` followed
    by `transform`.
    """
    train = cudf.DataFrame({
        'category': ['a', 'b', 'b', 'a'],
        'label': [1, 0, 1, 1]
    })
    smooths = [0, 1, 2, 10000]
    # Row i holds the expected encoding for smooths[i].
    answers = np.array([[1., 1., 0., 1.],
                        [0.875, 0.875, 0.375, 0.875],
                        [0.8333, 0.8333, 0.5, 0.8333],
                        [0.75, 0.75, 0.75, 0.75]])

    for row, smooth in enumerate(smooths):
        expected = answers[row]

        enc = TargetEncoder(smooth=smooth)
        got = enc.fit_transform(train.category, train.label)
        assert array_equal(got, expected)

        enc = TargetEncoder(smooth=smooth)
        enc.fit(train.category, train.label)
        got = enc.transform(train.category)
        assert array_equal(got, expected)
def test_targetencoder_customized_fold_id():
    """
    Split the data with a caller-supplied `fold_ids` array.

    Here sample 1 is assigned to fold 0, samples 2 and 3 to fold 1, and
    sample 4 to fold 2. The encoder is built with
    `split_method='customize'` and receives the ids through the
    `fold_ids` keyword of `fit_transform` / `fit`.
    """
    train = cudf.DataFrame({
        'category': ['a', 'b', 'b', 'a'],
        'label': [1, 0, 1, 1]
    })
    fold_ids = [0, 1, 1, 2]
    expected = np.array([1., 0.75, 0.75, 1.])

    # fit_transform in one step.
    enc = TargetEncoder(split_method='customize')
    got = enc.fit_transform(train.category, train.label, fold_ids=fold_ids)
    assert array_equal(got, expected)

    # fit, then transform the training column.
    enc = TargetEncoder(split_method='customize')
    enc.fit(train.category, train.label, fold_ids=fold_ids)
    got = enc.transform(train.category)
    assert array_equal(got, expected)
def cv(
    self,
    y_train: AoS,
    train_features: XDataFrame,
    test_features: XDataFrame,
    y_valid: Optional[AoS],
    valid_features: Optional[XDataFrame],
    feature_name: List[str],
    folds_ids: List[Tuple[np.ndarray, np.ndarray]],
    target_scaler: Optional[MinMaxScaler],
    config: dict,
    log: bool = True,
) -> Tuple[
    List[Model], np.ndarray, np.ndarray, Optional[np.ndarray], pd.DataFrame, dict
]:
    """Train one model per fold and collect out-of-fold/test/valid predictions.

    For every ``(trn_idx, val_idx)`` pair in ``folds_ids`` a model is trained
    with ``self.fit``; its predictions fill the out-of-fold array at
    ``val_idx`` and are averaged (divided by ``len(folds_ids)``) into the
    test predictions and, when a hold-out set is given, the valid
    predictions.

    When ``config["target_encoding"]`` is truthy, every column listed in
    ``config["categorical_cols"]`` gets a companion ``<col>_TE`` column:
    fitted on the whole training frame for the test and hold-out frames,
    and re-fitted on each fold's training rows for that fold's train and
    validation rows.

    Args:
        y_train: Training target; a ``pd.Series`` is converted via ``.values``.
        train_features: Training feature frame (copied, not modified).
        test_features: Test feature frame (copied, not modified).
        y_valid: Hold-out target, or ``None``.
        valid_features: Hold-out feature frame, or ``None``.
        feature_name: Feature names used as the index of the importance
            table. NOTE: this list is extended in place with the ``<col>_TE``
            names when target encoding is enabled (names already present
            are not appended again).
        folds_ids: Sequence of ``(train_index, valid_index)`` positional
            index arrays.
        target_scaler: Passed through to ``self.post_process``.
        config: Configuration dict; reads ``"target_encoding"`` and
            ``"categorical_cols"`` here and is forwarded to ``get_sampling``,
            ``self.fit`` and ``self.post_process``.
        log: When true, the oof/valid scores are also written via ``logging``.

    Returns:
        ``(models, oof_preds, test_preds, valid_preds, feature_importance,
        evals_results)`` where the prediction arrays are the values returned
        by ``self.post_process``, ``valid_preds`` is whatever
        ``self.post_process`` returns for it (the raw value passed in is
        ``None`` without a hold-out set), and ``feature_importance`` is the
        row-wise mean of the per-fold importance columns.
    """
    # initialize
    valid_exists = valid_features is not None
    test_preds = np.zeros(len(test_features))
    oof_preds = np.zeros(len(train_features))
    valid_preds = np.zeros(len(valid_features)) if valid_exists else None
    best_iteration = 0.0
    cv_score_list: List[dict] = []
    models: List[Model] = []

    with timer("make X"):
        X_train = train_features.copy()
        X_test = test_features.copy()
        X_valid = valid_features.copy() if valid_features is not None else None

    with timer("make y"):
        y = y_train.values if isinstance(y_train, pd.Series) else y_train
        y_valid = y_valid.values if isinstance(y_valid, pd.Series) else y_valid

    if config["target_encoding"]:
        with timer("target encoding for test"):
            cat_cols = config["categorical_cols"]
            for cat_col in cat_cols:
                te_col = cat_col + "_TE"
                encoder = TargetEncoder(n_folds=4, smooth=0.3)
                encoder.fit(X_train[cat_col], y)
                X_test[te_col] = encoder.transform(X_test[cat_col])
                # The hold-out frame needs the same encoded columns as the
                # test frame, otherwise the fold models are asked to predict
                # on a frame that lacks features they were trained with.
                if X_valid is not None:
                    X_valid[te_col] = encoder.transform(X_valid[cat_col])
                # Extend the caller's list in place, but never add the same
                # name twice (e.g. when cv() is called again with that list).
                if te_col not in feature_name:
                    feature_name.append(te_col)

    importances = pd.DataFrame(index=feature_name)

    for i_fold, (trn_idx, val_idx) in enumerate(folds_ids):
        with timer(f"fold {i_fold}"):
            self.fold = i_fold

            with timer("get train data and valid data"):
                # get train data and valid data
                x_trn = X_train.iloc[trn_idx]
                y_trn = y[trn_idx]
                x_val = X_train.iloc[val_idx]
                y_val = y[val_idx]

            if config["target_encoding"]:
                with timer("target encoding"):
                    # Take explicit copies before adding columns so the
                    # assignment never targets a slice of X_train.
                    x_trn = x_trn.copy()
                    x_val = x_val.copy()
                    cat_cols = config["categorical_cols"]
                    for cat_col in cat_cols:
                        te_col = cat_col + "_TE"
                        encoder = TargetEncoder(n_folds=4, smooth=0.3)
                        x_trn[te_col] = encoder.fit_transform(
                            x_trn[cat_col], y_trn
                        )
                        x_val[te_col] = encoder.transform(x_val[cat_col])

            logging.info(f"train size: {x_trn.shape}, valid size: {x_val.shape}")
            print(f"train size: {x_trn.shape}, valid size: {x_val.shape}")

            with timer("get sampling"):
                x_trn, y_trn = get_sampling(x_trn, y_trn, config)

            with timer("train model"):
                # train model
                model, best_score = self.fit(x_trn, y_trn, x_val, y_val, config)
                cv_score_list.append(best_score)
                models.append(model)
                best_iteration += self.get_best_iteration(model) / len(folds_ids)

            with timer("predict oof and test"):
                # predict oof and test
                oof_preds[val_idx] = self.predict(model, x_val).reshape(-1)
                test_preds += self.predict(model, X_test).reshape(-1) / len(
                    folds_ids
                )
                if valid_exists:
                    # X_valid is the working copy that carries the *_TE
                    # columns; without target encoding it equals
                    # valid_features.
                    valid_preds += self.predict(model, X_valid).reshape(
                        -1
                    ) / len(folds_ids)

            with timer("get feature importance"):
                # get feature importances
                importances_tmp = pd.DataFrame(
                    self.get_feature_importance(model),
                    columns=[f"gain_{i_fold+1}"],
                    index=feature_name,
                )
                importances = importances.join(importances_tmp, how="inner")

    # summary of feature importance
    feature_importance = importances.mean(axis=1)

    # save raw prediction
    self.raw_oof_preds = oof_preds
    self.raw_test_preds = test_preds
    self.raw_valid_preds = valid_preds

    # post_process (if you have any)
    y, oof_preds, test_preds, y_valid, valid_preds = self.post_process(
        oof_preds=oof_preds,
        test_preds=test_preds,
        valid_preds=valid_preds,
        y_train=y_train,
        y_valid=y_valid,
        train_features=train_features,
        test_features=test_features,
        valid_features=valid_features,
        target_scaler=target_scaler,
        config=config,
    )

    # print oof score
    oof_score = calc_metric(y, oof_preds)
    print(f"oof score: {oof_score:.5f}")

    if valid_exists:
        valid_score = calc_metric(y_valid, valid_preds)
        print(f"valid score: {valid_score:.5f}")

    if log:
        logging.info(f"oof score: {oof_score:.5f}")
        if valid_exists:
            logging.info(f"valid score: {valid_score:.5f}")

    evals_results = {
        "evals_result": {
            "oof_score": oof_score,
            "cv_score": {
                f"cv{i + 1}": cv_score for i, cv_score in enumerate(cv_score_list)
            },
            "n_data": np.shape(X_train)[0],
            "best_iteration": best_iteration,
            # NOTE(review): X_train never receives the *_TE columns, so this
            # counts only the original columns -- confirm that is intended.
            "n_features": np.shape(X_train)[1],
            "feature_importance": feature_importance.sort_values(
                ascending=False
            ).to_dict(),
        }
    }

    if valid_exists:
        evals_results["valid_score"] = valid_score

    return (
        models,
        oof_preds,
        test_preds,
        valid_preds,
        feature_importance,
        evals_results,
    )