def test_classifier_sparse_input(self):
    """The classifier must accept CSR, CSC and COO sparse inputs."""
    classifier = RGFClassifier(prefix='clf', calc_prob='Softmax')
    for to_sparse in (csr_matrix, csc_matrix, coo_matrix):
        sparse_data = to_sparse(self.iris.data)
        classifier.fit(sparse_data, self.iris.target)
        accuracy = classifier.score(sparse_data, self.iris.target)
        self.assertGreater(accuracy, 0.8,
                           "Failed with score = {0:.5f}".format(accuracy))
def test_attributes(self):
    """Fitted attributes must be unavailable before fit() and consistent after."""
    clf = RGFClassifier()
    attributes = ('estimators_', 'classes_', 'n_classes_', 'n_features_',
                  'fitted_', 'sl2_', 'min_samples_leaf_', 'n_iter_')
    # Accessing any trailing-underscore attribute before fit must raise.
    for attr in attributes:
        self.assertRaises(NotFittedError, getattr, clf, attr)
    clf.fit(self.X_train, self.y_train)
    # One underlying estimator per class (one-vs-rest style).
    self.assertEqual(len(clf.estimators_), len(np.unique(self.y_train)))
    np.testing.assert_array_equal(clf.classes_, sorted(np.unique(self.y_train)))
    self.assertEqual(clf.n_classes_, len(clf.estimators_))
    self.assertEqual(clf.n_features_, self.X_train.shape[-1])
    self.assertTrue(clf.fitted_)
    # sl2_ falls back to l2 when the sl2 parameter was left as None.
    if clf.sl2 is None:
        self.assertEqual(clf.sl2_, clf.l2)
    else:
        self.assertEqual(clf.sl2_, clf.sl2)
    # A fractional min_samples_leaf is resolved to an absolute sample count.
    if clf.min_samples_leaf < 1:
        self.assertLessEqual(clf.min_samples_leaf_, 0.5 * self.X_train.shape[0])
    else:
        self.assertEqual(clf.min_samples_leaf_, clf.min_samples_leaf)
    # Default iteration count depends on the loss: 10 for "LS", 5 otherwise.
    if clf.n_iter is None:
        if clf.loss == "LS":
            self.assertEqual(clf.n_iter_, 10)
        else:
            self.assertEqual(clf.n_iter_, 5)
    else:
        self.assertEqual(clf.n_iter_, clf.n_iter)
def test_classifier_sparse_input(self):
    """Every scipy sparse matrix format must be usable for fit/score."""
    classifier = RGFClassifier(calc_prob='softmax')
    sparse_constructors = (sparse.bsr_matrix, sparse.coo_matrix,
                           sparse.csc_matrix, sparse.csr_matrix,
                           sparse.dia_matrix, sparse.dok_matrix,
                           sparse.lil_matrix)
    for make_sparse in sparse_constructors:
        sparse_data = make_sparse(self.iris.data)
        classifier.fit(sparse_data, self.iris.target)
        accuracy = classifier.score(sparse_data, self.iris.target)
        self.assertGreater(accuracy, 0.8,
                           "Failed with score = {0:.5f}".format(accuracy))
def test_softmax_classifier(self):
    """Softmax probabilities must sum to one per sample and score > 0.8."""
    classifier = RGFClassifier(calc_prob='softmax')
    classifier.fit(self.iris.data, self.iris.target)
    row_sums = classifier.predict_proba(self.iris.data).sum(axis=1)
    np.testing.assert_almost_equal(row_sums,
                                   np.ones(self.iris.target.shape[0]))
    accuracy = classifier.score(self.iris.data, self.iris.target)
    print('Score: {0:.5f}'.format(accuracy))
    self.assertGreater(accuracy, 0.8,
                       "Failed with score = {0:.5f}".format(accuracy))
def test_softmax_classifier(self):
    """Softmax probabilities must form a distribution per sample."""
    classifier = RGFClassifier(prefix='clf', calc_prob='Softmax')
    classifier.fit(self.iris.data, self.iris.target)
    row_sums = classifier.predict_proba(self.iris.data).sum(axis=1)
    np.testing.assert_almost_equal(row_sums,
                                   np.ones(self.iris.target.shape[0]))
    accuracy = classifier.score(self.iris.data, self.iris.target)
    print('Score: {0:.5f}'.format(accuracy))
    self.assertGreater(accuracy, 0.8,
                       "Failed with score = {0:.5f}".format(accuracy))
def test_bin_classifier(self):
    """Binary target: probabilities sum to one and accuracy beats 0.8."""
    classifier = RGFClassifier(prefix='clf')
    binary_labels = (self.iris.target == 2).astype(int)
    classifier.fit(self.iris.data, binary_labels)
    row_sums = classifier.predict_proba(self.iris.data).sum(axis=1)
    np.testing.assert_almost_equal(row_sums, np.ones(binary_labels.shape[0]))
    accuracy = classifier.score(self.iris.data, binary_labels)
    print('Score: {0:.5f}'.format(accuracy))
    self.assertGreater(accuracy, 0.8,
                       "Failed with score = {0:.5f}".format(accuracy))
def test_bin_classifier(self):
    """Binary target: probabilities sum to one and accuracy beats 0.8."""
    classifier = RGFClassifier()
    binary_labels = (self.iris.target == 2).astype(int)
    classifier.fit(self.iris.data, binary_labels)
    row_sums = classifier.predict_proba(self.iris.data).sum(axis=1)
    np.testing.assert_almost_equal(row_sums, np.ones(binary_labels.shape[0]))
    accuracy = classifier.score(self.iris.data, binary_labels)
    print('Score: {0:.5f}'.format(accuracy))
    self.assertGreater(accuracy, 0.8,
                       "Failed with score = {0:.5f}".format(accuracy))
def test_string_y(self):
    """String class labels must round-trip through fit/predict."""
    classifier = RGFClassifier()
    labels = np.array(self.iris.target, dtype=str)
    for digit, word in (('0', 'Zero'), ('1', 'One'), ('2', 'Two')):
        labels[labels == digit] = word
    classifier.fit(self.iris.data, labels)
    predictions = classifier.predict(self.iris.data)
    accuracy = accuracy_score(labels, predictions)
    self.assertGreater(accuracy, 0.95,
                       "Failed with score = {0:.5f}".format(accuracy))
def test_joblib_pickle(self):
    """Predictions must survive a joblib dump/load round trip, even after
    the temporary RGF model files have been removed.

    Fix: the original test left 'test_clf.pkl' on disk after the run;
    the file is now removed in a finally block.
    """
    import os
    clf = RGFClassifier()
    clf.fit(self.X_train, self.y_train)
    y_pred1 = clf.predict(self.X_test)
    joblib.dump(clf, 'test_clf.pkl')
    try:
        # Remove model file so the loaded copy cannot rely on it.
        _cleanup()
        clf2 = joblib.load('test_clf.pkl')
        y_pred2 = clf2.predict(self.X_test)
        np.testing.assert_allclose(y_pred1, y_pred2)
    finally:
        # Do not leave the pickle artifact behind after the test run.
        if os.path.exists('test_clf.pkl'):
            os.remove('test_clf.pkl')
def test_pickle(self):
    """Predictions must survive a pickle round trip, even after the
    temporary RGF model files have been removed."""
    classifier = RGFClassifier()
    classifier.fit(self.X_train, self.y_train)
    predictions_before = classifier.predict(self.X_test)
    payload = pickle.dumps(classifier)
    # Remove model file
    _cleanup()
    restored = pickle.loads(payload)
    predictions_after = restored.predict(self.X_test)
    np.testing.assert_allclose(predictions_before, predictions_after)
def test_sample_weight(self):
    """Uniform weights must match the unweighted fit; near-zero weights on
    all but one sample must collapse predictions to that sample's class."""
    classifier = RGFClassifier()
    baseline = classifier.fit(self.X_train,
                              self.y_train).predict_proba(self.X_test)
    uniform = np.ones(self.y_train.shape[0])
    weighted = classifier.fit(self.X_train, self.y_train,
                              uniform).predict_proba(self.X_test)
    np.testing.assert_allclose(baseline, weighted)
    # Smallest positive float32: effectively removes every sample but the
    # first from the training signal.
    tiny = np.nextafter(np.float32(0), np.float32(1))
    weights = np.ones(self.y_train.shape[0]) * tiny
    weights[0] = 1
    collapsed = classifier.fit(self.X_train, self.y_train,
                               weights).predict(self.X_test)
    np.testing.assert_equal(collapsed,
                            np.full(self.y_test.shape[0], self.y_test[0]))
def run_rgf():
    """Fit an RGF classifier on the global train split and return the
    positive-class probabilities for the validation and test sets.

    Fixes: the temp-file cleanup no longer shells out with
    `subprocess.call('rm -rf ...', shell=True)` (shell-string smell,
    Unix-only); it removes the matched paths directly. Also corrects the
    "successfull" typo in the status message.
    """
    import os
    import shutil
    model = RGFClassifier(max_leaf=1000,
                          algorithm="RGF",
                          loss="Log",
                          l2=0.01,
                          sl2=0.01,
                          normalize=False,
                          min_samples_leaf=10,
                          n_iter=None,
                          opt_interval=100,
                          learning_rate=.5,
                          calc_prob="sigmoid",
                          n_jobs=-1,
                          memory_policy="generous",
                          verbose=0)
    fit_model = model.fit(X_train, y_train)
    pred = fit_model.predict_proba(X_valid)[:, 1]
    pred_test = fit_model.predict_proba(X_test)[:, 1]
    try:
        # Remove RGF's temporary model files without spawning a shell.
        for path in glob.glob('/tmp/rgf/*'):
            if os.path.isdir(path):
                shutil.rmtree(path)
            else:
                os.remove(path)
        print("Clean up is successful")
        print(glob.glob("/tmp/rgf/*"))
    except Exception as e:
        print(str(e))
    return pred, pred_test
def test_cleanup(self):
    """cleanup() must remove only this estimator's temp files and unfit it."""
    clf1 = RGFClassifier()
    clf1.fit(self.X_train, self.y_train)
    clf2 = RGFClassifier()
    clf2.fit(self.X_train, self.y_train)
    # First call deletes some files (non-zero count); second finds nothing.
    self.assertNotEqual(clf1.cleanup(), 0)
    self.assertEqual(clf1.cleanup(), 0)
    # No temp files matching clf1's prefixes may remain on disk.
    for est in clf1.estimators_:
        glob_file = os.path.join(_get_temp_path(), est._file_prefix + "*")
        self.assertFalse(glob.glob(glob_file))
    # A cleaned estimator behaves as unfitted...
    self.assertRaises(NotFittedError, clf1.predict, self.X_test)
    # ...while an independent fitted estimator is unaffected.
    clf2.predict(self.X_test)
def run_rgf():
    """Fit an RGF classifier on the global train split and return the
    positive-class probabilities for the validation and test sets.

    Fixes: cleanup no longer uses `subprocess.call('rm -rf ...',
    shell=True)` (shell-string smell, Unix-only); it removes matched
    paths directly. Also corrects the "successfull" typo.
    """
    import os
    import shutil
    model = RGFClassifier(
        max_leaf=1000,
        algorithm="RGF",
        loss="Log",
        l2=0.01,
        sl2=0.01,
        normalize=False,
        min_samples_leaf=10,
        n_iter=None,
        opt_interval=100,
        learning_rate=.5,
        calc_prob="sigmoid",
        n_jobs=-1,
        memory_policy="generous",
        verbose=0
    )
    fit_model = model.fit(X_train, y_train)
    pred = fit_model.predict_proba(X_valid)[:, 1]
    pred_test = fit_model.predict_proba(X_test)[:, 1]
    try:
        # Remove RGF's temporary model files without spawning a shell.
        for path in glob.glob('/tmp/rgf/*'):
            if os.path.isdir(path):
                shutil.rmtree(path)
            else:
                os.remove(path)
        print("Clean up is successful")
        print(glob.glob("/tmp/rgf/*"))
    except Exception as e:
        print(str(e))
    return pred, pred_test
def test_params(self):
    """All valid parameter values are accepted; each invalid one raises."""
    clf = RGFClassifier()
    valid_params = dict(max_leaf=300,
                        test_interval=100,
                        algorithm='RGF_Sib',
                        loss='Log',
                        reg_depth=1.1,
                        l2=0.1,
                        sl2=None,
                        normalize=False,
                        min_samples_leaf=9,
                        n_iter=None,
                        n_tree_search=2,
                        opt_interval=100,
                        learning_rate=0.4,
                        verbose=True,
                        prefix='rgf_classifier',
                        inc_prefix=True,
                        calc_prob='Sigmoid',
                        clean=True)
    clf.set_params(**valid_params)
    clf.fit(self.X_train, self.y_train)
    # Each entry is an out-of-range or wrongly-typed value for the
    # same-named key in valid_params above.
    non_valid_params = dict(max_leaf=0,
                            test_interval=0,
                            algorithm='RGF_Test',
                            loss=True,
                            reg_depth=0.1,
                            l2=11,
                            sl2=-1.1,
                            normalize='False',
                            min_samples_leaf=0.7,
                            n_iter=11.1,
                            n_tree_search=0,
                            opt_interval=100.1,
                            learning_rate=-0.5,
                            verbose=-1,
                            prefix='',
                            inc_prefix=1,
                            calc_prob=True,
                            clean=0)
    for key in non_valid_params:
        clf.set_params(**valid_params)  # Reset to valid params
        clf.set_params(**{key: non_valid_params[key]})  # Pick and set one non-valid parameter
        self.assertRaises(ValueError, clf.fit, self.X_train, self.y_train)
def test_params(self):
    """All valid parameter values are accepted; each invalid one raises."""
    clf = RGFClassifier()
    valid_params = dict(max_leaf=300,
                        test_interval=100,
                        algorithm='RGF_Sib',
                        loss='Log',
                        reg_depth=1.1,
                        l2=0.1,
                        sl2=None,
                        normalize=False,
                        min_samples_leaf=9,
                        n_iter=None,
                        n_tree_search=2,
                        opt_interval=100,
                        learning_rate=0.4,
                        calc_prob='sigmoid',
                        n_jobs=-1,
                        memory_policy='conservative',
                        verbose=True)
    clf.set_params(**valid_params)
    clf.fit(self.X_train, self.y_train)
    # Each entry is an out-of-range or wrongly-typed value for the
    # same-named key in valid_params above.
    non_valid_params = dict(max_leaf=0,
                            test_interval=0,
                            algorithm='RGF_Test',
                            loss=True,
                            reg_depth=0.1,
                            l2=11,
                            sl2=-1.1,
                            normalize='False',
                            min_samples_leaf=0.7,
                            n_iter=11.1,
                            n_tree_search=0,
                            opt_interval=100.1,
                            learning_rate=-0.5,
                            calc_prob=True,
                            n_jobs='-1',
                            memory_policy='Generos',
                            verbose=-1)
    for key in non_valid_params:
        clf.set_params(**valid_params)  # Reset to valid params
        clf.set_params(**{key: non_valid_params[key]})  # Pick and set one non-valid parameter
        self.assertRaises(ValueError, clf.fit, self.X_train, self.y_train)
def objective(max_leaf, l2, min_samples_leaf, learning_rate):
    """Hyperparameter-search objective: fit RGF_Sib on the training split
    and return the validation ROC-AUC.

    Optimizers propose floats, so the integer-valued RGF parameters are
    truncated with int() first.

    Fix: the `assert type(x) == int` checks after int() were dead code
    (int() always returns an int) and used the `type(x) ==` anti-pattern;
    they have been removed.
    """
    max_leaf = int(max_leaf)
    min_samples_leaf = int(min_samples_leaf)
    model = RGFClassifier(
        max_leaf=max_leaf,
        l2=l2,
        min_samples_leaf=min_samples_leaf,
        learning_rate=learning_rate,
        algorithm="RGF_Sib",
        test_interval=100,
    )
    # train_m / label_m / train_val / label_val come from module scope.
    model.fit(train_m, label_m)
    pred_proba = model.predict_proba(train_val)
    # Score on the positive-class probability column.
    score = roc_auc_score(label_val, pred_proba[:, 1])
    return score
def train(params):
    """Train an RGF classifier on the preprocessed dataset and log the
    hyperparameters and evaluation metrics to the active MLflow run.

    params: dict of RGFClassifier keyword arguments, logged one-by-one.
    """
    # log hyperparams for this run
    for k, v in params.items():
        mlflow.log_param(k, v)
    # load dataset files
    # NOTE: to get meta data, set allow_pickle=True for np.load, then index into dataset object with key 'meta'
    dataset = np.load('preprocessed/dataset.npz')
    X_arr = dataset['X_arr']
    Y_arr = dataset['Y_arr']
    # split for train-test (stratified 80/20; no fixed seed, so splits vary per run)
    X_train, X_test, Y_train, Y_test = train_test_split(X_arr, Y_arr, stratify=Y_arr, test_size=0.2)
    # instantiate model with params
    rgf_clf = RGFClassifier(**params)
    rgf_clf.fit(X_train, Y_train)
    # predict on test data
    Y_pred = rgf_clf.predict(X_test)
    Y_pred_proba = rgf_clf.predict_proba(X_test)
    # log logistic loss value
    logistic_loss = log_loss(Y_test, Y_pred_proba)
    mlflow.log_metric('log_loss', logistic_loss)
    # log precision, recall, f1 — average='binary' assumes a two-class target
    p, r, f, _ = precision_recall_fscore_support(y_true=Y_test, y_pred=Y_pred, average='binary')
    mlflow.log_metric('precision', p)
    mlflow.log_metric('recall', r)
    mlflow.log_metric('f1', f)
    # which features matter the most
    print("========== FEATURE IMPORTANCES ==========")
    print(rgf_clf.feature_importances_)
def model_pred(trn_tmp_x, trn_tmp_y, val_tmp_x, val_tmp_y, tst_x):
    """Fit an RGF classifier on the fold's training data and return the
    positive-class probabilities for the validation and test sets, plus
    the leaf budget used."""
    best_iter = 1200
    rgf = RGFClassifier(max_leaf=best_iter,  # Try increasing this as a starter
                        algorithm="RGF",
                        loss="Log",
                        l2=0.01,
                        normalize=False,
                        min_samples_leaf=20,
                        learning_rate=0.5,
                        verbose=False)
    rgf.fit(trn_tmp_x, trn_tmp_y)
    val_proba = rgf.predict_proba(val_tmp_x)[:, 1]
    tst_proba = rgf.predict_proba(tst_x)[:, 1]
    return val_proba, tst_proba, best_iter
def rgf(df: pd.DataFrame, target: pd.DataFrame, test: pd.DataFrame, parameters: Dict): n_splits = 5 # n_neighbors = parameters["n_neighbors"] folds = KFold(n_splits=n_splits, shuffle=True, random_state=42) oof = np.zeros((df.shape[0] + test.shape[0], 9)) for trn_idx, val_idx in folds.split(df, target): train_x = df.iloc[trn_idx, :].values val_x = df.iloc[val_idx, :].values train_y = target[trn_idx].values val_y = target[val_idx].values classifier = RGFClassifier( n_jobs=14, algorithm="RGF", loss="Log", ) classifier.fit(train_x, train_y) y_hat = classifier.predict_proba(val_x) print(log_loss(val_y, y_hat)) print(oof.shape, y_hat.shape) oof[val_idx] = y_hat pred = classifier.predict_proba(test.values) oof[len(target):, :] += pred / n_splits print(oof.shape) # np.save("data/04_features/oof.npz", oof) # oof = np.load("data/04_features/oof.npy") n_name = ["knn_{}".format(i) for i in range(9)] oof = pd.DataFrame(oof) oof.to_csv("data/09_oof/rgf_{}.csv".format(3)) return oof[len(target):].values
X_test = test_df.copy() logging.info("Fold {0}".format(i)) # Enocode data for f in f_cats: X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode( trn_series=X_train[f], val_series=X_valid[f], tst_series=X_test[f], target=y_train, min_samples_leaf=200, smoothing=10, noise_level=0 ) # Run model for this fold fit_model = model.fit(X_train, y_train) # Generate validation predictions for this fold pred = fit_model.predict_proba(X_valid)[:, 1] logging.info(" Gini = {0}".format(eval_gini(y_valid, pred))) y_valid_pred.iloc[test_index] = pred # Accumulate test set predictions probs = fit_model.predict_proba(X_test)[:, 1] y_test_pred += probs del X_test, X_train, X_valid, y_train y_test_pred /= K # Average test set predictions logging.info("Gini for full training set: {0}".format(eval_gini(y, y_valid_pred)))
# Quick timing/accuracy comparison of RGF, FastRGF and (below) gradient
# boosting on a shuffled copy of the iris dataset. Each section fits on the
# full data and reports training-set accuracy plus wall-clock time.
import time
from sklearn import datasets
from sklearn.utils.validation import check_random_state
from sklearn.ensemble import GradientBoostingClassifier
from rgf.sklearn import RGFClassifier, FastRGFClassifier

iris = datasets.load_iris()
# Deterministic shuffle so repeated runs compare the same data order.
rng = check_random_state(0)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]

# --- RGF ---
start = time.time()
clf = RGFClassifier()
clf.fit(iris.data, iris.target)
score = clf.score(iris.data, iris.target)
end = time.time()
print("RGF: {} sec".format(end - start))
print("score: {}".format(score))

# --- FastRGF ---
start = time.time()
clf = FastRGFClassifier()
clf.fit(iris.data, iris.target)
score = clf.score(iris.data, iris.target)
end = time.time()
print("FastRGF: {} sec".format(end - start))
print("score: {}".format(score))

# --- Gradient boosting (timing continues past this excerpt) ---
start = time.time()
clf = GradientBoostingClassifier()
else: blindloodata = pd.concat([blindloodata, blindtrain]) for c in highcardinality: test['loo' + c] = ProjectOnMean(train, test, c) test.drop(highcardinality, inplace=True, axis=1) train = blindloodata train.drop(highcardinality, inplace=True, axis=1) train = train.fillna(train.mean()) test = test.fillna(train.mean()) # In[ ]: rgf = RGFClassifier( max_leaf=1000, #Try increasing this as a starter algorithm="RGF_Sib", test_interval=250, loss="Log", verbose=True) rgf.fit(train[train.columns[2:]], train.target) x = rgf.predict_proba(train[train.columns[2:]]) print(GiniScore(train.target, x[:, 1])) # In[ ]: sub = pd.read_csv('../input/sample_submission.csv') x = rgf.predict_proba(test[test.columns[2:]]) sub.target = x[:, 1] sub.to_csv('rgfsubmission.csv', index=False)
def train_predict(train_df, test_df, params, model_name=None):
    """Train RGF with 5-fold stratified CV, log per-fold normalized Gini,
    write OOF train probabilities and fold-averaged test probabilities to
    gzipped CSVs, and return the mean CV score.

    Fixes: `model_name == None` -> `is None`; the temp-file cleanup no
    longer shells out with `subprocess.call('rm -rf ...', shell=True)`
    (shell-string smell, Unix-only); corrected the "successfull" typo.
    """
    if model_name is None:
        #model_name = 'l1_rgf_%s'%datetime.now().strftime('%m%d%H%M')
        model_name = 'l1_rgf'
    log = Logger(os.path.join('log', '%s.log' % model_name))
    cols = [c for c in train_df.columns if c not in ['id', 'target']]
    log.info('Features:')
    for col in cols:
        log.info('- %s' % col)
    log.info('\n')
    log.info('Parameters:')
    for param_name, param_value in params.items():
        log.info('- %s: %s' % (param_name, str(param_value)))
    log.info('\n')
    X = train_df[cols].values
    y = train_df['target'].values
    X_test = test_df[cols].values
    prob_train = np.zeros(len(X))
    prob_test = np.zeros(len(X_test))
    kfold = 5
    scores = []
    skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=41)
    for i, (train_ind, valid_ind) in enumerate(skf.split(X, y)):
        X_train, X_valid = X[train_ind], X[valid_ind]
        y_train, y_valid = y[train_ind], y[valid_ind]
        model = RGFClassifier(**params)
        model.fit(X_train, y_train)
        # Out-of-fold probabilities for stacking.
        prob = model.predict_proba(X_valid)[:, 1]
        prob_train[valid_ind] = prob
        score = gini_norm(prob, y_valid)
        scores.append(score)
        log.info('- Fold %d/%d score: %f' % (i + 1, kfold, score))
        # Accumulate the fold-averaged test prediction.
        prob = model.predict_proba(X_test)[:, 1]
        prob_test += prob / kfold
        try:
            # Remove RGF's temporary model files without spawning a shell.
            for path in glob.glob('/tmp/rgf/*'):
                os.remove(path)
            print("Clean up is successful")
            print(glob.glob("/tmp/rgf/*"))
        except Exception as e:
            print(str(e))
    mean_score = np.mean(scores)
    log.info('- Mean score: %f' % mean_score)
    prob_train_df = pd.DataFrame({'id': train_df['id'], 'target': prob_train})
    prob_train_df.to_csv(os.path.join('local_cv', '%s.csv.gz' % model_name),
                         index=False, compression='gzip')
    prob_test_df = pd.DataFrame({'id': test_df['id'], 'target': prob_test})
    prob_test_df.to_csv(os.path.join('submission', '%s.csv.gz' % model_name),
                        index=False, compression='gzip')
    return mean_score
# Enocode data for f in f_cats: X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode( trn_series=X_train[f], val_series=X_valid[f], tst_series=X_test[f], target=y_train, min_samples_leaf=200, smoothing=10, noise_level=0) # Run model for this fold if USE_RGF_INSTEAD: X_train = X_train.fillna(X_train.mean()) rgf.fit(X_train, y_train) elif OPTIMIZE_XGB_ROUNDS: eval_set = [(X_valid, y_valid)] fit_model = xgbmodel.fit( X_train, y_train, eval_set=eval_set, eval_metric=gini_xgb, early_stopping_rounds=XGB_EARLY_STOPPING_ROUNDS, verbose=False) print(" Best N trees = ", xgbmodel.best_ntree_limit) print(" Best gini = ", xgbmodel.best_score) elif USE_LIGHTGBM: dtrain = lgbm.Dataset(X_train, y_train) dvalid = lgbm.Dataset(X_valid, y_valid, reference=dtrain)
class Level1Model(object):
    """Level-1 RGF model: reads the Porto Seguro train/test CSVs, engineers
    target-average and combination features, runs K-fold out-of-fold
    prediction, and optionally writes OOF/submission CSVs for stacking."""
    # Whitelisted features; comments carry importance / shadow-importance.
    train_features = [
        "ps_car_13",  # : 1571.65 / shadow 609.23
        "ps_reg_03",  # : 1408.42 / shadow 511.15
        "ps_ind_05_cat",  # : 1387.87 / shadow 84.72
        "ps_ind_03",  # : 1219.47 / shadow 230.55
        "ps_ind_15",  # : 922.18 / shadow 242.00
        "ps_reg_02",  # : 920.65 / shadow 267.50
        "ps_car_14",  # : 798.48 / shadow 549.58
        "ps_car_12",  # : 731.93 / shadow 293.62
        "ps_car_01_cat",  # : 698.07 / shadow 178.72
        "ps_car_07_cat",  # : 694.53 / shadow 36.35
        "ps_ind_17_bin",  # : 620.77 / shadow 23.15
        "ps_car_03_cat",  # : 611.73 / shadow 50.67
        "ps_reg_01",  # : 598.60 / shadow 178.57
        "ps_car_15",  # : 593.35 / shadow 226.43
        "ps_ind_01",  # : 547.32 / shadow 154.58
        "ps_ind_16_bin",  # : 475.37 / shadow 34.17
        "ps_ind_07_bin",  # : 435.28 / shadow 28.92
        "ps_car_06_cat",  # : 398.02 / shadow 212.43
        "ps_car_04_cat",  # : 376.87 / shadow 76.98
        "ps_ind_06_bin",  # : 370.97 / shadow 36.13
        "ps_car_09_cat",  # : 214.12 / shadow 81.38
        "ps_car_02_cat",  # : 203.03 / shadow 26.67
        "ps_ind_02_cat",  # : 189.47 / shadow 65.68
        "ps_car_11",  # : 173.28 / shadow 76.45
        "ps_car_05_cat",  # : 172.75 / shadow 62.92
        "ps_calc_09",  # : 169.13 / shadow 129.72
        "ps_calc_05",  # : 148.83 / shadow 120.68
        "ps_ind_08_bin",  # : 140.73 / shadow 27.63
        "ps_car_08_cat",  # : 120.87 / shadow 28.82
        "ps_ind_09_bin",  # : 113.92 / shadow 27.05
        "ps_ind_04_cat",  # : 107.27 / shadow 37.43
        "ps_ind_18_bin",  # : 77.42 / shadow 25.97
        "ps_ind_12_bin",  # : 39.67 / shadow 15.52
        "ps_ind_14",  # : 37.37 / shadow 16.65
    ]
    # Feature pairs concatenated into new factorized categorical features.
    combs = [
        ('ps_reg_01', 'ps_car_02_cat'),
        ('ps_reg_01', 'ps_car_04_cat'),
    ]

    def __init__(self, strat=True, splits=5, random_state=15, submit=False,
                 mean_sub=False, metric=None):
        # type: (bool, int, int, bool, bool, Callable) -> None
        self.curr_date = datetime.datetime.now()
        self._submit = submit
        self._id = ""
        self.trn = None       # training DataFrame (loaded by read_data)
        self.target = None    # training target Series
        self.sub = None       # test DataFrame, only when submitting
        self.model = None     # set by set_model()
        self.metric = metric  # scoring callable(y_true, y_pred)
        self.mean_submission = mean_sub
        # Stratified folds by default; plain KFold otherwise.
        if strat:
            self._folds = StratifiedKFold(n_splits=splits, shuffle=True,
                                          random_state=random_state)
        else:
            self._folds = KFold(n_splits=splits, shuffle=True,
                                random_state=random_state)
        self.set_model()

    def set_model(self):
        """Instantiate the RGF classifier with this model's hyperparameters."""
        self.model = RGFClassifier(max_leaf=1000,  # 1000,
                                   algorithm="RGF",  # RGF_Sib, RGF_Opt
                                   loss="Log",
                                   l2=0.01,
                                   sl2=0.01,
                                   normalize=False,
                                   min_samples_leaf=10,
                                   n_iter=None,
                                   opt_interval=100,
                                   learning_rate=.5,
                                   calc_prob="sigmoid",
                                   n_jobs=-1,
                                   memory_policy="generous",
                                   verbose=0
                                   )

    @property
    def do_submission(self):
        return self._submit

    @property
    def id(self):
        return self._get_id()

    @abc.abstractmethod
    def _get_id(self):
        # NOTE(review): sets _id then checks it for emptiness — the check can
        # never fire here; presumably a guard for subclass overrides. Confirm.
        self._id = "rgf_full_feat_"
        if self._id == "":
            raise ValueError("Id is not set for class " + str(type(self)))
        return self._id

    def read_data(self):
        """Load train (and, when submitting, test) CSVs; split off the target."""
        self.trn = pd.read_csv("../../input/train.csv", index_col=0)
        self.target = self.trn["target"]
        del self.trn["target"]
        if self.do_submission:
            self.sub = pd.read_csv("../../input/test.csv", index_col=0)

    def add_combinations(self):
        # type: (...) -> (pd.DataFrame, Optional[DataFrame])
        """Create factorized string-concatenation features for each pair in combs."""
        start = time.time()
        for n_c, (f1, f2) in enumerate(self.combs):
            name1 = f1 + "_plus_" + f2
            print('current feature %60s %4d in %5.1f'
                  % (name1, n_c + 1, (time.time() - start) / 60), end='')
            print('\r' * 75, end='')
            self.trn[name1] = self.trn[f1].apply(lambda x: str(x)) + "_" + self.trn[f2].apply(lambda x: str(x))
            if self.do_submission:
                self.sub[name1] = self.sub[f1].apply(lambda x: str(x)) + "_" + self.sub[f2].apply(lambda x: str(x))
                # Factorize on train; map test through the same indexer so
                # codes agree across the two frames.
                self.trn[name1], indexer = pd.factorize(self.trn[name1])
                self.sub[name1] = indexer.get_indexer(self.sub[name1])
            else:
                self.trn[name1], _ = pd.factorize(self.trn[name1])

    def prepare_data(self):
        """Bin continuous features, add combinations, drop non-whitelisted columns."""
        noisy_features = list(set(self.trn.columns) - set(self.train_features))
        # Bin continuous variables before One-Hot Encoding
        # (binned over train+test jointly so bin edges match).
        for f in ["ps_reg_03", "ps_car_12", "ps_car_13", "ps_car_14"]:
            full_f = pd.concat([self.trn[f], self.sub[f]], axis=0)
            full_cut = np.array(pd.cut(full_f, 50, labels=False))
            self.trn[f] = full_cut[:len(self.trn)]
            self.sub[f] = full_cut[len(self.trn):]
            del full_f
            del full_cut
        self.add_combinations()
        # Remove noisy features
        self.trn.drop(noisy_features, axis=1, inplace=True)
        if self.do_submission:
            self.sub.drop(noisy_features, axis=1, inplace=True)
        print(self.trn.columns)

    def predict_oof_and_submission(self):
        """Run the K-fold loop, report OOF score, optionally write CSVs."""
        self.read_data()
        self.prepare_data()
        pos_ratio = .3
        # Class weights rebalancing positives to pos_ratio.
        class_weight = {0: 1 / (2 * (1 - pos_ratio)), 1: 1 / (2 * pos_ratio)}
        if self.model is None:
            raise ValueError("Model is not set for class " + str(type(self)))
        if self.target is None:
            raise ValueError("Model is not set for class " + str(type(self)))
        if self.trn is None:
            raise ValueError("Model is not set for class " + str(type(self)))
        if (self.sub is None) and self.do_submission:
            raise ValueError("Model is not set for class " + str(type(self)))
        # Prepare predictors
        oof_preds = np.zeros(len(self.trn))
        if self.sub is not None:
            sub_preds = np.zeros(len(self.sub))
        # Go through folds
        start = time.time()
        f_cats = [f for f in self.trn.columns if "_cat" in f]
        for i_fold, (trn_idx, val_idx) in enumerate(self._folds.split(self.target, self.target)):
            # Split data
            trn_x, trn_y = self.trn.iloc[trn_idx].copy(), self.target.iloc[trn_idx]
            val_x, val_y = self.trn.iloc[val_idx].copy(), self.target.iloc[val_idx]
            # Compute target averages (fit on the fold's train part only,
            # to avoid leaking the validation target).
            for f in f_cats:
                ft = TargetAverageTransformation(feature_name=f,
                                                 average=TargetAverageTransformation.MEAN,
                                                 min_samples_leaf=200,
                                                 smoothing=10,
                                                 noise_level=0)
                trn_x[f + "_avg"] = ft.fit_transform(data=trn_x, target=trn_y)
                val_x[f + "_avg"] = ft.transform(data=val_x)
                if self.do_submission:
                    self.sub[f + "_avg"] = ft.transform(data=self.sub)
            # Fit model
            # NOTE(review): eval_sets and sample_weight are computed but not
            # passed to fit() — confirm whether that is intentional.
            eval_sets = [(trn_x.values, trn_y.values), (val_x.values, val_y.values)]
            sample_weight = trn_y.apply(lambda x: class_weight[x]).values
            self.model.fit(trn_x.values, trn_y.values)
            # Predict OOF
            oof_preds[val_idx] = self.model.predict_proba(val_x.values)[:, 1]
            # Predict SUB if mean is requested
            if (self.sub is not None) and self.mean_submission:
                sub_preds += self.model.predict_proba(self.sub.values)[:, 1] / self._folds.n_splits
            # Print results of current fold
            print("Fold %2d score : %.6f in [%5.1f]"
                  % (i_fold + 1, self.metric(val_y, oof_preds[val_idx]),
                     (time.time() - start) / 60))
            del trn_x
            del val_x
            gc.collect()
        # display OOF result
        oof_score = self.metric(self.target, oof_preds)
        print("Full OOF score : %.6f" % oof_score)
        # Check if we need to fit the model on the full dataset
        if (self.sub is not None) and not self.mean_submission:
            # Compute target averages
            for f in f_cats:
                ft = TargetAverageTransformation(feature_name=f,
                                                 average=TargetAverageTransformation.MEAN,
                                                 min_samples_leaf=200,
                                                 smoothing=10,
                                                 noise_level=0)
                self.trn[f + "_avg"] = ft.fit_transform(data=self.trn, target=self.target)
                self.sub[f + "_avg"] = ft.transform(data=self.sub)
            # Fit model
            self.model.fit(self.trn, self.target)
            # Compute prediction for submission
            sub_preds = self.model.predict_proba(self.sub)[:, 1]
        if self.do_submission:
            # Filename encodes the id, the OOF score and the timestamp.
            filename = "../output_preds/" + self.id
            filename += str(int(1e6 * oof_score)) + "_"
            filename += self.curr_date.strftime("%Y_%m_%d_%Hh%M")
            # Save OOF predictions for stacking
            self.trn[self.id] = oof_preds
            self.trn[[self.id]].to_csv(filename + "_oof.csv", float_format="%.9f")
            # Save submission prediction for stacking or submission
            self.sub["target"] = sub_preds
            self.sub[["target"]].to_csv(filename + "_sub.csv", float_format="%.9f")