'reg_lambda': .1, 'subsample': .9, 'min_split_gain': .01, 'min_child_weight': 2, 'colsample_bytree': .9, # Subsample ratio of columns when constructing each tree. 'scale_pos_weight': 9, # because training data is unbalanced 'verbose': -1 } features = list(pd.read_csv(DATA_FOLDER + '/v3/importances.csv', index_col=0).head(800).index) train_features = [*features, "target"] train = pd.read_pickle(DATA_FOLDER + '/v3/train.pkl')[train_features] folds = prepare_folds(train) models, result = train_folds(folds, config) test = load_test(DATA_FOLDER + '/v3/test.pkl')[features] test_target = evaluate(models, test) print("AUC: %.4f, F1: %.4f" % (result['auc'], result['f1'])) importance = result['importances'].groupby(['feature']) \ .agg({'importance': 'mean'}) \ .sort_values(by="importance", ascending=False) importance.to_csv(DATA_FOLDER + "/v3/importances.csv") prepare_submission(test_target, "v3_AUC_%.4f_F1_%.4f" % (result['auc'], result['f1']))
} train = pd.read_pickle(DATA_FOLDER + '/v2/train.pkl') inter_features = list(pd.read_csv(DATA_FOLDER + '/v2/importances.csv', index_col=0).head(200).index) categorical = set(train.columns[train.dtypes == int]).intersection(inter_features) cats = [[x] for x in categorical] folds = prepare_folds(train) folds = mean_encode_train_fold(folds, cats) models, result = train_folds(folds, config) test = load_test(DATA_FOLDER + '/v2/test.pkl') test = mean_encode_test(train, test, cats) test_target = evaluate(models, test) print("AUC: %.4f, F1: %.4f" % (result['auc'], result['f1'])) importance = result['importances'].groupby(['feature']) \ .agg({'importance': 'mean'}) \ .sort_values(by="importance", ascending=False) importance.to_csv(DATA_FOLDER + "/v4/importances.csv") prepare_submission(test_target, "v4_AUC_%.4f_F1_%.4f" % (result['auc'], result['f1']))