import csv

from catboost import CatBoostClassifier, Pool


def select_best_feature(df, y, sorted_feature_name):
    auc_ls = []
    for k in range(8, len(sorted_feature_name), 1):
        selected_k_feature = sorted_feature_name[:k]
        print(selected_k_feature)
        train_len = int(len(df[selected_k_feature]) * 0.75)
        # train_set = df[selected_k_feature]
        # train_len = int(len(df[selected_k_feature])*0.75)
        # category_cols = [fea for fea in selected_k_feature if not fea.endswith("bin")]
        # categorical_features_indices = [df[selected_k_feature].columns.get_loc(i) for i in category_cols]
        # Train a default classifier on the first 75% of rows.
        model = CatBoostClassifier(iterations=50, random_seed=42, verbose=2).fit(
            X=df[selected_k_feature].iloc[:train_len], y=y[:train_len])
        # Mean AUC over all boosting iterations on the held-out 25%.
        metrics = model.eval_metrics(
            Pool(df[selected_k_feature].iloc[train_len:], y[train_len:]), ['AUC'])
        mean_auc = sum(metrics['AUC']) / float(len(metrics['AUC']))
        print((k, mean_auc))
        auc_ls.append((k, mean_auc))
    # Keep the k with the highest mean AUC.
    sorted_ll = sorted(auc_ls, key=lambda x: x[1], reverse=True)
    print(sorted_ll)
    best_k = sorted_ll[0][0]
    print(best_k)
    selected_k_feature = sorted_feature_name[:best_k]
    print(selected_k_feature)
    with open("tencent_stats.csv", 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["the selected best k features"])
        writer.writerow(selected_k_feature)
    return selected_k_feature
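# Hypothetical usage of select_best_feature: the caller is expected to pass
# feature names already sorted by importance; the helper then picks the best
# prefix length by held-out AUC. The DataFrame and labels below are synthetic,
# for illustration only.
import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
df = pd.DataFrame(rng.rand(1000, 12), columns=[f"f{i}" for i in range(12)])
y = (df["f0"] + df["f1"] > 1.0).astype(int)
sorted_feature_name = list(df.columns)  # e.g. ordered by feature importance
best_features = select_best_feature(df, y, sorted_feature_name)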
import matplotlib.pyplot as plt

from catboost import Pool
from catboost.utils import (get_confusion_matrix, get_fnr_curve, get_fpr_curve,
                            select_threshold)

# Plot FPR and FNR as functions of the decision threshold.
(thresholds, fpr) = get_fpr_curve(curve=curve)
(thresholds, fnr) = get_fnr_curve(curve=curve)
plt.figure(figsize=(16, 8))
lw = 2
plt.plot(thresholds, fpr, color='blue', lw=lw, label='FPR', alpha=0.5)
plt.plot(thresholds, fnr, color='green', lw=lw, label='FNR', alpha=0.5)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(True)
plt.xlabel('Threshold', fontsize=16)
plt.ylabel('Error Rate', fontsize=16)
# plt.title('FPR-FNR curves', fontsize=20)
plt.legend(loc="lower left", fontsize=16)
plt.show()

# Find thresholds that bound FNR and FPR respectively.
print(select_threshold(model=model, data=eval_train_pool, FNR=0.2))
print(select_threshold(model=model, data=eval_train_pool, FPR=0.4))

# Confusion matrix.
print(get_confusion_matrix(model, data=eval_pool))

# Result show.
test_pool = Pool(X_test, y_test, cat_features=categorical_features_indices)
model.get_all_params()  # params
model.eval_metrics(data=eval_pool, metrics='Recall')
model.score(test_pool)
result = model.predict_proba(eval_test_pool)
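# A manual counterpart to the select_threshold(FNR=0.2) call above (a sketch,
# assuming the thresholds/fnr arrays computed earlier in this block): take the
# largest threshold whose false-negative rate still stays within the bound.
import numpy as np

thr = np.asarray(thresholds)
ok = np.asarray(fnr) <= 0.2
print(thr[ok].max())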
import numpy as np

from catboost import CatBoostClassifier, Pool

# NB: fit_transform refits the transformer on validation data;
# p.transform(fea_val_top100) is the usual leak-free choice.
fea_val_new = p.fit_transform(fea_val_top100)
model_new = CatBoostClassifier(
    learning_rate=0.2,
    # l2_leaf_reg=2,
    # random_strength=3,
    iterations=210,
    eval_metric='Accuracy',
    random_seed=42,
    logging_level='Verbose',
    use_best_model=True,
    task_type='GPU',
)
model_new.fit(fea_train_new, labs_train, eval_set=(fea_val_new, labs_val))

# Collect per-iteration AUC values (note: this evaluates `model`, not `model_new`).
eval_metrics = model.eval_metrics(Pool(fea_val, labs_val), ['AUC'])
ee = []
for e in eval_metrics:
    ee.append(eval_metrics[e])
ee = np.array(ee)

# # Predict on the validation and test sets.
# print('Start predicting...')
# y_pred_val = model.predict(fea_val)
# print('The acc of prediction is:', sum(labs_val == y_pred_val.squeeze()) / len(y_pred_val))
#
# y_pred_test = model.predict(fea_test)
# y_pred_test = (y_pred_test.squeeze()).astype(np.uint8)
#
# f = open(r"data/out-label-catboost.txt", "w+")
# cnt = 0
import json
import multiprocessing
import os
import sys
import time

import numpy as np
from catboost import CatBoostClassifier, Pool
from plotly.offline import plot
from scipy.stats import wilcoxon


def catboost_bootstrap(dir_, learn_name, test_name, cd_file, classes,
                       learning_rate=None, border_count=32, cnt_values=20,
                       file_result_to=sys.stdout, file_info_to=sys.stdout,
                       iterations=1500):
    logloss = {}
    auc = {}
    for clazz in classes:
        print('class={}'.format(clazz.WRAPPER_NAME))
        print('class={}; step={}'.format(clazz.WRAPPER_NAME, learning_rate[clazz]),
              file=file_result_to)
        file_result_to.flush()
        auc[clazz.WRAPPER_NAME] = []
        logloss[clazz.WRAPPER_NAME] = []
        tree_counts = []
        logloss_curves = []
        auc_curves = []
        cl = clazz()
        source_learn_pool = Pool(data=os.path.join(dir_, learn_name),
                                 column_description=os.path.join(dir_, cd_file))
        beg = time.time()
        learn_pool = cl.handle_learn_pool(source_learn_pool)
        end = time.time()
        print('!!!time: {}'.format(end - beg), file=file_info_to)
        print('priors: {}'.format(cl.prior), file=file_info_to)
        print('prior scores: {}'.format(cl.score), file=file_info_to)
        file_info_to.flush()
        source_test_pool = Pool(data=os.path.join(dir_, test_name),
                                column_description=os.path.join(dir_, cd_file))
        source_test_label = np.array(source_test_pool.get_label())
        source_test_features = np.array(source_test_pool.get_features())
        cat = CatBoostClassifier(max_ctr_complexity=1, custom_metric='AUC',
                                 boosting_type='Plain', random_seed=0,
                                 border_count=border_count, iterations=iterations,
                                 learning_rate=learning_rate[clazz],
                                 thread_count=multiprocessing.cpu_count())
        beg = time.time()
        cat.fit(learn_pool, use_best_model=True)
        end = time.time()
        # Bootstrap the test set to estimate the variance of AUC and logloss.
        for seed in range(cnt_values):
            idx = list(range(source_test_features.shape[0]))
            np.random.seed(seed * 10 + 300)
            boot_idx = np.random.choice(idx, len(idx), replace=True)
            boot_test_features = source_test_features[boot_idx]
            boot_test_label = source_test_label[boot_idx]
            X, y = cl.handle_test_matrix(boot_test_features, boot_test_label, False)
            metrics = cat.eval_metrics(Pool(X, y), ['Logloss', 'AUC'], eval_period=1,
                                       thread_count=multiprocessing.cpu_count())
            for num, loss in enumerate(metrics['Logloss']):
                print('iter={:10}: loss={:.10}'.format(num + 1, loss))
            cnt_trees = np.argmin(metrics['Logloss'])
            print('choose cnt_trees={}'.format(cnt_trees))
            print('overfit={}; AUC={}; logloss={}'.format(
                cnt_trees, metrics['AUC'][cnt_trees], metrics['Logloss'][cnt_trees]),
                file=file_result_to)
            tree_counts.append(cnt_trees)
            file_result_to.flush()
            logloss_curves.append(metrics['Logloss'])
            auc_curves.append(metrics['AUC'])
            auc[clazz.WRAPPER_NAME].append(metrics['AUC'][cnt_trees])
            logloss[clazz.WRAPPER_NAME].append(metrics['Logloss'][cnt_trees])
        print('class={}, learn_time={}, mean_tree_count={}'.format(
            clazz.WRAPPER_NAME, end - beg, sum(tree_counts) / len(tree_counts)),
            file=file_result_to)
        print('mean_AUC={}, mean_logloss={}'.format(
            sum(auc[clazz.WRAPPER_NAME]) / len(auc[clazz.WRAPPER_NAME]),
            sum(logloss[clazz.WRAPPER_NAME]) / len(logloss[clazz.WRAPPER_NAME])),
            file=file_result_to)
        file_result_to.flush()
        logloss_fig = create_learning_curves_plot(
            logloss_curves, 'logloss {}'.format(clazz.WRAPPER_NAME))
        auc_fig = create_learning_curves_plot(
            auc_curves, 'AUC {}'.format(clazz.WRAPPER_NAME))
        logloss_file = os.path.join(dir_,
                                    'fig_{}_{}'.format('Logloss', clazz.WRAPPER_NAME))
        AUC_file = os.path.join(dir_, 'fig_{}_{}'.format('AUC', clazz.WRAPPER_NAME))
        plot(logloss_fig, filename=logloss_file, auto_open=False)
        plot(auc_fig, filename=AUC_file, auto_open=False)
    file_name = os.path.join(dir_, 'boot.txt')
    with open(file_name, 'w') as file_to:
        json.dump(auc, file_to)
    # Pairwise Wilcoxon signed-rank tests on the bootstrapped AUC samples.
    for cl1 in classes:
        for cl2 in classes:
            stat, p_value = wilcoxon(auc[cl1.WRAPPER_NAME], auc[cl2.WRAPPER_NAME],
                                     zero_method="pratt")
            print('for {} & {}: stat: {}, p_value: {}'.format(
                cl1.WRAPPER_NAME, cl2.WRAPPER_NAME, stat, p_value),
                file=file_result_to)
import pandas as pd

from catboost import Pool

preds_class = model.predict(test_pool, prediction_type='Class')
preds_prob = model.predict(test_pool, prediction_type='Probability')
print('Class', preds_class, preds_class.shape)
print('Prob', preds_prob, preds_prob.shape)

for i in range(len(features)):
    print(features[i], model.feature_importances_[i])

eval_metrics = model.eval_metrics(test_pool, ['AUC'])
print('Evaluation AUC on Test pool')
print(eval_metrics)

################################################################################
# save submission info
################################################################################
print('running submission data through trained classifier')
submission_data = pd.read_csv('clean_submit.csv', index_col=0, dtype=object)
submission_data = submission_data[features]
submission_pool = Pool(submission_data)
submission_prob = model.predict(submission_pool, prediction_type='Probability')
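# A sketch (not from the original) of dumping the positive-class probabilities
# to a submission file; the column names here are hypothetical.
submission = pd.DataFrame({'id': submission_data.index,
                           'prob': submission_prob[:, 1]})
submission.to_csv('submission.csv', index=False)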
feature_names = X_train.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

# This shows that the features **`Sex`** and **`Pclass`** had the biggest influence on the result.

# ### 3.9 Eval Metrics
#
# CatBoost has an `eval_metrics` method that calculates given metrics on a given dataset, and can plot them too, of course :)

# In[31]:

model = CatBoostClassifier(iterations=50, random_seed=42,
                           logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, ['AUC'], plot=True)

# In[32]:

print(eval_metrics['AUC'][:6])

# ### 3.10 Learning Processes Comparison
#
# You can also compare the learning processes of different models on a single plot.

# In[33]:

model1 = CatBoostClassifier(iterations=10, depth=1, train_dir='model_depth_1/',
                            logging_level='Silent')
model1.fit(train_pool, eval_set=validate_pool)
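# The excerpt stops after the first model. A sketch of finishing the
# comparison with catboost's MetricVisualizer; the second model's settings
# are assumptions, not taken from the original notebook.
model2 = CatBoostClassifier(iterations=10, depth=5, train_dir='model_depth_5/',
                            logging_level='Silent')
model2.fit(train_pool, eval_set=validate_pool)

from catboost import MetricVisualizer
MetricVisualizer(['model_depth_1/', 'model_depth_5/']).start()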
    depth=10,
    # l2_leaf_reg=10,
    # border_count=254,
    verbose=True,
    use_best_model=True,
    scale_pos_weight=scale_pos_weight,
    eval_metric='Precision',
    thread_count=int(cpus / 2),
    loss_function='Logloss')

# Train the model on training data.
model.fit(cattrain, eval_set=cattest, plot=False)
print(model.get_best_iteration())

# Precision metrics at the best iteration.
train_precision = model.eval_metrics(cattrain, "Precision")
print("train precision", train_precision.get("Precision")[model.get_best_iteration()])
ptrain = train_precision.get("Precision")[model.get_best_iteration()]

test_precision = model.eval_metrics(cattest, "Precision")
print("test precision", test_precision.get("Precision")[model.get_best_iteration()])
ptest = test_precision.get("Precision")[model.get_best_iteration()]

valid_precision = model.eval_metrics(catvalid, "Precision")
print("valid precision", valid_precision.get("Precision")[model.get_best_iteration()])
pvalid = valid_precision.get("Precision")[model.get_best_iteration()]

model.save_model("Catboost_Sol")
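# scale_pos_weight is used above but never defined in this excerpt. A common
# way to set it for imbalanced binary data (a sketch; the helper name and the
# assumption of 0/1 labels are mine, not the original's):
import numpy as np


def compute_scale_pos_weight(y_train):
    # Negative-to-positive ratio: up-weights the minority positive class.
    y = np.asarray(y_train)
    return (y == 0).sum() / max((y == 1).sum(), 1)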
# Companion to catboost_bootstrap above; relies on the same imports and on the
# module-level globals learn_name, test_name and cd_file.
def catboost_test(dir_, cur_learn_name, cur_test_name, clazz, learning_rate=None,
                  border_count=128, cnt_models=1, file_result_to=sys.stdout,
                  file_info_to=sys.stdout, iterations=1500):
    full_learn_name = os.path.join(dir_, cur_learn_name)
    full_test_name = os.path.join(dir_, cur_test_name)
    if not os.path.exists(full_learn_name):
        source_learn_pool = Pool(data=os.path.join(dir_, learn_name),
                                 column_description=os.path.join(dir_, cd_file))
        source_test_pool = Pool(data=os.path.join(dir_, test_name),
                                column_description=os.path.join(dir_, cd_file))
        cl = clazz()
        beg = time.time()
        learn_pool = cl.handle_learn_pool(source_learn_pool)
        test_pool = cl.handle_test_pool(source_test_pool)
        end = time.time()
        print('!!!time: {}'.format(end - beg), file=file_info_to)
        print('priors: {}'.format(cl.prior), file=file_info_to)
        print('prior scores: {}'.format(cl.score), file=file_info_to)
        file_info_to.flush()
        # Cache the transformed pools as TSV so later runs can reuse them.
        learn_label = learn_pool.get_label()
        learn_features = learn_pool.get_features()
        learn_data = np.zeros((len(learn_label), len(learn_features[0]) + 1))
        learn_data[:, 0] = learn_label
        learn_data[:, 1:] = learn_features
        np.savetxt(full_learn_name, learn_data, delimiter='\t', fmt='%.10f')
        test_label = test_pool.get_label()
        test_features = test_pool.get_features()
        test_data = np.zeros((len(test_label), len(test_features[0]) + 1))
        test_data[:, 0] = test_label
        test_data[:, 1:] = test_features
        np.savetxt(full_test_name, test_data, delimiter='\t', fmt='%.10f')
    learn_pool = Pool(data=full_learn_name)
    test_pool = Pool(data=full_test_name)
    scores = []
    auc = []
    logloss = []
    times = []
    tree_counts = []
    for seed in range(cnt_models):
        print(seed)
        # print(len(learn_pool.get_features()), len(learn_pool.get_features()[0]))
        # print(len(test_pool.get_features()), len(test_pool.get_features()[0]))
        beg = time.time()
        cat = CatBoostClassifier(max_ctr_complexity=1, custom_metric='AUC',
                                 boosting_type='Plain', random_seed=seed,
                                 border_count=border_count, iterations=iterations,
                                 learning_rate=learning_rate,
                                 thread_count=multiprocessing.cpu_count())
        cat.fit(learn_pool, eval_set=test_pool, use_best_model=True)
        end = time.time()
        X_test = test_pool.get_features()
        y_test = test_pool.get_label()
        tree_counts.append(cat.tree_count_)
        scores.append(cat.score(X_test, y_test))
        # eval_period=tree_count_-1 keeps only the first and last iterations,
        # so index 1 is the final-model value of each metric.
        metrics = cat.eval_metrics(test_pool, ['AUC', 'Logloss'],
                                   eval_period=cat.tree_count_ - 1)
        print('overfit={}; acc={}; AUC={}; logloss={}; learn_time={}'.format(
            cat.tree_count_, scores[-1], metrics['AUC'][1], metrics['Logloss'][1],
            end - beg), file=file_result_to)
        file_result_to.flush()
        auc.append(metrics['AUC'][1])
        logloss.append(metrics['Logloss'][1])
        times.append(end - beg)
    if len(tree_counts) != 0:
        print('mean tree_count: {}'.format(sum(tree_counts) / len(tree_counts)),
              file=file_result_to)
        return (sum(scores) / len(scores), sum(auc) / len(auc),
                sum(logloss) / len(logloss), sum(times) / len(times))
    else:
        return 0, 0, 0, 0
calculate them using y_true and the predicted values.
model = CatBoostClassifier(**model_params, verbose=False,
                           custom_metric=['Logloss', 'AUC', 'F1', 'PRAUC'])
'''
model = CatBoostClassifier(**model_params)
# model = CatBoostClassifier(**model_params, verbose=False)
model.fit(train_data, eval_set=test_data, verbose=False, plot=False)
model.save_model(f"{project_dir}/model/model.cbm")
builtin_metrics = model.eval_metrics(train_data,
                                     metrics=['Logloss', 'AUC', 'F1', 'PRAUC'])

# Write results.
hold_out_score = model.get_best_score()
# write_eval_summary_file(cv_scores, hold_out_score)
predict_probas = model.predict_proba(test_data)
test_data_metrics = calculate_metrics(build_spec['standard_metrics'],
                                      build_spec['custom_metrics'],
                                      y_test, predict_probas[:, 1])
write_eval_summary_file(cv_scores, test_data_metrics)
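# calculate_metrics and write_eval_summary_file are project helpers that are
# not shown here. A minimal scikit-learn sketch of computing the same four
# metrics from y_true and the positive-class probabilities (the function name
# and signature are assumptions):
from sklearn.metrics import (average_precision_score, f1_score, log_loss,
                             roc_auc_score)


def calculate_basic_metrics(y_true, proba_pos):
    # proba_pos: predicted probability of the positive class.
    return {
        'Logloss': log_loss(y_true, proba_pos),
        'AUC': roc_auc_score(y_true, proba_pos),
        'F1': f1_score(y_true, proba_pos >= 0.5),
        'PRAUC': average_precision_score(y_true, proba_pos),
    }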