Example #1
import csv

from catboost import CatBoostClassifier, Pool


def select_best_feature(df, y, sorted_feature_name):
    auc_ls = []
    for k in range(8, len(sorted_feature_name), 1):
        selected_k_feature = sorted_feature_name[:k]
        print(selected_k_feature)
        train_len = int(len(df[selected_k_feature]) * 0.75)
        # train_set = df[selected_k_feature]
        # train_len = int(len(df[selected_k_feature])*0.75)
        # category_cols = [fea for fea in selected_k_feature if not fea.endswith("bin")]
        # train default classifier
        # categorical_features_indices = [df[selected_k_feature].columns.get_loc(i) for i in category_cols]
        model = CatBoostClassifier(
            iterations=50, random_seed=42,
            verbose=2).fit(X=df[selected_k_feature].iloc[:train_len],
                           y=y[:train_len])
        metrics = model.eval_metrics(
            Pool(df[selected_k_feature].iloc[train_len:], y[train_len:]),
            ['AUC'])
        mean_auc = sum(metrics['AUC']) / float(len(metrics['AUC']))
        print((k, mean_auc))
        auc_ls.append((k, mean_auc))
    sorted_ll = sorted(auc_ls, key=lambda x: x[1], reverse=True)
    print(sorted_ll)
    best_k = sorted_ll[0][0]
    print(best_k)
    selected_k_feature = sorted_feature_name[:best_k]
    print(selected_k_feature)
    with open("tencent_stats.csv", 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["the selected best k features"])
        writer.writerow(selected_k_feature)
    return selected_k_feature
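# A minimal usage sketch (not part of the original): rank features with a
# baseline model, then let select_best_feature pick the AUC-optimal prefix.
# `df` and `y` are the numeric training frame/labels assumed above; the
# column name returned by get_feature_importance(prettified=True) may differ
# across CatBoost versions.
base_model = CatBoostClassifier(iterations=50, random_seed=42, verbose=0)
base_model.fit(df, y)
ranking = base_model.get_feature_importance(prettified=True)
sorted_feature_name = ranking['Feature Id'].tolist()  # most important first
best_features = select_best_feature(df, y, sorted_feature_name)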
# `curve` is assumed to be the ROC curve computed earlier, e.g. with
# catboost.utils.get_roc_curve(model, eval_pool); `fpr` is needed for the plot below.
(thresholds, fpr) = get_fpr_curve(curve=curve)
(thresholds, fnr) = get_fnr_curve(curve=curve)
plt.figure(figsize=(16, 8))
lw = 2
plt.plot(thresholds, fpr, color='blue', lw=lw, label='FPR', alpha=0.5)
plt.plot(thresholds, fnr, color='green', lw=lw, label='FNR', alpha=0.5)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(True)
plt.xlabel('Threshold', fontsize=16)
plt.ylabel('Error Rate', fontsize=16)
#plt.title('FPR-FNR curves', fontsize=20)
plt.legend(loc="lower left", fontsize=16)
plt.show()
# find threshold
from catboost.utils import select_threshold
print(select_threshold(model=model, data=eval_train_pool, FNR=0.2))
print(select_threshold(model=model, data=eval_train_pool, FPR=0.4))

# confusion matrix
from catboost.utils import get_confusion_matrix
print(get_confusion_matrix(model, data=eval_pool))

# result show
from catboost import Pool
test_pool = Pool(X_test, y_test, cat_features=categorical_features_indices)
model.get_all_params()  # params

model.eval_metrics(data=eval_pool, metrics='Recall')
model.score(test_pool)
result = model.predict_proba(eval_test_pool)
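# Illustrative follow-up (not in the original snippet): apply the FNR-based
# threshold chosen above to the predicted probabilities.
threshold = select_threshold(model=model, data=eval_train_pool, FNR=0.2)
pred_labels = (result[:, 1] > threshold).astype(int)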
Example #3
    # `p` is assumed to be a preprocessing/feature-selection transformer defined earlier
    fea_val_new = p.fit_transform(fea_val_top100)

    model_new = CatBoostClassifier(
        learning_rate=0.2,
        #            l2_leaf_reg = 2,
        #            random_strength = 3,
        iterations=210,
        eval_metric='Accuracy',
        random_seed=42,
        logging_level='Verbose',
        use_best_model=True,
        task_type='GPU',
    )
    model_new.fit(fea_train_new, labs_train, eval_set=(fea_val_new, labs_val))

    eval_metrics = model.eval_metrics(Pool(fea_val, labs_val), ['AUC'])
    ee = []
    for e in eval_metrics:
        ee.append(eval_metrics[e])
    ee = np.array(ee)
#
#    # predict on the dataset
#    print('Start predicting...')
#    y_pred_val = model.predict(fea_val)
#    print('The acc of prediction is:', sum(labs_val==y_pred_val.squeeze()) / len(y_pred_val))
#
#    y_pred_test = model.predict(fea_test)
#    y_pred_test = (y_pred_test.squeeze()).astype(np.uint8)
#
#    f = open(r"data/out-label-catboost.txt", "w+")
#    cnt = 0
Example #4
import json
import multiprocessing
import os
import sys
import time

import numpy as np
from catboost import CatBoostClassifier, Pool
from plotly.offline import plot  # assumed source of `plot`; adjust if another backend is used
from scipy.stats import wilcoxon

# `create_learning_curves_plot` is assumed to be defined elsewhere in the original module.


def catboost_bootstrap(dir_,
                       learn_name,
                       test_name,
                       cd_file,
                       classes,
                       learning_rate=None,
                       border_count=32,
                       cnt_values=20,
                       file_result_to=sys.stdout,
                       file_info_to=sys.stdout,
                       iterations=1500):
    logloss = {}
    auc = {}
    for clazz in classes:
        print('class={}'.format(clazz.WRAPPER_NAME))
        print('class={}; step={}'.format(clazz.WRAPPER_NAME,
                                         learning_rate[clazz]),
              file=file_result_to)
        file_result_to.flush()
        auc[clazz.WRAPPER_NAME] = []
        logloss[clazz.WRAPPER_NAME] = []
        tree_counts = []
        logloss_curves = []
        auc_curves = []

        cl = clazz()
        source_learn_pool = Pool(data=os.path.join(dir_, learn_name),
                                 column_description=os.path.join(
                                     dir_, cd_file))
        beg = time.time()
        learn_pool = cl.handle_learn_pool(source_learn_pool)
        end = time.time()
        print('!!!time: {}'.format(end - beg), file=file_info_to)
        print('priors: {}'.format(cl.prior), file=file_info_to)
        print('prior scores: {}'.format(cl.score), file=file_info_to)
        file_info_to.flush()

        source_test_pool = Pool(data=os.path.join(dir_, test_name),
                                column_description=os.path.join(dir_, cd_file))
        source_test_label = np.array(source_test_pool.get_label())
        source_test_features = np.array(source_test_pool.get_features())

        cat = CatBoostClassifier(max_ctr_complexity=1,
                                 custom_metric='AUC',
                                 boosting_type='Plain',
                                 random_seed=0,
                                 border_count=border_count,
                                 iterations=iterations,
                                 learning_rate=learning_rate[clazz],
                                 thread_count=multiprocessing.cpu_count())
        beg = time.time()
        cat.fit(learn_pool, use_best_model=True)
        end = time.time()

        for seed in range(cnt_values):
            idx = list(range(source_test_features.shape[0]))
            np.random.seed(seed * 10 + 300)
            boot_idx = np.random.choice(idx, len(idx), replace=True)
            boot_test_features = source_test_features[boot_idx]
            boot_test_label = source_test_label[boot_idx]
            X, y = cl.handle_test_matrix(boot_test_features, boot_test_label,
                                         False)
            metrics = cat.eval_metrics(
                Pool(X, y), ['Logloss', 'AUC'],
                eval_period=1,
                thread_count=multiprocessing.cpu_count())
            for num, loss in enumerate(metrics['Logloss']):
                print('iter={:10}:     loss={:.10}'.format(num + 1, loss))
            cnt_trees = np.argmin(metrics['Logloss'])
            print('choose cnt_trees={}'.format(cnt_trees))
            print('overfit={}; AUC={}; logloss={}'.format(
                cnt_trees, metrics['AUC'][cnt_trees],
                metrics['Logloss'][cnt_trees]),
                  file=file_result_to)
            tree_counts.append(cnt_trees)
            file_result_to.flush()
            logloss_curves.append(metrics['Logloss'])
            auc_curves.append(metrics['AUC'])
            auc[clazz.WRAPPER_NAME].append(metrics['AUC'][cnt_trees])
            logloss[clazz.WRAPPER_NAME].append(metrics['Logloss'][cnt_trees])

        print('class={}, learn_time={}, mean_tree_count={}'.format(
            clazz.WRAPPER_NAME, end - beg,
            sum(tree_counts) / len(tree_counts)),
              file=file_result_to)
        print('mean_AUC={}, mean_logloss={}'.format(
            sum(auc[clazz.WRAPPER_NAME]) / len(auc[clazz.WRAPPER_NAME]),
            sum(logloss[clazz.WRAPPER_NAME]) /
            len(logloss[clazz.WRAPPER_NAME])),
              file=file_result_to)
        file_result_to.flush()

        logloss_fig = create_learning_curves_plot(
            logloss_curves, 'logloss {}'.format(clazz.WRAPPER_NAME))
        auc_fig = create_learning_curves_plot(
            auc_curves, 'AUC {}'.format(clazz.WRAPPER_NAME))
        logloss_file = os.path.join(
            dir_, 'fig_{}_{}'.format('Logloss', clazz.WRAPPER_NAME))
        AUC_file = os.path.join(dir_,
                                'fig_{}_{}'.format('AUC', clazz.WRAPPER_NAME))
        plot(logloss_fig, filename=logloss_file, auto_open=False)
        plot(auc_fig, filename=AUC_file, auto_open=False)

    file_name = os.path.join(dir_, 'boot.txt')
    with open(file_name, 'w') as file_to:
        json.dump(auc, file_to)

    for cl1 in classes:
        for cl2 in classes:
            stat, p_value = wilcoxon(auc[cl1.WRAPPER_NAME],
                                     auc[cl2.WRAPPER_NAME],
                                     zero_method="pratt")
            print('for {} & {}: stat: {}, p_value: {}'.format(
                cl1.WRAPPER_NAME, cl2.WRAPPER_NAME, stat, p_value),
                  file=file_result_to)
Example #5
preds_class = model.predict(test_pool, prediction_type='Class')
preds_prob = model.predict(test_pool, prediction_type='Probability')

print('Class', preds_class, preds_class.shape)
print('Prob', preds_prob, preds_prob.shape)



for i in range(len(features)):
    print(features[i], model.feature_importances_[i])

    


eval_metrics = model.eval_metrics(test_pool, ['AUC'])
print('Evaluation AUC on Test pool')
print(eval_metrics)



################################################################################
# save submission info
###################################################################################
print('running submission data through trained classifier')
submission_data = pd.read_csv('clean_submit.csv', index_col=0, dtype=object)
submission_data = submission_data[features]

submission_pool = Pool(submission_data)
submission_prob = model.predict(submission_pool, prediction_type='Probability')
# `feature_importances` is assumed to come from model.get_feature_importance(train_pool)
feature_names = X_train.columns
for score, name in sorted(zip(feature_importances, feature_names),
                          reverse=True):
    print('{}: {}'.format(name, score))

# This shows that features **`Sex`** and **`Pclass`** had the biggest influence on the result.

# ### 3.9 Eval Metrics
# CatBoost has an `eval_metrics` method that allows you to calculate given metrics on a given dataset, and to plot them as well :)

# In[31]:

model = CatBoostClassifier(iterations=50,
                           random_seed=42,
                           logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, ['AUC'], plot=True)

# In[32]:

print(eval_metrics['AUC'][:6])

# ### 3.10 Learning Processes Comparison
# You can also compare the learning processes of different models on a single plot.

# In[33]:

model1 = CatBoostClassifier(iterations=10,
                            depth=1,
                            train_dir='model_depth_1/',
                            logging_level='Silent')
model1.fit(train_pool, eval_set=validate_pool)
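# A likely continuation of this comparison (not in the snippet above): train a
# second, deeper model into its own train_dir and visualize both runs together.
# depth=5 and the MetricVisualizer call are assumptions for illustration.

model2 = CatBoostClassifier(iterations=10,
                            depth=5,
                            train_dir='model_depth_5/',
                            logging_level='Silent')
model2.fit(train_pool, eval_set=validate_pool)

from catboost import MetricVisualizer
MetricVisualizer(['model_depth_1/', 'model_depth_5/']).start()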
Example #7
model = CatBoostClassifier(  # earlier keyword arguments are elided in the original snippet
    depth=10,
    #l2_leaf_reg = 10,
    #border_count=254,
    verbose=True,
    use_best_model=True,
    scale_pos_weight=scale_pos_weight,
    eval_metric='Precision',
    thread_count=int(cpus / 2),
    loss_function='Logloss')

# Train the model on training data
model.fit(cattrain, eval_set=cattest, plot=False)
print(model.get_best_iteration())

# Precision metrics
train_precision = model.eval_metrics(cattrain, "Precision")
print("train precision",
      train_precision.get("Precision")[model.get_best_iteration()])
ptrain = train_precision.get("Precision")[model.get_best_iteration()]

test_precision = model.eval_metrics(cattest, "Precision")
print("test precision",
      test_precision.get("Precision")[model.get_best_iteration()])
ptest = test_precision.get("Precision")[model.get_best_iteration()]

valid_precision = model.eval_metrics(catvalid, "Precision")
print("valid precision",
      valid_precision.get("Precision")[model.get_best_iteration()])
pvalid = valid_precision.get("Precision")[model.get_best_iteration()]

model.save_model("Catboost_Sol")
def catboost_test(dir_, cur_learn_name, cur_test_name, clazz, learning_rate=None, border_count=128, cnt_models=1,
                  file_result_to=sys.stdout, file_info_to=sys.stdout, iterations=1500):
    full_learn_name = os.path.join(dir_, cur_learn_name)
    full_test_name = os.path.join(dir_, cur_test_name)

    if not os.path.exists(full_learn_name):
        # `learn_name`, `test_name` and `cd_file` are assumed to be module-level globals in the original code
        source_learn_pool = Pool(data=os.path.join(dir_, learn_name), column_description=os.path.join(dir_, cd_file))
        source_test_pool = Pool(data=os.path.join(dir_, test_name), column_description=os.path.join(dir_, cd_file))
        cl = clazz()
        beg = time.time()
        learn_pool = cl.handle_learn_pool(source_learn_pool)
        test_pool = cl.handle_test_pool(source_test_pool)
        end = time.time()
        print('!!!time: {}'.format(end - beg), file=file_info_to)
        print('priors: {}'.format(cl.prior), file=file_info_to)
        print('prior scores: {}'.format(cl.score), file=file_info_to)
        file_info_to.flush()
        learn_label = learn_pool.get_label()
        learn_features = learn_pool.get_features()
        learn_data = np.zeros((len(learn_label), len(learn_features[0]) + 1))
        learn_data[:, 0] = learn_label
        learn_data[:, 1:] = learn_features
        np.savetxt(full_learn_name, learn_data, delimiter='\t', fmt='%.10f')
        test_label = test_pool.get_label()
        test_features = test_pool.get_features()
        test_data = np.zeros((len(test_label), len(test_features[0]) + 1))
        test_data[:, 0] = test_label
        test_data[:, 1:] = test_features
        np.savetxt(full_test_name, test_data, delimiter='\t', fmt='%.10f')

    learn_pool = Pool(data=full_learn_name)
    test_pool = Pool(data=full_test_name)

    scores = []
    auc = []
    logloss = []
    times = []
    tree_counts = []
    for seed in range(cnt_models):
        print(seed)
        # print(len(learn_pool.get_features()), len(learn_pool.get_features()[0]))
        # print(len(test_pool.get_features()), len(test_pool.get_features()[0]))
        beg = time.time()
        cat = CatBoostClassifier(max_ctr_complexity=1, custom_metric='AUC', boosting_type='Plain', random_seed=seed, border_count=border_count, iterations=iterations, learning_rate=learning_rate, thread_count=multiprocessing.cpu_count())
        cat.fit(learn_pool, eval_set=test_pool, use_best_model=True)
        end = time.time()
        X_test = test_pool.get_features()
        y_test = test_pool.get_label()

        tree_counts.append(cat.tree_count_)
        scores.append(cat.score(X_test, y_test))
        metrics = cat.eval_metrics(test_pool, ['AUC', 'Logloss'], eval_period=cat.tree_count_ - 1)
        print('overfit={}; acc={}; AUC={}; logloss={}; learn_time={}'.format(cat.tree_count_, scores[-1], metrics['AUC'][1], metrics['Logloss'][1], end - beg), file=file_result_to)
        file_result_to.flush()
        auc.append(metrics['AUC'][1])
        logloss.append(metrics['Logloss'][1])
        times.append(end - beg)
    if len(tree_counts) != 0:
        print('mean tree_count: {}'.format(sum(tree_counts)/len(tree_counts)), file=file_result_to)
        return sum(scores)/len(scores), sum(auc)/len(auc), sum(logloss)/len(logloss), sum(times)/len(times)
    else:
        return 0, 0, 0, 0
Example #9
calculate them using y_true and the predicted values.

model = CatBoostClassifier(**model_params, verbose=False, custom_metric=['Logloss', 'AUC', 'F1', 'PRAUC'])


'''

model = CatBoostClassifier(**model_params)

# model = CatBoostClassifier(**model_params, verbose=False)

model.fit(train_data, eval_set=test_data, verbose=False, plot=False)

model.save_model(f"{project_dir}/model/model.cbm")

builtin_metrics = model.eval_metrics(train_data,
                                     metrics=['Logloss', 'AUC', 'F1', 'PRAUC'])

# write results

hold_out_score = model.get_best_score()

# write_eval_summary_file(cv_scores, hold_out_score)

predict_probas = model.predict_proba(test_data)

test_data_metrics = calculate_metrics(build_spec['standard_metrics'],
                                      build_spec['custom_metrics'], y_test,
                                      predict_probas[:, 1])

write_eval_summary_file(cv_scores, test_data_metrics)
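# `calculate_metrics` above is a project-specific helper that is not shown here.
# A minimal sketch of what it might compute from y_true and the predicted
# probabilities, using scikit-learn (the metric names mirror the CatBoost ones
# listed above; this is an illustration, not the project's implementation):
from sklearn.metrics import average_precision_score, f1_score, log_loss, roc_auc_score

def sketch_calculate_metrics(y_true, proba_positive, threshold=0.5):
    preds = (proba_positive >= threshold).astype(int)
    return {
        'Logloss': log_loss(y_true, proba_positive),
        'AUC': roc_auc_score(y_true, proba_positive),
        'F1': f1_score(y_true, preds),
        'PRAUC': average_precision_score(y_true, proba_positive),
    }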