Exemplo n.º 1
0
def test_pool_after_fit():
    """Fitting a model on a pool must leave the pool's features untouched."""
    reference = Pool(TRAIN_FILE, column_description=CD_FILE)
    fitted = Pool(TRAIN_FILE, column_description=CD_FILE)
    assert _check_data(reference.get_features(), fitted.get_features())
    classifier = CatBoostClassifier(iterations=5, random_seed=0)
    classifier.fit(fitted)
    assert _check_data(reference.get_features(), fitted.get_features())
Exemplo n.º 2
0
def test_pool_after_fit():
    """Verify that CatBoostClassifier.fit does not mutate its training pool."""
    untouched = Pool(TRAIN_FILE, column_description=CD_FILE)
    trained_on = Pool(TRAIN_FILE, column_description=CD_FILE)
    assert _check_data(untouched.get_features(), trained_on.get_features())
    CatBoostClassifier(iterations=5, random_seed=0).fit(trained_on)
    assert _check_data(untouched.get_features(), trained_on.get_features())
Exemplo n.º 3
0
def test_no_cat_in_predict():
    """Predictions must agree whether raw mapped features or a Pool are given."""
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(train_pool)
    cat_indices = train_pool.get_cat_feature_indices()
    mapped = map_cat_features(test_pool.get_features(), cat_indices)
    pred_raw = model.predict(mapped)
    pred_pooled = model.predict(Pool(mapped, cat_features=cat_indices))
    assert _check_data(pred_raw, pred_pooled)
Exemplo n.º 4
0
def test_no_cat_in_predict():
    """Same predictions for plain mapped features and a Pool wrapping them."""
    train = Pool(TRAIN_FILE, column_description=CD_FILE)
    test = Pool(TEST_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=2, random_seed=0)
    clf.fit(train)
    indices = train.get_cat_feature_indices()
    first = clf.predict(map_cat_features(test.get_features(), indices))
    second = clf.predict(
        Pool(map_cat_features(test.get_features(), indices),
             cat_features=indices))
    assert _check_data(first, second)
Exemplo n.º 5
0
def test_load_df():
    """A Pool built from a pandas DataFrame must match one loaded from file."""
    file_pool = Pool(NAN_TRAIN_FILE, column_description=NAN_CD_FILE)
    frame = read_table(NAN_TRAIN_FILE, header=None)
    target = DataFrame(frame.iloc[:, TARGET_IDX])
    frame.drop([TARGET_IDX], axis=1, inplace=True)
    df_pool = Pool(frame, target, file_pool.get_cat_feature_indices())
    assert _check_data(file_pool.get_features(), df_pool.get_features())
    assert _check_data(file_pool.get_label(), df_pool.get_label())
Exemplo n.º 6
0
def test_load_df():
    """DataFrame-backed Pool equals the file-backed one: features and labels."""
    from_file = Pool(NAN_TRAIN_FILE, column_description=NAN_CD_FILE)
    table = read_table(NAN_TRAIN_FILE, header=None)
    y = DataFrame(table.iloc[:, TARGET_IDX])
    table.drop([TARGET_IDX], axis=1, inplace=True)
    categorical = from_file.get_cat_feature_indices()
    from_frame = Pool(table, y, categorical)
    # Compare both accessors pairwise between the two pools.
    for getter in ('get_features', 'get_label'):
        assert _check_data(getattr(from_file, getter)(),
                           getattr(from_frame, getter)())
Exemplo n.º 7
0
def test_load_series():
    """A Pool built from pandas Series must match one loaded from file."""
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    table = read_table(TRAIN_FILE, header=None)
    target = Series(table.iloc[:, TARGET_IDX])
    table.drop([TARGET_IDX], axis=1, inplace=True)
    rows = Series(list(table.values))
    series_pool = Pool(rows, target, file_pool.get_cat_feature_indices())
    assert _check_data(file_pool.get_features(), series_pool.get_features())
    assert _check_data(file_pool.get_label(), series_pool.get_label())
Exemplo n.º 8
0
def test_load_series():
    """Series of rows plus a Series label must round-trip through Pool."""
    baseline_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    raw = read_table(TRAIN_FILE, header=None)
    labels = Series(raw.iloc[:, TARGET_IDX])
    raw.drop([TARGET_IDX], axis=1, inplace=True)
    feature_rows = Series(list(raw.values))
    categorical = baseline_pool.get_cat_feature_indices()
    rebuilt = Pool(feature_rows, labels, categorical)
    assert _check_data(baseline_pool.get_features(), rebuilt.get_features())
    assert _check_data(baseline_pool.get_label(), rebuilt.get_label())
Exemplo n.º 9
0
def test_load_dumps():
    """A Pool parsed from a TSV dump must match the in-memory Pool."""
    rows, cols = 100, 10
    features = np.random.randint(10, size=(rows, cols))
    target = np.random.randint(2, size=rows)
    memory_pool = Pool(features, target)
    # One tab-separated line per sample: label first, then the features.
    dump_lines = [
        '\t'.join([str(target[i])] + [str(value) for value in features[i]])
        for i in range(rows)
    ]
    with open('test_data_dumps', 'w') as dump_file:
        dump_file.write('\n'.join(dump_lines))
    file_pool = Pool('test_data_dumps')
    assert _check_data(memory_pool.get_features(), file_pool.get_features())
    assert _check_data(memory_pool.get_label(), file_pool.get_label())
Exemplo n.º 10
0
def test_load_dumps():
    """Round-trip a random integer pool through a tab-separated text file."""
    shape = (100, 10)
    data = np.random.randint(10, size=shape)
    label = np.random.randint(2, size=shape[0])
    direct = Pool(data, label)
    text_rows = []
    for y, row in zip(label, data):
        text_rows.append('\t'.join([str(y)] + [str(v) for v in row]))
    with open('test_data_dumps', 'w') as out:
        out.write('\n'.join(text_rows))
    parsed = Pool('test_data_dumps')
    assert _check_data(direct.get_features(), parsed.get_features())
    assert _check_data(direct.get_label(), parsed.get_label())
Exemplo n.º 11
0
def test_load_generated():
    """Features and labels read back from a Pool equal the arrays fed in."""
    shape = (100, 10)
    matrix = np.round(np.random.normal(size=shape), decimals=3)
    target = np.random.randint(2, size=shape[0])
    generated = Pool(matrix, target)
    assert _check_data(generated.get_features(), matrix)
    assert _check_data(generated.get_label(), target)
Exemplo n.º 12
0
def test_non_ones_weight():
    """Train with per-object weights 1..N and canonize the resulting model."""
    source = Pool(TRAIN_FILE, column_description=CD_FILE)
    weights = np.arange(1, source.num_row() + 1)
    weighted = Pool(source.get_features(), source.get_label(), weight=weights)
    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(weighted)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
Exemplo n.º 13
0
def test_export_to_python_after_load():
    """A model exported to python after save/load must reproduce both the
    original and the reloaded model's RawFormulaVal predictions."""
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=40, random_seed=0)
    model.fit(train_pool)
    pred_model = model.predict(test_pool, prediction_type='RawFormulaVal')
    model.save_model(OUTPUT_MODEL_PATH)
    model_loaded = CatBoostClassifier()
    model_loaded.load_model(OUTPUT_MODEL_PATH)
    model_loaded.save_model(OUTPUT_PYTHON_MODEL_PATH,
                            format="python",
                            pool=train_pool)
    pred_model_loaded = model_loaded.predict(test_pool,
                                             prediction_type='RawFormulaVal')
    import os.path
    import sys
    # Make the generated 'model' module importable.
    sys.path.insert(0, os.path.dirname(OUTPUT_PYTHON_MODEL_PATH))
    from model import apply_catboost_model as apply_catboost_model_from_python
    cat_indices = train_pool.get_cat_feature_indices()
    hash_to_string = test_pool.get_cat_feature_hash_to_string()
    pred_python = []
    for row in test_pool.get_features():
        floats, cats = _split_features(row, cat_indices, hash_to_string)
        pred_python.append(apply_catboost_model_from_python(floats, cats))
    assert _check_data(pred_model, pred_python)
    assert _check_data(pred_model_loaded, pred_python)
Exemplo n.º 14
0
def test_fit_data():
    """fit() from raw arrays with weights, baseline and eval_set must canonize."""
    pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    eval_pool = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)
    base_model = CatBoostClassifier(iterations=2,
                                    random_seed=0,
                                    loss_function="MultiClass")
    base_model.fit(pool)
    # Raw-formula outputs of the warm-up model become the baselines.
    baseline = np.array(
        base_model.predict(pool, prediction_type='RawFormulaVal'))
    eval_pool.set_baseline(np.array(
        base_model.predict(eval_pool, prediction_type='RawFormulaVal')))
    model = CatBoostClassifier(iterations=2,
                               random_seed=0,
                               loss_function="MultiClass")
    cat_indices = pool.get_cat_feature_indices()
    data = map_cat_features(pool.get_features(), cat_indices)
    model.fit(data,
              pool.get_label(),
              cat_indices,
              sample_weight=np.arange(1, pool.num_row() + 1),
              baseline=baseline,
              use_best_model=True,
              eval_set=eval_pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 15
0
def test_load_generated():
    """Pool preserves generated float features and integer labels verbatim."""
    rows, cols = 100, 10
    features = np.round(np.random.normal(size=(rows, cols)), decimals=3)
    labels = np.random.randint(2, size=rows)
    pool = Pool(features, labels)
    assert _check_data(pool.get_features(), features)
    assert _check_data(pool.get_label(), labels)
Exemplo n.º 16
0
def test_zero_baseline():
    """An all-zero baseline must still train and produce a canonical model."""
    source = Pool(TRAIN_FILE, column_description=CD_FILE)
    zero_base = np.zeros((source.num_row(), 2))
    pool = Pool(source.get_features(), source.get_label(), baseline=zero_base)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
Exemplo n.º 17
0
def test_non_zero_bazeline():
    """Train on top of a non-trivial baseline from a base model and canonize.

    NOTE(review): "bazeline" is a typo of "baseline"; the name is kept so
    test discovery and any canonical data keyed to it stay stable.
    """
    source = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    base_model = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    base_model.fit(source)
    baseline = np.array(base_model.predict(source))
    boosted = Pool(source.get_features(), source.get_label(), baseline=baseline)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(boosted)
    model.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
Exemplo n.º 18
0
def test_fit_data():
    """End-to-end fit from raw arrays with weights, baseline and eval set."""
    train = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    evaluation = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)
    warm = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    warm.fit(train)
    train_baseline = np.array(warm.predict(train, prediction_type='RawFormulaVal'))
    evaluation.set_baseline(np.array(warm.predict(evaluation, prediction_type='RawFormulaVal')))
    final = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    indices = train.get_cat_feature_indices()
    final.fit(map_cat_features(train.get_features(), indices),
              train.get_label(),
              indices,
              sample_weight=np.arange(1, train.num_row() + 1),
              baseline=train_baseline,
              use_best_model=True,
              eval_set=evaluation)
    final.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 19
0
def test_export_model_with_only_float_features_to_python_from_python():
    """Python-exported float-only model matches native RawFormulaVal output."""
    train_pool = Pool(HIGGS_TRAIN_FILE, column_description=HIGGS_CD_FILE)
    test_pool = Pool(HIGGS_TEST_FILE, column_description=HIGGS_CD_FILE)
    model = CatBoost({'iterations': 30, 'random_seed': 0})
    model.fit(train_pool)
    pred_model = model.predict(test_pool, prediction_type='RawFormulaVal')
    model.save_model(OUTPUT_PYTHON_MODEL_PATH, format="python")
    import os.path
    import sys
    # Make the generated 'model' module importable.
    sys.path.insert(0, os.path.dirname(OUTPUT_PYTHON_MODEL_PATH))
    from model import apply_catboost_model as apply_catboost_model_from_python
    pred_python = [apply_catboost_model_from_python(float_features)
                   for float_features in test_pool.get_features()]
    assert _check_data(pred_model, pred_python)
Exemplo n.º 20
0
def test_export_model_with_cat_features_to_python_from_app():
    """App-exported python model matches the binary model's predictions."""
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoost()
    # Materialize the bundled binary model resource on disk, then load it.
    with open(OUTPUT_MODEL_PATH, "w") as model_file:
        model_file.write(resource.find("cb_adult_model_bin"))
    model.load_model(OUTPUT_MODEL_PATH)
    pred_model = model.predict(test_pool, prediction_type='RawFormulaVal')
    from adult_model import apply_catboost_model as apply_catboost_model_from_app
    cat_indices = test_pool.get_cat_feature_indices()
    hash_to_string = test_pool.get_cat_feature_hash_to_string()
    pred_python = []
    for row in test_pool.get_features():
        floats, cats = _split_features(row, cat_indices, hash_to_string)
        pred_python.append(apply_catboost_model_from_app(floats, cats))
    assert _check_data(pred_model, pred_python)
Exemplo n.º 21
0
def score(space):
    """Hyperopt objective: train CatBoost and return 1 - weighted F1 as loss.

    Decodes *space* into CatBoost parameters, trains on the module-level
    (X_train, y_train) split with early stopping against (X_test, y_test),
    then scores the validation predictions with weighted F1.

    Parameters
    ----------
    space : dict
        A sample from the hyperopt search space, decoded by
        get_catboost_params.

    Returns
    -------
    dict
        {'loss': 1 - F1, 'status': STATUS_OK}; hyperopt minimizes 'loss'.
    """
    # BUG FIX: the original printed `params` before assigning it, which
    # raised UnboundLocalError on every call. Decode first, then log.
    params = get_catboost_params(space)
    print(params)
    dtrain = Pool(X_train, label=y_train)
    dvalid = Pool(X_test, label=y_test)
    model = CatBoostClassifier(iterations=100000,
                               learning_rate=params['learning_rate'],
                               depth=int(params['depth']),
                               loss_function='CrossEntropy',
                               use_best_model=True,
                               task_type="CPU",
                               eval_metric='AUC',
                               classes_count=4,
                               l2_leaf_reg=params['l2_leaf_reg'],
                               early_stopping_rounds=3000,
                               od_type="Iter",
                               border_count=int(params['border_count']),
                               verbose=False)
    model.fit(dtrain, eval_set=dvalid, verbose=False)
    predictions = model.predict(dvalid.get_features())
    # NOTE(review): dvalid is labelled with y_test, but F1 below reads
    # y_test["pet_category"] -- presumably y_test is a DataFrame and
    # "pet_category" is the trained target; confirm against the caller.
    score = f1_score(y_test["pet_category"], predictions, average="weighted")
    # TODO: Add the importance for the selected features
    print("\tScore {0}\n\n".format(score))
    # The score function should return the loss (1-score)
    # since the optimize function looks for the minimum
    loss = 1 - score
    return {'loss': loss, 'status': STATUS_OK}
Exemplo n.º 22
0
def test_load_ndarray():
    """Pool must accept numpy arrays of features and labels with cat indices."""
    source = Pool(TRAIN_FILE, column_description=CD_FILE)
    cat_indices = source.get_cat_feature_indices()
    features = np.array(map_cat_features(source.get_features(), cat_indices))
    labels = np.array(source.get_label())
    assert _check_shape(Pool(features, labels, cat_indices))
Exemplo n.º 23
0
# Report the shapes of the train/test splits and the class balance of the
# labels (messages are user-facing and intentionally left in Chinese).
print('测试集 X_CAT_test  的shape是',X_CAT_test.shape)
print('训练集 Y_CAT_train 的shape是',Y_CAT_train.shape)
print('测试集 Y_CAT_test  的shape是',Y_CAT_test.shape)
print('训练集中label的正负比例分布如下:\n',Y_CAT_train.value_counts())
print('测试集中label的正负比例分布如下:\n',Y_CAT_test.value_counts())
print('可以看出在划分训练集和测试集时设定strtify参数为Y_CAT;使得在训练测试集中正负例所占比例一致。')

## CatBoost modelling
### Step 1: build a Pool wrapping features, labels and categorical indices.
from catboost import Pool
pool_data = Pool(data = X_CAT,
           label = Y_CAT,
           cat_features = CAT_features)
# Inspect the resulting Pool: type, shape, feature/label counts and the
# positions of the categorical features.
print('pool_data的 type 是:', type(pool_data))
print('pool_data的 shpe 是:', pool_data.shape)
print('pool_data.get_features()返回的是list类型,其长度是:',len(pool_data.get_features()))
print('pool_data.get_label()返回的是list类型,其长度是:', len(pool_data.get_label()))
print('pool_data中类别变量所在的索引位置是 pool_data.get_cat_feature_indices() :', pool_data.get_cat_feature_indices())
#print('生成的pool_data的各观测的weight:', pool_data.get_weight())
#print('生成的pool_data的各观测的baseline:', pool_data.get_baseline())


#### Step 2.1: define a custom metric class, used for best-model selection
#### and overfitting detection.

# **************Custom metric for overfitting detector and best model selection******
import math
from catboost import Pool, CatBoostClassifier

class Recall_1_Metric(object):
    def get_final_error(self, error, weight):
Exemplo n.º 24
0
def test_fit_no_label():
    """Calling fit with features only (no label) must raise CatboostError."""
    with pytest.raises(CatboostError):
        features_only = Pool(TRAIN_FILE, column_description=CD_FILE).get_features()
        CatBoostClassifier().fit(features_only)
Exemplo n.º 25
0
def catboost_bootstrap(dir_,
                       learn_name,
                       test_name,
                       cd_file,
                       classes,
                       learning_rate=None,
                       border_count=32,
                       cnt_values=20,
                       file_result_to=sys.stdout,
                       file_info_to=sys.stdout,
                       iterations=1500):
    """Bootstrap-compare CatBoost quality across pool-preprocessing wrappers.

    For each wrapper class in *classes*: preprocess the learn pool, train one
    CatBoostClassifier, then evaluate it on *cnt_values* bootstrap resamples
    of the test set, recording per-resample AUC/Logloss at the tree count
    that minimizes Logloss. Saves learning-curve plots and a JSON dump of
    the AUC samples into *dir_*, and finishes with pairwise Wilcoxon tests
    between the wrappers' AUC samples.

    Args:
        dir_: directory containing the data files; also receives outputs.
        learn_name, test_name: learn/test pool file names inside dir_.
        cd_file: column-description file name inside dir_.
        classes: wrapper classes; each is instantiated with no arguments and
            must provide handle_learn_pool / handle_test_matrix plus
            WRAPPER_NAME, prior and score attributes.
        learning_rate: mapping from wrapper class to its learning rate.
            NOTE(review): despite the None default it is indexed
            unconditionally below -- presumably always passed; confirm.
        border_count: CatBoost border_count setting.
        cnt_values: number of bootstrap resamples of the test set.
        file_result_to, file_info_to: streams for results and diagnostics.
        iterations: CatBoost iteration budget.
    """
    logloss = {}
    auc = {}
    for clazz in classes:
        print('class={}'.format(clazz.WRAPPER_NAME))
        print('class={}; step={}'.format(clazz.WRAPPER_NAME,
                                         learning_rate[clazz]),
              file=file_result_to)
        file_result_to.flush()
        auc[clazz.WRAPPER_NAME] = []
        logloss[clazz.WRAPPER_NAME] = []
        tree_counts = []
        logloss_curves = []
        auc_curves = []

        cl = clazz()
        source_learn_pool = Pool(data=os.path.join(dir_, learn_name),
                                 column_description=os.path.join(
                                     dir_, cd_file))
        # Time only the wrapper's preprocessing of the learn pool.
        beg = time.time()
        learn_pool = cl.handle_learn_pool(source_learn_pool)
        end = time.time()
        print('!!!time: {}'.format(end - beg), file=file_info_to)
        print('priors: {}'.format(cl.prior), file=file_info_to)
        print('prior scores: {}'.format(cl.score), file=file_info_to)
        file_info_to.flush()

        source_test_pool = Pool(data=os.path.join(dir_, test_name),
                                column_description=os.path.join(dir_, cd_file))
        source_test_label = np.array(source_test_pool.get_label())
        source_test_features = np.array(source_test_pool.get_features())

        cat = CatBoostClassifier(max_ctr_complexity=1,
                                 custom_metric='AUC',
                                 boosting_type='Plain',
                                 random_seed=0,
                                 border_count=border_count,
                                 iterations=iterations,
                                 learning_rate=learning_rate[clazz],
                                 thread_count=multiprocessing.cpu_count())
        beg = time.time()
        cat.fit(learn_pool, use_best_model=True)
        end = time.time()

        # Evaluate the single trained model on cnt_values bootstrap
        # resamples of the test set (deterministic per-resample seeding).
        for seed in range(cnt_values):
            idx = list(range(source_test_features.shape[0]))
            np.random.seed(seed * 10 + 300)
            boot_idx = np.random.choice(idx, len(idx), replace=True)
            boot_test_features = source_test_features[boot_idx]
            boot_test_label = source_test_label[boot_idx]
            X, y = cl.handle_test_matrix(boot_test_features, boot_test_label,
                                         False)
            metrics = cat.eval_metrics(
                Pool(X, y), ['Logloss', 'AUC'],
                eval_period=1,
                thread_count=multiprocessing.cpu_count())
            for num, loss in enumerate(metrics['Logloss']):
                print('iter={:10}:     loss={:.10}'.format(num + 1, loss))
            # Pick the tree count with minimal Logloss on this resample.
            cnt_trees = np.argmin(metrics['Logloss'])
            print('choose cnt_trees={}'.format(cnt_trees))
            print('overfit={}; AUC={}; logloss={}'.format(
                cnt_trees, metrics['AUC'][cnt_trees],
                metrics['Logloss'][cnt_trees]),
                  file=file_result_to)
            tree_counts.append(cnt_trees)
            file_result_to.flush()
            logloss_curves.append(metrics['Logloss'])
            auc_curves.append(metrics['AUC'])
            auc[clazz.WRAPPER_NAME].append(metrics['AUC'][cnt_trees])
            logloss[clazz.WRAPPER_NAME].append(metrics['Logloss'][cnt_trees])

        print('class={}, learn_time={}, mean_tree_count={}'.format(
            clazz.WRAPPER_NAME, end - beg,
            sum(tree_counts) / len(tree_counts)),
              file=file_result_to)
        print('mean_AUC={}, mean_logloss={}'.format(
            sum(auc[clazz.WRAPPER_NAME]) / len(auc[clazz.WRAPPER_NAME]),
            sum(logloss[clazz.WRAPPER_NAME]) /
            len(logloss[clazz.WRAPPER_NAME])),
              file=file_result_to)
        file_result_to.flush()

        # Save per-wrapper learning-curve figures into dir_.
        logloss_fig = create_learning_curves_plot(
            logloss_curves, 'logloss {}'.format(clazz.WRAPPER_NAME))
        auc_fig = create_learning_curves_plot(
            auc_curves, 'AUC {}'.format(clazz.WRAPPER_NAME))
        logloss_file = os.path.join(
            dir_, 'fig_{}_{}'.format('Logloss', clazz.WRAPPER_NAME))
        AUC_file = os.path.join(dir_,
                                'fig_{}_{}'.format('AUC', clazz.WRAPPER_NAME))
        plot(logloss_fig, filename=logloss_file, auto_open=False)
        plot(auc_fig, filename=AUC_file, auto_open=False)

    # Persist the raw bootstrap AUC samples for later inspection.
    file_name = os.path.join(dir_, 'boot.txt')
    with open(file_name, 'w') as file_to:
        json.dump(auc, file_to)

    # Pairwise Wilcoxon signed-rank tests over the bootstrap AUC samples.
    for cl1 in classes:
        for cl2 in classes:
            stat, p_value = wilcoxon(auc[cl1.WRAPPER_NAME],
                                     auc[cl2.WRAPPER_NAME],
                                     zero_method="pratt")
            print('for {} & {}: stat: {}, p_value: {}'.format(
                cl1.WRAPPER_NAME, cl2.WRAPPER_NAME, stat, p_value),
                  file=file_result_to)
Exemplo n.º 26
0
def catboost_test(dir_, cur_learn_name, cur_test_name, clazz, learning_rate=None, border_count=128, cnt_models=1,
                  file_result_to=sys.stdout, file_info_to=sys.stdout, iterations=1500):
    """Train cnt_models CatBoost models (different seeds) on a preprocessed
    dataset and return mean (accuracy, AUC, Logloss, fit time).

    On first run (when the preprocessed learn file does not exist yet) the
    wrapper *clazz* is applied to the source pools and the numeric results
    are cached to dir_/cur_learn_name and dir_/cur_test_name as TSV
    (label in column 0, features after it); later runs load the cache.

    NOTE(review): the cache-building branch reads learn_name, test_name and
    cd_file, which are not parameters of this function -- presumably
    module-level globals; confirm before calling with a cold cache.

    Returns:
        (mean accuracy, mean AUC, mean Logloss, mean fit time), or four
        zeros when cnt_models == 0.
    """
    full_learn_name = os.path.join(dir_, cur_learn_name)
    full_test_name = os.path.join(dir_, cur_test_name)

    if not os.path.exists(full_learn_name):
        source_learn_pool = Pool(data=os.path.join(dir_, learn_name), column_description=os.path.join(dir_, cd_file))
        source_test_pool = Pool(data=os.path.join(dir_, test_name), column_description=os.path.join(dir_, cd_file))
        cl = clazz()
        beg = time.time()
        learn_pool = cl.handle_learn_pool(source_learn_pool)
        test_pool = cl.handle_test_pool(source_test_pool)
        end = time.time()
        print('!!!time: {}'.format(end - beg), file=file_info_to)
        print('priors: {}'.format(cl.prior), file=file_info_to)
        print('prior scores: {}'.format(cl.score), file=file_info_to)
        file_info_to.flush()
        # Dump the preprocessed pools as TSV: label first, then features.
        learn_label = learn_pool.get_label()
        learn_features = learn_pool.get_features()
        learn_data = np.zeros((len(learn_label), len(learn_features[0]) + 1))
        learn_data[:, 0] = learn_label
        learn_data[:, 1:] = learn_features
        np.savetxt(full_learn_name, learn_data, delimiter='\t', fmt='%.10f')
        test_label = test_pool.get_label()
        test_features = test_pool.get_features()
        test_data = np.zeros((len(test_label), len(test_features[0]) + 1))
        test_data[:, 0] = test_label
        test_data[:, 1:] = test_features
        np.savetxt(full_test_name, test_data, delimiter='\t', fmt='%.10f')

    learn_pool = Pool(data=full_learn_name)
    test_pool = Pool(data=full_test_name)

    scores = []
    auc = []
    logloss = []
    times =[]
    tree_counts = []
    for seed in range(cnt_models):
        print(seed)
        # print(len(learn_pool.get_features()), len(learn_pool.get_features()[0]))
        # print(len(test_pool.get_features()), len(test_pool.get_features()[0]))
        beg = time.time()
        cat = CatBoostClassifier(max_ctr_complexity=1, custom_metric='AUC', boosting_type='Plain', random_seed=seed, border_count=border_count, iterations=iterations, learning_rate=learning_rate, thread_count=multiprocessing.cpu_count())
        cat.fit(learn_pool, eval_set=(test_pool), use_best_model=True)
        end = time.time()
        X_test = test_pool.get_features()
        y_test = test_pool.get_label()

        tree_counts.append(cat.tree_count_)
        scores.append(cat.score(X_test, y_test))
        # eval_period = tree_count_ - 1 yields metric values at the first
        # and last trees; index [1] below is the final (best) model.
        metrics = cat.eval_metrics(test_pool, ['AUC', 'Logloss'], eval_period=cat.tree_count_ - 1)
        print('overfit={}; acc={}; AUC={}; logloss={}; learn_time={}'.format(cat.tree_count_, scores[-1], metrics['AUC'][1], metrics['Logloss'][1], end - beg), file=file_result_to)
        file_result_to.flush()
        auc.append(metrics['AUC'][1])
        logloss.append(metrics['Logloss'][1])
        times.append(end - beg)
    if len(tree_counts) != 0:
        print('mean tree_count: {}'.format(sum(tree_counts)/len(tree_counts)), file=file_result_to)
        return sum(scores)/len(scores), sum(auc)/len(auc), sum(logloss)/len(logloss), sum(times)/len(times)
    else:
        return 0, 0, 0, 0
Exemplo n.º 27
0
def test_fit_no_label():
    """fit() given only a feature matrix must raise CatboostError."""
    with pytest.raises(CatboostError):
        source = Pool(TRAIN_FILE, column_description=CD_FILE)
        untrained = CatBoostClassifier()
        untrained.fit(source.get_features())
Exemplo n.º 28
0
class CatBoostModel:
    """Wrapper around a CatBoostClassifier plus a preprocessing pipeline.

    Loads a pretrained model from "models/Saving/CBmodel.cbm" on
    construction and keeps train/test Pools built from a preprocessed
    dataset for evaluation and retraining.
    """
    def __init__(self):
        # Raises if the default model file is missing (load happens eagerly).
        self._preprocess = pp.Preprocess()
        self._model = CatBoostClassifier()
        self._model.load_model("models/Saving/CBmodel.cbm")
        self.x = self.y = 0
        self._train_data = None
        self._test_data = None

    @property
    def model(self):
        """The underlying CatBoostClassifier instance."""
        return self._model

    def set_new_model(self, cbm_model=""):
        """Replace the current model with one loaded from *cbm_model* path."""
        if cbm_model == '':
            raise IOError("No path to model")
        self._model.load_model(cbm_model)

    def set_pool(self, path_to_dataset='', test_size=0.3):
        """Build train/test Pools from self.x/self.y, optionally reloading
        them first from *path_to_dataset* via the preprocessing pipeline."""
        if path_to_dataset != '':
            self._preprocess.set_dataset(path_to_dataset)
            self.x, self.y = self._preprocess.process_data_for_gradient_with_label(
            )

        # Fixed random_state keeps the split reproducible across calls.
        x_train, x_test, y_train, y_test = train_test_split(
            self.x, self.y, test_size=test_size, random_state=42)
        self._train_data = Pool(x_train, y_train)
        self._test_data = Pool(x_test, y_test)

    def get_predict_with_label(self, path_to_data=''):
        """Preprocess a labelled dataset and return model predictions on it.

        Raises IOError when no path is given. Side effect: overwrites
        self.x / self.y with the freshly preprocessed data.
        """
        if path_to_data == '':
            raise IOError("No path to data")
        self._preprocess.set_dataset(path_to_data)
        self.x, self.y = self._preprocess.process_data_for_gradient_with_label(
        )
        return self._model.predict(self.x)

    def relearn_model(self, path_to_dataset='', test_size=0.3):
        """Rebuild the pools from *path_to_dataset* and train a fresh model."""
        if path_to_dataset == '':
            raise IOError("No path to dataset")
        self.set_pool(path_to_dataset=path_to_dataset, test_size=test_size)
        self._model = CatBoostClassifier(iterations=200,
                                         depth=2,
                                         learning_rate=0.4,
                                         loss_function='Logloss',
                                         verbose=False)
        self._model.fit(self._train_data, plot=True)

    def get_test_accuracy(self):
        """Accuracy of the current model on the held-out test Pool."""
        return accuracy_score(
            self._test_data.get_label(),
            self._model.predict(self._test_data.get_features()))

    def get_test_auc(self):
        """ROC AUC of the positive-class probability on the test Pool."""
        return roc_auc_score(
            self._test_data.get_label(),
            self._model.predict_proba(self._test_data.get_features())[:, 1])

    def get_predict_unknown(self, path_to_data=''):
        """Preprocess an unlabelled dataset and return model predictions."""
        if path_to_data == '':
            raise IOError("No path to data")
        self._preprocess.set_dataset(path_to_data)
        self.x = self._preprocess.get_data_for_predict_gradient()
        return self._model.predict(self.x)
Exemplo n.º 29
0
def test_load_ndarray():
    """Building a Pool from ndarrays keeps the expected shape."""
    reference = Pool(TRAIN_FILE, column_description=CD_FILE)
    indices = reference.get_cat_feature_indices()
    mapped = map_cat_features(reference.get_features(), indices)
    rebuilt = Pool(np.array(mapped), np.array(reference.get_label()), indices)
    assert _check_shape(rebuilt)