Exemplo n.º 1
0
def test_fit_data():
    """Fit a MultiClass model from raw data, seeded with another model's baseline.

    A first classifier is trained on the training pool; its
    ``RawFormulaVal`` predictions are used as the baseline for the training
    data and for the evaluation pool of a second classifier.  The second
    classifier is fitted from feature rows, labels and categorical indices
    (not from a ``Pool``), saved to ``OUTPUT_MODEL_PATH``, and that path is
    passed to ``compare_canonical_models``.
    """
    classifier_params = dict(iterations=2,
                             random_seed=0,
                             loss_function="MultiClass")
    train_pool = Pool(CLOUDNESS_TRAIN_FILE,
                      column_description=CLOUDNESS_CD_FILE)
    holdout_pool = Pool(CLOUDNESS_TEST_FILE,
                        column_description=CLOUDNESS_CD_FILE)

    # Stage 1: a plain model whose raw predictions become the baselines.
    first_stage = CatBoostClassifier(**classifier_params)
    first_stage.fit(train_pool)
    train_baseline = np.array(
        first_stage.predict(train_pool, prediction_type='RawFormulaVal'))
    holdout_baseline = np.array(
        first_stage.predict(holdout_pool, prediction_type='RawFormulaVal'))
    holdout_pool.set_baseline(holdout_baseline)

    # Stage 2: same hyper-parameters, but fed raw rows plus the baseline.
    second_stage = CatBoostClassifier(**classifier_params)
    feature_rows = map_cat_features(train_pool.get_features(),
                                    train_pool.get_cat_feature_indices())
    labels = train_pool.get_label()
    cat_indices = train_pool.get_cat_feature_indices()
    # Row i (1-based) gets weight i.
    row_weights = np.arange(1, train_pool.num_row() + 1)
    second_stage.fit(feature_rows,
                     labels,
                     cat_indices,
                     sample_weight=row_weights,
                     baseline=train_baseline,
                     use_best_model=True,
                     eval_set=holdout_pool)

    second_stage.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 2
0
def test_no_cat_in_predict():
    """Check that predict gives matching results with and without cat_features.

    The trained model is applied to the mapped test features twice: once as
    plain data, and once wrapped in a ``Pool`` that declares the categorical
    feature indices.  The two results must satisfy ``_check_data``.
    """
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(train_pool)

    # Plain data: no categorical indices are handed to predict.
    plain_rows = map_cat_features(test_pool.get_features(),
                                  train_pool.get_cat_feature_indices())
    plain_prediction = classifier.predict(plain_rows)

    # Same data, wrapped in a Pool with cat_features declared explicitly.
    pooled_rows = map_cat_features(test_pool.get_features(),
                                   train_pool.get_cat_feature_indices())
    explicit_pool = Pool(pooled_rows,
                         cat_features=train_pool.get_cat_feature_indices())
    pooled_prediction = classifier.predict(explicit_pool)

    assert _check_data(plain_prediction, pooled_prediction)
Exemplo n.º 3
0
def test_no_cat_in_predict():
    """Check that predict gives matching results with and without cat_features.

    The trained model is applied to the mapped test features twice: once as
    plain data, and once wrapped in a ``Pool`` that declares the categorical
    feature indices.  The two results must satisfy ``_check_data``.
    """
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(train_pool)

    # Plain data: no categorical indices are handed to predict.
    plain_rows = map_cat_features(test_pool.get_features(),
                                  train_pool.get_cat_feature_indices())
    plain_prediction = classifier.predict(plain_rows)

    # Same data, wrapped in a Pool with cat_features declared explicitly.
    pooled_rows = map_cat_features(test_pool.get_features(),
                                   train_pool.get_cat_feature_indices())
    explicit_pool = Pool(pooled_rows,
                         cat_features=train_pool.get_cat_feature_indices())
    pooled_prediction = classifier.predict(explicit_pool)

    assert _check_data(plain_prediction, pooled_prediction)
Exemplo n.º 4
0
def test_fit_data():
    """Fit a MultiClass model from raw data, seeded with another model's baseline.

    A first classifier is trained on the training pool; its
    ``RawFormulaVal`` predictions are used as the baseline for the training
    data and for the evaluation pool of a second classifier.  The second
    classifier is fitted from feature rows, labels and categorical indices
    (not from a ``Pool``), saved to ``OUTPUT_MODEL_PATH``, and that path is
    passed to ``compare_canonical_models``.
    """
    classifier_params = dict(iterations=2,
                             random_seed=0,
                             loss_function="MultiClass")
    train_pool = Pool(CLOUDNESS_TRAIN_FILE,
                      column_description=CLOUDNESS_CD_FILE)
    holdout_pool = Pool(CLOUDNESS_TEST_FILE,
                        column_description=CLOUDNESS_CD_FILE)

    # Stage 1: a plain model whose raw predictions become the baselines.
    first_stage = CatBoostClassifier(**classifier_params)
    first_stage.fit(train_pool)
    train_baseline = np.array(
        first_stage.predict(train_pool, prediction_type='RawFormulaVal'))
    holdout_baseline = np.array(
        first_stage.predict(holdout_pool, prediction_type='RawFormulaVal'))
    holdout_pool.set_baseline(holdout_baseline)

    # Stage 2: same hyper-parameters, but fed raw rows plus the baseline.
    second_stage = CatBoostClassifier(**classifier_params)
    feature_rows = map_cat_features(train_pool.get_features(),
                                    train_pool.get_cat_feature_indices())
    labels = train_pool.get_label()
    cat_indices = train_pool.get_cat_feature_indices()
    # Row i (1-based) gets weight i.
    row_weights = np.arange(1, train_pool.num_row() + 1)
    second_stage.fit(feature_rows,
                     labels,
                     cat_indices,
                     sample_weight=row_weights,
                     baseline=train_baseline,
                     use_best_model=True,
                     eval_set=holdout_pool)

    second_stage.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 5
0
def hp_tuning(train, test, validate, column_description, algo_name, result_path):
    """Run one hyper-parameter search strategy and write its results as JSON.

    Args:
        train: Training data source, passed to ``Pool``.
        test: Test data source, passed to ``Pool``.
        validate: Validation data source, passed to ``Pool``.
        column_description: Column description forwarded to every ``Pool``.
        algo_name: Strategy selector: 'classic', 'hyperopt', 'gaussian',
            'optuna' or 'hyperband'.
        result_path: File that receives the results via ``json.dump``.

    Returns:
        None.  For an unrecognised ``algo_name`` a message is printed and
        nothing is written.
    """
    train_pool = Pool(train, column_description=column_description)
    val_pool = Pool(validate, column_description=column_description)
    test_pool = Pool(test, column_description=column_description)
    cat_features = train_pool.get_cat_feature_indices()

    # Fixed budget shared by every strategy.
    search_settings = dict(n_estimators=1000, max_evals=50, task_type='CPU')

    strategies = {
        'classic': choose_classic,
        'hyperopt': choose_hyperOpt,
        'gaussian': choose_gaussian,
        'optuna': choose_optuna,
        'hyperband': choose_hyperband2,
    }
    search = strategies.get(algo_name)
    if search is None:
        print("INCORRECT ALGO NAME!!!")
        return

    started_at = time.time()
    results = search(train_pool, val_pool, test_pool, cat_features,
                     **search_settings)
    # Elapsed wall-clock seconds of the search are stored with the results.
    results["time"] = time.time() - started_at

    with open(result_path, 'w') as outfile:
        json.dump(results, outfile)
Exemplo n.º 6
0
def test_export_to_python_after_load():
    """Check a Python export made from a reloaded model against native predictions.

    A classifier is trained and saved, loaded into a fresh classifier, and
    re-saved with ``format="python"``.  The exported
    ``apply_catboost_model`` is applied to every test row; its outputs must
    satisfy ``_check_data`` against the raw predictions of both the
    original and the reloaded model.
    """
    import os.path
    import sys

    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)

    trained = CatBoostClassifier(iterations=40, random_seed=0)
    trained.fit(train_pool)
    native_prediction = trained.predict(test_pool,
                                        prediction_type='RawFormulaVal')
    trained.save_model(OUTPUT_MODEL_PATH)

    reloaded = CatBoostClassifier()
    reloaded.load_model(OUTPUT_MODEL_PATH)
    reloaded.save_model(OUTPUT_PYTHON_MODEL_PATH, format="python", pool=train_pool)
    reloaded_prediction = reloaded.predict(test_pool,
                                           prediction_type='RawFormulaVal')

    # The export is imported as module ``model``, so its directory has to be
    # importable.  NOTE(review): the sys.path entry is never removed again.
    sys.path.insert(0, os.path.dirname(OUTPUT_PYTHON_MODEL_PATH))
    from model import apply_catboost_model as apply_catboost_model_from_python

    def predict_row(row):
        # Split one feature row into the two arguments the export expects.
        floats, cats = _split_features(
            row, train_pool.get_cat_feature_indices(),
            test_pool.get_cat_feature_hash_to_string())
        return apply_catboost_model_from_python(floats, cats)

    python_prediction = [predict_row(row) for row in test_pool.get_features()]

    assert _check_data(native_prediction, python_prediction)
    assert _check_data(reloaded_prediction, python_prediction)
Exemplo n.º 7
0
def test_load_df_vs_load_from_file():
    """Check that a pool built from an in-memory table equals the file-loaded one.

    The training file is read as strings, the target column is split off as
    the label, and the remaining columns are passed to ``Pool`` as an array.
    """
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    table = read_table(TRAIN_FILE, header=None, dtype=str)
    target = DataFrame(table.iloc[:, TARGET_IDX])
    # Everything except the target column is feature data.
    features = table.drop([TARGET_IDX], axis=1)
    cat_indices = file_pool.get_cat_feature_indices()

    array_pool = Pool(np.array(features), target, cat_indices)
    assert file_pool == array_pool
Exemplo n.º 8
0
def test_load_df_vs_load_from_file():
    """Check that a pool built from an in-memory table equals the file-loaded one.

    The training file is read as strings, the target column is split off as
    the label, and the remaining columns are passed to ``Pool`` as an array.
    """
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    table = read_table(TRAIN_FILE, header=None, dtype=str)
    target = DataFrame(table.iloc[:, TARGET_IDX])
    # Everything except the target column is feature data.
    features = table.drop([TARGET_IDX], axis=1)
    cat_indices = file_pool.get_cat_feature_indices()

    array_pool = Pool(np.array(features), target, cat_indices)
    assert file_pool == array_pool
Exemplo n.º 9
0
def test_load_df():
    """Check that a pool built from a DataFrame matches the file-loaded pool.

    The NaN training file is read into a table, the target column is split
    off as the label, and the remaining frame is passed straight to
    ``Pool``.  Features and labels of both pools must satisfy
    ``_check_data``.
    """
    file_pool = Pool(NAN_TRAIN_FILE, column_description=NAN_CD_FILE)

    table = read_table(NAN_TRAIN_FILE, header=None)
    target = DataFrame(table.iloc[:, TARGET_IDX])
    # Everything except the target column is feature data.
    features = table.drop([TARGET_IDX], axis=1)
    cat_indices = file_pool.get_cat_feature_indices()
    frame_pool = Pool(features, target, cat_indices)

    assert _check_data(file_pool.get_features(), frame_pool.get_features())
    assert _check_data(file_pool.get_label(), frame_pool.get_label())
Exemplo n.º 10
0
def test_load_df():
    """Check that a pool built from a DataFrame matches the file-loaded pool.

    The NaN training file is read into a table, the target column is split
    off as the label, and the remaining frame is passed straight to
    ``Pool``.  Features and labels of both pools must satisfy
    ``_check_data``.
    """
    file_pool = Pool(NAN_TRAIN_FILE, column_description=NAN_CD_FILE)

    table = read_table(NAN_TRAIN_FILE, header=None)
    target = DataFrame(table.iloc[:, TARGET_IDX])
    # Everything except the target column is feature data.
    features = table.drop([TARGET_IDX], axis=1)
    cat_indices = file_pool.get_cat_feature_indices()
    frame_pool = Pool(features, target, cat_indices)

    assert _check_data(file_pool.get_features(), frame_pool.get_features())
    assert _check_data(file_pool.get_label(), frame_pool.get_label())
Exemplo n.º 11
0
def test_load_series():
    """Check that a pool built from pandas Series matches the file-loaded pool.

    Labels are passed as a ``Series`` of the target column; features are
    passed as a ``Series`` whose elements are the individual feature rows.
    Features and labels of both pools must satisfy ``_check_data``.
    """
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    table = read_table(TRAIN_FILE, header=None)
    target = Series(table.iloc[:, TARGET_IDX])
    feature_frame = table.drop([TARGET_IDX], axis=1)
    # One Series element per row of the remaining feature columns.
    feature_rows = Series(list(feature_frame.values))
    cat_indices = file_pool.get_cat_feature_indices()
    series_pool = Pool(feature_rows, target, cat_indices)

    assert _check_data(file_pool.get_features(), series_pool.get_features())
    assert _check_data(file_pool.get_label(), series_pool.get_label())
Exemplo n.º 12
0
def test_load_series():
    """Check that a pool built from pandas Series matches the file-loaded pool.

    Labels are passed as a ``Series`` of the target column; features are
    passed as a ``Series`` whose elements are the individual feature rows.
    Features and labels of both pools must satisfy ``_check_data``.
    """
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    table = read_table(TRAIN_FILE, header=None)
    target = Series(table.iloc[:, TARGET_IDX])
    feature_frame = table.drop([TARGET_IDX], axis=1)
    # One Series element per row of the remaining feature columns.
    feature_rows = Series(list(feature_frame.values))
    cat_indices = file_pool.get_cat_feature_indices()
    series_pool = Pool(feature_rows, target, cat_indices)

    assert _check_data(file_pool.get_features(), series_pool.get_features())
    assert _check_data(file_pool.get_label(), series_pool.get_label())
Exemplo n.º 13
0
def test_one_doc_feature_importance():
    """Save the 'Doc' feature importance of one synthetic document.

    The document is a vector of ones with ``pool.num_col()`` entries.  The
    resulting importances are stored at ``FIMP_PATH`` and the path is
    returned through ``local_canonical_file``.
    """
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    classifier = CatBoostClassifier(iterations=5, random_seed=0)
    classifier.fit(pool)

    single_doc = np.ones(pool.num_col(), dtype=int)
    # Second positional argument is presumably the document's label -
    # TODO confirm against get_feature_importance's signature.
    doc_importance = classifier.get_feature_importance(
        single_doc,
        0,
        cat_features=pool.get_cat_feature_indices(),
        fstr_type='Doc')

    np.save(FIMP_PATH, np.array(doc_importance))
    return local_canonical_file(FIMP_PATH)
Exemplo n.º 14
0
def test_export_model_with_cat_features_to_python_from_app():
    """Check the app-exported Python model against the bundled CatBoost model.

    The model stored under the ``cb_adult_model_bin`` resource is written to
    ``OUTPUT_MODEL_PATH`` and loaded into a ``CatBoost`` instance.  Its raw
    predictions on the test pool must satisfy ``_check_data`` against the
    per-row outputs of ``adult_model.apply_catboost_model``.
    """
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoost()
    # The resource is a serialized model that load_model reads back, so it
    # must be written in binary mode: text mode rejects bytes on Python 3
    # and rewrites newline bytes on Windows, corrupting the file.
    with open(OUTPUT_MODEL_PATH, "wb") as model_file:
        model_file.write(resource.find("cb_adult_model_bin"))
    model.load_model(OUTPUT_MODEL_PATH)
    pred_model = model.predict(test_pool, prediction_type='RawFormulaVal')
    from adult_model import apply_catboost_model as apply_catboost_model_from_app
    pred_python = []
    for test_line in test_pool.get_features():
        # Split each row into the float and categorical parts the exported
        # function takes as separate arguments.
        float_features, cat_features = _split_features(
            test_line, test_pool.get_cat_feature_indices(),
            test_pool.get_cat_feature_hash_to_string())
        pred_python.append(
            apply_catboost_model_from_app(float_features, cat_features))
    assert _check_data(pred_model, pred_python)
Exemplo n.º 15
0
# Report the shape of the test labels and the label value counts of the
# training and test sets.
print('测试集 Y_CAT_test  的shape是',Y_CAT_test.shape)
print('训练集中label的正负比例分布如下:\n',Y_CAT_train.value_counts())
print('测试集中label的正负比例分布如下:\n',Y_CAT_test.value_counts())
print('可以看出在划分训练集和测试集时设定strtify参数为Y_CAT;使得在训练测试集中正负例所占比例一致。')

## CatBoost modelling
### Step1: Pool Initialize
from catboost import Pool
# Wrap features, labels and the categorical feature list in a single Pool.
pool_data = Pool(data = X_CAT,
           label = Y_CAT,
           cat_features = CAT_features)
# Inspect the Pool: its type, shape, number of feature rows and labels, and
# the indices of the categorical features.
print('pool_data的 type 是:', type(pool_data))
print('pool_data的 shpe 是:', pool_data.shape)
print('pool_data.get_features()返回的是list类型,其长度是:',len(pool_data.get_features()))
print('pool_data.get_label()返回的是list类型,其长度是:', len(pool_data.get_label()))
print('pool_data中类别变量所在的索引位置是 pool_data.get_cat_feature_indices() :', pool_data.get_cat_feature_indices())
# Disabled diagnostics for the per-observation weights and baselines:
#print('weight of each observation in the generated pool_data:', pool_data.get_weight())
#print('baseline of each observation in the generated pool_data:', pool_data.get_baseline())


#### Step2.1 Define a custom metric class, used for best-model selection and overfitting detection

# **************Custom metric for overfitting detector and best model selection******
import math
from catboost import Pool, CatBoostClassifier


class Recall_1_Metric(object):
    """Custom metric object; only the final-error aggregation is defined here."""

    def get_final_error(self, error, weight):
        """Return ``error`` divided by ``weight``.

        A tiny constant is added to ``weight`` first, so a weight of 0
        does not cause a division by zero.
        """
        guarded_weight = weight + 1e-38
        return error / guarded_weight
Exemplo n.º 16
0
def test_pool_cat_features():
    """Check that the file-loaded pool reports the expected categorical indices."""
    loaded_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    reported_indices = loaded_pool.get_cat_feature_indices()
    matches = reported_indices == CAT_FEATURES
    assert np.all(matches)
Exemplo n.º 17
0
def test_load_ndarray():
    """Check that a pool built from numpy arrays passes ``_check_shape``.

    Features (after ``map_cat_features``) and labels of the file-loaded pool
    are converted to arrays and used to construct a second ``Pool``.
    """
    source_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    cat_indices = source_pool.get_cat_feature_indices()

    feature_matrix = np.array(
        map_cat_features(source_pool.get_features(), cat_indices))
    label_vector = np.array(source_pool.get_label())

    ndarray_pool = Pool(feature_matrix, label_vector, cat_indices)
    assert _check_shape(ndarray_pool)
Exemplo n.º 18
0
def test_one_doc_feature_importance():
    """Save the 'Doc' feature importance of one synthetic document.

    The document is a vector of ones with ``pool.num_col()`` entries.  The
    resulting importances are stored at ``FIMP_PATH`` and the path is
    returned through ``local_canonical_file``.
    """
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    classifier = CatBoostClassifier(iterations=5, random_seed=0)
    classifier.fit(pool)

    single_doc = np.ones(pool.num_col(), dtype=int)
    # Second positional argument is presumably the document's label -
    # TODO confirm against get_feature_importance's signature.
    doc_importance = classifier.get_feature_importance(
        single_doc,
        0,
        cat_features=pool.get_cat_feature_indices(),
        fstr_type='Doc')

    np.save(FIMP_PATH, np.array(doc_importance))
    return local_canonical_file(FIMP_PATH)
Exemplo n.º 19
0
def test_pool_cat_features():
    """Check that the file-loaded pool reports the expected categorical indices."""
    loaded_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    reported_indices = loaded_pool.get_cat_feature_indices()
    matches = reported_indices == CAT_FEATURES
    assert np.all(matches)
Exemplo n.º 20
0
def test_load_ndarray():
    """Check that a pool built from numpy arrays passes ``_check_shape``.

    Features (after ``map_cat_features``) and labels of the file-loaded pool
    are converted to arrays and used to construct a second ``Pool``.
    """
    source_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    cat_indices = source_pool.get_cat_feature_indices()

    feature_matrix = np.array(
        map_cat_features(source_pool.get_features(), cat_indices))
    label_vector = np.array(source_pool.get_label())

    ndarray_pool = Pool(feature_matrix, label_vector, cat_indices)
    assert _check_shape(ndarray_pool)