def test_pool_after_fit():
    """Check that fitting a model does not mutate its pool's feature data."""
    reference_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    fitted_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    # Both pools come from the same file, so they must agree before training.
    assert _check_data(reference_pool.get_features(), fitted_pool.get_features())
    classifier = CatBoostClassifier(iterations=5, random_seed=0)
    classifier.fit(fitted_pool)
    # Training must leave the pool's features untouched.
    assert _check_data(reference_pool.get_features(), fitted_pool.get_features())
def test_no_cat_in_predict():
    """Predicting on raw mapped features equals predicting through a Pool."""
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    eval_pool = Pool(TEST_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=2, random_seed=0)
    clf.fit(learn_pool)
    cat_indices = learn_pool.get_cat_feature_indices()
    direct_pred = clf.predict(
        map_cat_features(eval_pool.get_features(), cat_indices))
    pooled_pred = clf.predict(
        Pool(map_cat_features(eval_pool.get_features(), cat_indices),
             cat_features=cat_indices))
    assert _check_data(direct_pred, pooled_pred)
def test_load_df():
    """A Pool built from a pandas DataFrame matches one loaded from file."""
    file_pool = Pool(NAN_TRAIN_FILE, column_description=NAN_CD_FILE)
    frame = read_table(NAN_TRAIN_FILE, header=None)
    target = DataFrame(frame.iloc[:, TARGET_IDX])
    frame.drop([TARGET_IDX], axis=1, inplace=True)
    df_pool = Pool(frame, target, file_pool.get_cat_feature_indices())
    assert _check_data(file_pool.get_features(), df_pool.get_features())
    assert _check_data(file_pool.get_label(), df_pool.get_label())
def test_load_series():
    """A Pool built from pandas Series data/label matches the file-based Pool."""
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    table = read_table(TRAIN_FILE, header=None)
    target = Series(table.iloc[:, TARGET_IDX])
    table.drop([TARGET_IDX], axis=1, inplace=True)
    # One Series of per-row value arrays, mirroring the original layout.
    rows = Series(list(table.values))
    series_pool = Pool(rows, target, file_pool.get_cat_feature_indices())
    assert _check_data(file_pool.get_features(), series_pool.get_features())
    assert _check_data(file_pool.get_label(), series_pool.get_label())
def test_load_dumps():
    """A Pool built from arrays matches one reloaded from a TSV dump of them."""
    n_rows, n_cols = 100, 10
    features = np.random.randint(10, size=(n_rows, n_cols))
    target = np.random.randint(2, size=n_rows)
    array_pool = Pool(features, target)
    # Dump rows as "<label>\t<f0>\t<f1>..." lines, the default Pool text format.
    rows = []
    for lab, feats in zip(target, features):
        rows.append('\t'.join([str(lab)] + [str(v) for v in feats]))
    with open('test_data_dumps', 'w') as dump:
        dump.write('\n'.join(rows))
    file_pool = Pool('test_data_dumps')
    assert _check_data(array_pool.get_features(), file_pool.get_features())
    assert _check_data(array_pool.get_label(), file_pool.get_label())
def test_load_generated():
    """Features and labels round-trip unchanged through Pool construction."""
    shape = (100, 10)
    features = np.round(np.random.normal(size=shape), decimals=3)
    target = np.random.randint(2, size=shape[0])
    built = Pool(features, target)
    assert _check_data(built.get_features(), features)
    assert _check_data(built.get_label(), target)
def test_non_ones_weight():
    """Train with per-row weights 1..N and canonize the resulting model file."""
    source = Pool(TRAIN_FILE, column_description=CD_FILE)
    weights = np.arange(1, source.num_row() + 1)
    weighted = Pool(source.get_features(), source.get_label(), weight=weights)
    clf = CatBoostClassifier(iterations=2, random_seed=0)
    clf.fit(weighted)
    clf.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
def test_export_to_python_after_load():
    """A python-exported model reproduces predictions of the saved/loaded model."""
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)
    holdout = Pool(TEST_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=40, random_seed=0)
    clf.fit(learn)
    native_pred = clf.predict(holdout, prediction_type='RawFormulaVal')
    clf.save_model(OUTPUT_MODEL_PATH)
    reloaded = CatBoostClassifier()
    reloaded.load_model(OUTPUT_MODEL_PATH)
    reloaded.save_model(OUTPUT_PYTHON_MODEL_PATH, format="python", pool=learn)
    reloaded_pred = reloaded.predict(holdout, prediction_type='RawFormulaVal')
    import sys
    import os.path
    # Make the exported module importable by its directory.
    sys.path.insert(0, os.path.dirname(OUTPUT_PYTHON_MODEL_PATH))
    from model import apply_catboost_model as apply_catboost_model_from_python
    exported_pred = []
    for row in holdout.get_features():
        floats, cats = _split_features(
            row,
            learn.get_cat_feature_indices(),
            holdout.get_cat_feature_hash_to_string())
        exported_pred.append(apply_catboost_model_from_python(floats, cats))
    assert _check_data(native_pred, exported_pred)
    assert _check_data(reloaded_pred, exported_pred)
def test_fit_data():
    """Fit from raw arrays with weights, baseline and eval set; compare canon.

    NOTE(review): a second function with this exact name is defined later in
    the file; that later definition shadows this one, so pytest never runs it.
    """
    learn = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    evaluation = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)
    base = CatBoostClassifier(iterations=2, random_seed=0,
                              loss_function="MultiClass")
    base.fit(learn)
    learn_baseline = np.array(
        base.predict(learn, prediction_type='RawFormulaVal'))
    evaluation.set_baseline(np.array(
        base.predict(evaluation, prediction_type='RawFormulaVal')))
    clf = CatBoostClassifier(iterations=2, random_seed=0,
                             loss_function="MultiClass")
    raw = map_cat_features(learn.get_features(), learn.get_cat_feature_indices())
    clf.fit(raw, learn.get_label(), learn.get_cat_feature_indices(),
            sample_weight=np.arange(1, learn.num_row() + 1),
            baseline=learn_baseline, use_best_model=True, eval_set=evaluation)
    clf.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_zero_baseline():
    """Training with an all-zero baseline still yields a canonical model."""
    source = Pool(TRAIN_FILE, column_description=CD_FILE)
    zero_baseline = np.zeros((source.num_row(), 2))
    pool = Pool(source.get_features(), source.get_label(),
                baseline=zero_baseline)
    clf = CatBoostClassifier(iterations=2, random_seed=0)
    clf.fit(pool)
    clf.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
def test_non_zero_bazeline():
    """Use a base model's predictions as a non-zero baseline for a new model.

    NOTE(review): "bazeline" typo is kept on purpose -- the function name is
    the pytest test id; renaming would change the suite's external interface.
    """
    learn = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    warm = CatBoostClassifier(iterations=2, random_seed=0,
                              loss_function="MultiClass")
    warm.fit(learn)
    baseline = np.array(warm.predict(learn))
    boosted_pool = Pool(learn.get_features(), learn.get_label(),
                        baseline=baseline)
    clf = CatBoostClassifier(iterations=2, random_seed=0)
    clf.fit(boosted_pool)
    clf.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
def test_fit_data():
    """Fit from raw data with weights/baseline/eval-set; compare canonical model.

    NOTE(review): this redefines a test_fit_data declared earlier in the file;
    the earlier definition is shadowed and never collected by pytest. Consider
    renaming one of them so both run.
    """
    pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    holdout = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)
    warm = CatBoostClassifier(iterations=2, random_seed=0,
                              loss_function="MultiClass")
    warm.fit(pool)
    train_baseline = np.array(
        warm.predict(pool, prediction_type='RawFormulaVal'))
    holdout.set_baseline(np.array(
        warm.predict(holdout, prediction_type='RawFormulaVal')))
    final = CatBoostClassifier(iterations=2, random_seed=0,
                               loss_function="MultiClass")
    mapped = map_cat_features(pool.get_features(), pool.get_cat_feature_indices())
    final.fit(mapped, pool.get_label(), pool.get_cat_feature_indices(),
              sample_weight=np.arange(1, pool.num_row() + 1),
              baseline=train_baseline, use_best_model=True, eval_set=holdout)
    final.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_export_model_with_only_float_features_to_python_from_python():
    """Python-exported model (float features only) matches native predictions."""
    learn = Pool(HIGGS_TRAIN_FILE, column_description=HIGGS_CD_FILE)
    holdout = Pool(HIGGS_TEST_FILE, column_description=HIGGS_CD_FILE)
    booster = CatBoost({'iterations': 30, 'random_seed': 0})
    booster.fit(learn)
    native_pred = booster.predict(holdout, prediction_type='RawFormulaVal')
    booster.save_model(OUTPUT_PYTHON_MODEL_PATH, format="python")
    import sys
    import os.path
    # Make the exported module importable by its directory.
    sys.path.insert(0, os.path.dirname(OUTPUT_PYTHON_MODEL_PATH))
    from model import apply_catboost_model as apply_catboost_model_from_python
    exported_pred = [apply_catboost_model_from_python(row)
                     for row in holdout.get_features()]
    assert _check_data(native_pred, exported_pred)
def test_export_model_with_cat_features_to_python_from_app():
    """The app-exported python model applies identically to the binary model."""
    holdout = Pool(TEST_FILE, column_description=CD_FILE)
    booster = CatBoost()
    # NOTE(review): the bundled model resource is written through a text-mode
    # handle; if resource.find returns bytes this needs "wb" on Python 3 --
    # confirm the runtime before changing.
    with open(OUTPUT_MODEL_PATH, "w") as model_file:
        model_file.write(resource.find("cb_adult_model_bin"))
    booster.load_model(OUTPUT_MODEL_PATH)
    native_pred = booster.predict(holdout, prediction_type='RawFormulaVal')
    from adult_model import apply_catboost_model as apply_catboost_model_from_app
    exported_pred = []
    for row in holdout.get_features():
        floats, cats = _split_features(
            row,
            holdout.get_cat_feature_indices(),
            holdout.get_cat_feature_hash_to_string())
        exported_pred.append(apply_catboost_model_from_app(floats, cats))
    assert _check_data(native_pred, exported_pred)
def score(space):
    """Hyperopt objective: train CatBoost and return 1 - weighted F1 as loss.

    Returns a dict {'loss': ..., 'status': STATUS_OK} as hyperopt expects;
    lower loss means a better weighted F1 on the validation split.
    """
    params = get_catboost_params(space)

    train_pool = Pool(X_train, label=y_train)
    valid_pool = Pool(X_test, label=y_test)

    clf = CatBoostClassifier(
        iterations=100000,
        learning_rate=params['learning_rate'],
        depth=int(params['depth']),
        loss_function='CrossEntropy',
        use_best_model=True,
        task_type="CPU",
        eval_metric='AUC',
        classes_count=4,
        l2_leaf_reg=params['l2_leaf_reg'],
        early_stopping_rounds=3000,
        od_type="Iter",
        border_count=int(params['border_count']),
        verbose=False)
    clf.fit(train_pool, eval_set=valid_pool, verbose=False)

    predictions = clf.predict(valid_pool.get_features())
    # Weighted F1 on the validation labels; assumes y_test is a DataFrame with
    # a "pet_category" column -- TODO confirm against the caller.
    f1 = f1_score(y_test["pet_category"], predictions, average="weighted")
    print("\tScore {0}\n\n".format(f1))

    # hyperopt minimizes, so hand back the complementary loss.
    return {'loss': 1 - f1, 'status': STATUS_OK}
def test_load_ndarray():
    """A Pool built from numpy arrays keeps the expected shape."""
    source = Pool(TRAIN_FILE, column_description=CD_FILE)
    cat_idx = source.get_cat_feature_indices()
    features = np.array(map_cat_features(source.get_features(), cat_idx))
    target = np.array(source.get_label())
    assert _check_shape(Pool(features, target, cat_idx))
# Script-level diagnostics: report train/test shapes and label class balance.
# (Runtime strings are in Chinese and intentionally left unchanged.)
print('测试集 X_CAT_test 的shape是',X_CAT_test.shape)
print('训练集 Y_CAT_train 的shape是',Y_CAT_train.shape)
print('测试集 Y_CAT_test 的shape是',Y_CAT_test.shape)
print('训练集中label的正负比例分布如下:\n',Y_CAT_train.value_counts())
print('测试集中label的正负比例分布如下:\n',Y_CAT_test.value_counts())
print('可以看出在划分训练集和测试集时设定strtify参数为Y_CAT;使得在训练测试集中正负例所占比例一致。')
## CatBoost modeling
### Step 1: Pool initialization
from catboost import Pool
pool_data = Pool(data = X_CAT, label = Y_CAT, cat_features = CAT_features)
print('pool_data的 type 是:', type(pool_data))
print('pool_data的 shpe 是:', pool_data.shape)
print('pool_data.get_features()返回的是list类型,其长度是:',len(pool_data.get_features()))
print('pool_data.get_label()返回的是list类型,其长度是:', len(pool_data.get_label()))
print('pool_data中类别变量所在的索引位置是 pool_data.get_cat_feature_indices() :', pool_data.get_cat_feature_indices())
#print('生成的pool_data的各观测的weight:', pool_data.get_weight())
#print('生成的pool_data的各观测的baseline:', pool_data.get_baseline())
#### Step 2.1: custom metric class, used for best-model selection and overfitting detection
# **************Custom metric for overfitting detector and best model selection******
import math
from catboost import Pool, CatBoostClassifier
# NOTE(review): the body of this class continues beyond this chunk boundary.
class Recall_1_Metric(object):
    def get_final_error(self, error, weight):
def test_fit_no_label():
    """Fitting on bare features without any label must raise CatboostError."""
    with pytest.raises(CatboostError):
        features_only = Pool(TRAIN_FILE, column_description=CD_FILE)
        untrained = CatBoostClassifier()
        untrained.fit(features_only.get_features())
def catboost_bootstrap(dir_, learn_name, test_name, cd_file, classes,
                       learning_rate=None, border_count=32, cnt_values=20,
                       file_result_to=sys.stdout, file_info_to=sys.stdout,
                       iterations=1500):
    """Bootstrap-evaluate several pool-preprocessing wrapper classes.

    For each wrapper class in *classes*: train one CatBoost model on the
    wrapped learn pool, then evaluate Logloss/AUC on *cnt_values* bootstrap
    resamples of the test pool. Per-class AUC samples are dumped to
    ``<dir_>/boot.txt`` and pairwise Wilcoxon tests are printed at the end.

    Fix applied (review): the per-iteration progress string was garbled at an
    extraction boundary ('iter={:10}: ' split from 'loss={:.10}'), leaving an
    unterminated string literal; it is rejoined into one valid format string.

    Args:
        dir_: directory holding the data files and receiving the outputs.
        learn_name / test_name / cd_file: file names inside *dir_*.
        classes: wrapper classes exposing WRAPPER_NAME, handle_learn_pool,
            handle_test_matrix, prior and score (project API -- assumed).
        learning_rate: mapping from wrapper class to step size (required
            despite the None default -- indexed unconditionally).
        border_count, cnt_values, iterations: CatBoost / bootstrap settings.
        file_result_to / file_info_to: writable streams for results and info.
    """
    logloss = {}
    auc = {}
    for clazz in classes:
        print('class={}'.format(clazz.WRAPPER_NAME))
        print('class={}; step={}'.format(clazz.WRAPPER_NAME,
                                         learning_rate[clazz]),
              file=file_result_to)
        file_result_to.flush()
        auc[clazz.WRAPPER_NAME] = []
        logloss[clazz.WRAPPER_NAME] = []
        tree_counts = []
        logloss_curves = []
        auc_curves = []
        cl = clazz()
        source_learn_pool = Pool(data=os.path.join(dir_, learn_name),
                                 column_description=os.path.join(dir_, cd_file))
        beg = time.time()
        learn_pool = cl.handle_learn_pool(source_learn_pool)
        end = time.time()
        print('!!!time: {}'.format(end - beg), file=file_info_to)
        print('priors: {}'.format(cl.prior), file=file_info_to)
        print('prior scores: {}'.format(cl.score), file=file_info_to)
        file_info_to.flush()
        source_test_pool = Pool(data=os.path.join(dir_, test_name),
                                column_description=os.path.join(dir_, cd_file))
        source_test_label = np.array(source_test_pool.get_label())
        source_test_features = np.array(source_test_pool.get_features())
        cat = CatBoostClassifier(max_ctr_complexity=1, custom_metric='AUC',
                                 boosting_type='Plain', random_seed=0,
                                 border_count=border_count,
                                 iterations=iterations,
                                 learning_rate=learning_rate[clazz],
                                 thread_count=multiprocessing.cpu_count())
        beg = time.time()
        cat.fit(learn_pool, use_best_model=True)
        end = time.time()
        for seed in range(cnt_values):
            # Deterministic bootstrap resample of the test set.
            idx = list(range(source_test_features.shape[0]))
            np.random.seed(seed * 10 + 300)
            boot_idx = np.random.choice(idx, len(idx), replace=True)
            boot_test_features = source_test_features[boot_idx]
            boot_test_label = source_test_label[boot_idx]
            X, y = cl.handle_test_matrix(boot_test_features, boot_test_label,
                                         False)
            metrics = cat.eval_metrics(Pool(X, y), ['Logloss', 'AUC'],
                                       eval_period=1,
                                       thread_count=multiprocessing.cpu_count())
            for num, loss in enumerate(metrics['Logloss']):
                print('iter={:10}: loss={:.10}'.format(num + 1, loss))
            # Pick the iteration with the best (lowest) bootstrap Logloss.
            cnt_trees = np.argmin(metrics['Logloss'])
            print('choose cnt_trees={}'.format(cnt_trees))
            print('overfit={}; AUC={}; logloss={}'.format(
                cnt_trees, metrics['AUC'][cnt_trees],
                metrics['Logloss'][cnt_trees]), file=file_result_to)
            tree_counts.append(cnt_trees)
            file_result_to.flush()
            logloss_curves.append(metrics['Logloss'])
            auc_curves.append(metrics['AUC'])
            auc[clazz.WRAPPER_NAME].append(metrics['AUC'][cnt_trees])
            logloss[clazz.WRAPPER_NAME].append(metrics['Logloss'][cnt_trees])
        print('class={}, learn_time={}, mean_tree_count={}'.format(
            clazz.WRAPPER_NAME, end - beg,
            sum(tree_counts) / len(tree_counts)), file=file_result_to)
        print('mean_AUC={}, mean_logloss={}'.format(
            sum(auc[clazz.WRAPPER_NAME]) / len(auc[clazz.WRAPPER_NAME]),
            sum(logloss[clazz.WRAPPER_NAME]) / len(logloss[clazz.WRAPPER_NAME])),
            file=file_result_to)
        file_result_to.flush()
        logloss_fig = create_learning_curves_plot(
            logloss_curves, 'logloss {}'.format(clazz.WRAPPER_NAME))
        auc_fig = create_learning_curves_plot(
            auc_curves, 'AUC {}'.format(clazz.WRAPPER_NAME))
        logloss_file = os.path.join(
            dir_, 'fig_{}_{}'.format('Logloss', clazz.WRAPPER_NAME))
        AUC_file = os.path.join(dir_,
                                'fig_{}_{}'.format('AUC', clazz.WRAPPER_NAME))
        plot(logloss_fig, filename=logloss_file, auto_open=False)
        plot(auc_fig, filename=AUC_file, auto_open=False)
    file_name = os.path.join(dir_, 'boot.txt')
    with open(file_name, 'w') as file_to:
        json.dump(auc, file_to)
    # Pairwise significance tests over the bootstrap AUC samples.
    for cl1 in classes:
        for cl2 in classes:
            stat, p_value = wilcoxon(auc[cl1.WRAPPER_NAME],
                                     auc[cl2.WRAPPER_NAME],
                                     zero_method="pratt")
            print('for {} & {}: stat: {}, p_value: {}'.format(
                cl1.WRAPPER_NAME, cl2.WRAPPER_NAME, stat, p_value),
                file=file_result_to)
def catboost_test(dir_, cur_learn_name, cur_test_name, clazz, learning_rate=None, border_count=128, cnt_models=1, file_result_to=sys.stdout, file_info_to=sys.stdout, iterations=1500):
    """Train cnt_models CatBoost classifiers and average their test metrics.

    On first run (when the materialized learn file is missing) the wrapper
    class *clazz* preprocesses the source pools and the result is cached to
    tab-separated files inside *dir_*; later runs load those files directly.
    Returns (mean accuracy, mean AUC, mean Logloss, mean fit time), or four
    zeros when no model was trained (cnt_models == 0).
    """
    full_learn_name = os.path.join(dir_, cur_learn_name)
    full_test_name = os.path.join(dir_, cur_test_name)
    if not os.path.exists(full_learn_name):
        # NOTE(review): learn_name, test_name and cd_file are not parameters
        # of this function -- presumably module-level globals; verify before
        # reusing this code outside its original module.
        source_learn_pool = Pool(data=os.path.join(dir_, learn_name), column_description=os.path.join(dir_, cd_file))
        source_test_pool = Pool(data=os.path.join(dir_, test_name), column_description=os.path.join(dir_, cd_file))
        cl = clazz()
        beg = time.time()
        learn_pool = cl.handle_learn_pool(source_learn_pool)
        test_pool = cl.handle_test_pool(source_test_pool)
        end = time.time()
        print('!!!time: {}'.format(end - beg), file=file_info_to)
        print('priors: {}'.format(cl.prior), file=file_info_to)
        print('prior scores: {}'.format(cl.score), file=file_info_to)
        file_info_to.flush()
        # Materialize the preprocessed learn pool as "<label>\t<features...>".
        learn_label = learn_pool.get_label()
        learn_features = learn_pool.get_features()
        learn_data = np.zeros((len(learn_label), len(learn_features[0]) + 1))
        learn_data[:, 0] = learn_label
        learn_data[:, 1:] = learn_features
        np.savetxt(full_learn_name, learn_data, delimiter='\t', fmt='%.10f')
        # Same layout for the test pool.
        test_label = test_pool.get_label()
        test_features = test_pool.get_features()
        test_data = np.zeros((len(test_label), len(test_features[0]) + 1))
        test_data[:, 0] = test_label
        test_data[:, 1:] = test_features
        np.savetxt(full_test_name, test_data, delimiter='\t', fmt='%.10f')
    # Reload from the cached files so every run sees identical data.
    learn_pool = Pool(data=full_learn_name)
    test_pool = Pool(data=full_test_name)
    scores = []
    auc = []
    logloss = []
    times =[]
    tree_counts = []
    for seed in range(cnt_models):
        print(seed)
        # print(len(learn_pool.get_features()), len(learn_pool.get_features()[0]))
        # print(len(test_pool.get_features()), len(test_pool.get_features()[0]))
        beg = time.time()
        cat = CatBoostClassifier(max_ctr_complexity=1, custom_metric='AUC', boosting_type='Plain', random_seed=seed, border_count=border_count, iterations=iterations, learning_rate=learning_rate, thread_count=multiprocessing.cpu_count())
        cat.fit(learn_pool, eval_set=(test_pool), use_best_model=True)
        end = time.time()
        X_test = test_pool.get_features()
        y_test = test_pool.get_label()
        tree_counts.append(cat.tree_count_)
        scores.append(cat.score(X_test, y_test))
        # eval_period = tree_count_ - 1 so metrics[...][1] is the final model.
        metrics = cat.eval_metrics(test_pool, ['AUC', 'Logloss'], eval_period=cat.tree_count_ - 1)
        print('overfit={}; acc={}; AUC={}; logloss={}; learn_time={}'.format(cat.tree_count_, scores[-1], metrics['AUC'][1], metrics['Logloss'][1], end - beg), file=file_result_to)
        file_result_to.flush()
        auc.append(metrics['AUC'][1])
        logloss.append(metrics['Logloss'][1])
        times.append(end - beg)
    if len(tree_counts) != 0:
        print('mean tree_count: {}'.format(sum(tree_counts)/len(tree_counts)), file=file_result_to)
        return sum(scores)/len(scores), sum(auc)/len(auc), sum(logloss)/len(logloss), sum(times)/len(times)
    else:
        return 0, 0, 0, 0
class CatBoostModel:
    """Wrapper around a persisted CatBoostClassifier plus preprocessing.

    Loads a saved model from "models/Saving/CBmodel.cbm" on construction and
    exposes helpers to retrain, score and predict via a Preprocess pipeline.
    """

    def __init__(self):
        self._preprocess = pp.Preprocess()
        self._model = CatBoostClassifier()
        self._model.load_model("models/Saving/CBmodel.cbm")
        self.x = self.y = 0
        self._train_data = None
        self._test_data = None

    @property
    def model(self):
        """The underlying CatBoostClassifier instance."""
        return self._model

    def set_new_model(self, cbm_model=""):
        """Load a classifier from *cbm_model*; raise IOError on an empty path."""
        if cbm_model == '':
            raise IOError("No path to model")
        self._model.load_model(cbm_model)

    def set_pool(self, path_to_dataset='', test_size=0.3):
        """Split the preprocessed dataset and build the train/test Pools."""
        if path_to_dataset != '':
            self._preprocess.set_dataset(path_to_dataset)
        self.x, self.y = (
            self._preprocess.process_data_for_gradient_with_label())
        train_x, test_x, train_y, test_y = train_test_split(
            self.x, self.y, test_size=test_size, random_state=42)
        self._train_data = Pool(train_x, train_y)
        self._test_data = Pool(test_x, test_y)

    def get_predict_with_label(self, path_to_data=''):
        """Predict on a labelled dataset; raise IOError on an empty path."""
        if path_to_data == '':
            raise IOError("No path to data")
        self._preprocess.set_dataset(path_to_data)
        self.x, self.y = (
            self._preprocess.process_data_for_gradient_with_label())
        return self._model.predict(self.x)

    def relearn_model(self, path_to_dataset='', test_size=0.3):
        """Retrain a fresh classifier on the dataset at *path_to_dataset*."""
        if path_to_dataset == '':
            raise IOError("No path to dataset")
        self.set_pool(path_to_dataset=path_to_dataset, test_size=test_size)
        self._model = CatBoostClassifier(iterations=200, depth=2,
                                         learning_rate=0.4,
                                         loss_function='Logloss',
                                         verbose=False)
        self._model.fit(self._train_data, plot=True)

    def get_test_accuracy(self):
        """Accuracy of the current model on the held-out test Pool."""
        predicted = self._model.predict(self._test_data.get_features())
        return accuracy_score(self._test_data.get_label(), predicted)

    def get_test_auc(self):
        """ROC-AUC of the current model on the held-out test Pool."""
        positive_proba = self._model.predict_proba(
            self._test_data.get_features())[:, 1]
        return roc_auc_score(self._test_data.get_label(), positive_proba)

    def get_predict_unknown(self, path_to_data=''):
        """Predict on an unlabelled dataset; raise IOError on an empty path."""
        if path_to_data == '':
            raise IOError("No path to data")
        self._preprocess.set_dataset(path_to_data)
        self.x = self._preprocess.get_data_for_predict_gradient()
        return self._model.predict(self.x)