def test_shap():
    """Check that per-object SHAP values sum up to the model prediction.

    The first model is only a smoke check that SHAP values can be computed
    for a one-tree regressor. The second model is used for the actual
    assertion and its SHAP values are stored as the canonical result.
    """
    # Smoke check: the computation must simply not fail on a tiny regressor.
    train_pool = Pool([[0, 0], [0, 1], [1, 0], [1, 1]], [0, 1, 5, 8], cat_features=[])
    test_pool = Pool([[0, 0], [0, 1], [1, 0], [1, 1]])
    model = CatBoostRegressor(iterations=1, random_seed=0, max_ctr_complexity=1, depth=2)
    model.fit(train_pool)
    model.get_feature_importance(test_pool, fstr_type='ShapValues')

    dataset = [(0.5, 1.2), (1.6, 0.5), (1.8, 1.0), (0.4, 0.6), (0.3, 1.6), (1.5, 0.2)]
    labels = [1.1, 1.85, 2.3, 0.7, 1.1, 1.6]
    train_pool = Pool(dataset, labels, cat_features=[])

    model = CatBoost({'iterations': 10, 'random_seed': 0, 'max_ctr_complexity': 1})
    model.fit(train_pool)

    testset = [(0.6, 1.2), (1.4, 0.3), (1.5, 0.8), (1.4, 0.6)]
    predictions = model.predict(testset)
    shap_values = model.get_feature_importance(Pool(testset), fstr_type='ShapValues')

    assert len(predictions) == len(shap_values)
    for object_shap, prediction in zip(shap_values, predictions):
        assert abs(sum(object_shap) - prediction) < 1e-9

    # A text-mode file.write() only accepts str, so the values are dumped
    # through numpy (same way as test_shap_feature_importance does), and the
    # canonical file is returned so that it is actually compared.
    np.save(FIMP_PATH, np.array(shap_values))
    return local_canonical_file(FIMP_PATH)
def test_predict_class():
    """Canonize class predictions of a 2-iteration CatBoostClassifier."""
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)
    holdout = Pool(TEST_FILE, column_description=CD_FILE)

    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(learn)

    predicted = np.array(classifier.predict(holdout, prediction_type="Class"))
    np.save(PREDS_PATH, predicted)
    return local_canonical_file(PREDS_PATH)
def test_ntree_limit():
    """Canonize predict_proba output computed with ntree_end=10 on a 100-iteration model."""
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)
    holdout = Pool(TEST_FILE, column_description=CD_FILE)

    classifier = CatBoostClassifier(iterations=100, random_seed=0)
    classifier.fit(learn)

    probabilities = np.array(classifier.predict_proba(holdout, ntree_end=10))
    np.save(PREDS_PATH, probabilities)
    return local_canonical_file(PREDS_PATH)
def test_multiclass():
    """Fit a MultiClass model, round-trip it through save/load and canonize predict_proba."""
    data = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)

    trained = CatBoostClassifier(
        iterations=2,
        random_seed=0,
        loss_function='MultiClass',
        thread_count=8,
    )
    trained.fit(data)
    trained.save_model(OUTPUT_MODEL_PATH)

    # Predictions are taken from a freshly loaded copy of the saved model.
    restored = CatBoostClassifier()
    restored.load_model(OUTPUT_MODEL_PATH)

    probabilities = np.array(restored.predict_proba(data))
    np.save(PREDS_PATH, probabilities)
    return local_canonical_file(PREDS_PATH)
def test_staged_predict():
    """Canonize every item yielded by staged_predict for a 10-iteration classifier."""
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)
    holdout = Pool(TEST_FILE, column_description=CD_FILE)

    classifier = CatBoostClassifier(iterations=10, random_seed=0)
    classifier.fit(learn)

    # Materialize all staged results in the order they are produced.
    staged = list(classifier.staged_predict(holdout))
    np.save(PREDS_PATH, np.array(staged))
    return local_canonical_file(PREDS_PATH)
def test_object_importances():
    """Canonize the scores returned by get_object_importance with top_size=10."""
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)
    holdout = Pool(TEST_FILE, column_description=CD_FILE)

    regressor = CatBoost({'loss_function': 'RMSE', 'iterations': 10, 'random_seed': 0})
    regressor.fit(learn)

    # Only the scores are canonized; the returned indices are not used.
    _, importance_scores = regressor.get_object_importance(holdout, learn, top_size=10)
    np.savetxt(OIMP_PATH, importance_scores)
    return local_canonical_file(OIMP_PATH)
def test_coreml_import_export():
    """Export a QueryRMSE model to CoreML, load it back and compare predictions."""
    learn = Pool(QUERYWISE_TRAIN_FILE, column_description=QUERYWISE_CD_FILE)
    holdout = Pool(QUERYWISE_TEST_FILE, column_description=QUERYWISE_CD_FILE)

    train_params = {
        'loss_function': 'QueryRMSE',
        'random_seed': 0,
        'iterations': 20,
        'thread_count': 8,
    }
    source_model = CatBoost(params=train_params)
    source_model.fit(learn)
    source_model.save_model(OUTPUT_COREML_MODEL_PATH, format="coreml")
    expected = source_model.predict(holdout)

    reloaded = CatBoostRegressor()
    reloaded.load_model(OUTPUT_COREML_MODEL_PATH, format="coreml")

    # The reloaded model must reproduce the original predictions exactly.
    assert all(expected == reloaded.predict(holdout))
    return local_canonical_file(OUTPUT_COREML_MODEL_PATH)
def test_verbose_int(verbose):
    """Check how many stdout lines cv() and train() print for a given `verbose`.

    Stdout is redirected into a temporary file via LogStdout, and the number
    of lines in that file is compared with the expectation for `verbose`.
    """
    # Expected number of printed lines for each supported `verbose` value.
    expected_line_count = {5: 3, False: 0, True: 10}
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    tmpfile = 'test_data_dumps'

    # cv() is exercised first, then train(), with identical parameters.
    for run in (cv, train):
        with LogStdout(open(tmpfile, 'w')):
            run(
                pool,
                {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"},
                verbose=verbose,
            )
        with open(tmpfile, 'r') as captured:
            printed = sum(1 for _ in captured)
            assert printed == expected_line_count[verbose]

    return local_canonical_file(remove_time_from_json(JSON_LOG_PATH))
def test_eval_set():
    """Fit with eval_set given as a (data, labels) tuple and as a list of such tuples."""
    features = [(1, 2, 3, 4), (2, 2, 3, 4), (3, 2, 3, 4), (4, 2, 3, 4)]
    targets = [1, 2, 3, 4]
    learn = Pool(features, targets, cat_features=[0, 3, 2])

    model = CatBoost({
        'learning_rate': 1,
        'loss_function': 'RMSE',
        'iterations': 2,
        'random_seed': 0,
    })

    eval_pair = ([(5, 6, 6, 6), (6, 6, 6, 6)], [5, 6])

    # First a single eval set, then the same set wrapped into a list.
    model.fit(learn, eval_set=eval_pair)
    model.fit(learn, eval_set=[eval_pair])

    return local_canonical_file(remove_time_from_json(JSON_LOG_PATH))
def test_weights_without_bootstrap(boosting_type):
    """Fit on the adult_weight data with '--bootstrap-type No' and canonize the eval file."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    cd_file = data_file('adult_weight', 'train.cd')
    test_file = data_file('adult_weight', 'test_weight')
    train_file = data_file('adult_weight', 'train_weight')

    params = {
        '--use-best-model': 'false',
        '--loss-function': 'Logloss',
        '-f': train_file,
        '-t': test_file,
        '--column-description': cd_file,
        '--boosting-type': boosting_type,
        '-i': '10',
        '-w': '0.03',
        '-T': '4',
        '-r': '0',
        '--bootstrap-type': 'No',
        '-m': model_path,
    }
    fit_catboost_gpu(params)
    apply_catboost(model_path, test_file, cd_file, eval_path)

    canonical = local_canonical_file(eval_path, diff_tool=diff_tool())
    return [canonical]
def test_logloss_with_not_binarized_target(boosting_type):
    """Fit Logloss on the adult_not_binarized data and canonize the eval file."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    cd_file = data_file('adult_not_binarized', 'train.cd')
    test_file = data_file('adult_not_binarized', 'test_small')
    train_file = data_file('adult_not_binarized', 'train_small')

    params = {
        '--use-best-model': 'false',
        '--loss-function': 'Logloss',
        '-f': train_file,
        '-t': test_file,
        '--column-description': cd_file,
        '--boosting-type': boosting_type,
        '-i': '10',
        '-w': '0.03',
        '-T': '4',
        '-r': '0',
        '-m': model_path,
    }
    fit_catboost_gpu(params)
    apply_catboost(model_path, test_file, cd_file, eval_path)

    canonical = local_canonical_file(eval_path)
    return [canonical]
def test_ctr_type(ctr_type, boosting_type):
    """Fit RMSE on adult_crossentropy with the given '--ctr' value and canonize the eval file."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    cd_file = data_file('adult_crossentropy', 'train.cd')
    test_file = data_file('adult_crossentropy', 'test_proba')
    train_file = data_file('adult_crossentropy', 'train_proba')

    data_args = (
        '-f', train_file,
        '-t', test_file,
        '--column-description', cd_file,
    )
    training_args = (
        '--boosting-type', boosting_type,
        '-i', '3',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--ctr', ctr_type,
    )
    params = ('--use-best-model', 'false', '--loss-function', 'RMSE') + data_args + training_args

    fit_catboost_gpu(params)
    apply_catboost(model_path, test_file, cd_file, eval_path)

    canonical = local_canonical_file(eval_path)
    return [canonical]
def test_has_time(boosting_type):
    """Fit Logloss on the adult data with the '--has-time' flag and canonize the eval file."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    cd_file = data_file('adult', 'train.cd')
    test_file = data_file('adult', 'test_small')
    train_file = data_file('adult', 'train_small')

    data_args = (
        '-f', train_file,
        '-t', test_file,
        '--column-description', cd_file,
    )
    training_args = (
        '--boosting-type', boosting_type,
        '-i', '10',
        '-w', '0.03',
        '-T', '4',
        '-r', '0',
        '--has-time',
        '-m', model_path,
    )
    params = ('--use-best-model', 'false', '--loss-function', 'Logloss') + data_args + training_args

    fit_catboost_gpu(params)
    apply_catboost(model_path, test_file, cd_file, eval_path)

    canonical = local_canonical_file(eval_path)
    return [canonical]
def test_verbose_int():
    """Check the number of stdout lines printed by cv() and train() for several `verbose` values.

    Each run redirects stdout into a temporary file via LogStdout and then
    counts the lines in it. The train() runs additionally write separate
    JSON logs, which are returned as canonical files.
    """
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    tmpfile = 'test_data_dumps'

    # Pairs of (verbose value, expected number of printed lines).
    cases = ((5, 2), (False, 0), (True, 10))

    def count_lines(path):
        # Number of lines currently stored in the captured-output file.
        with open(path, 'r') as captured:
            return sum(1 for _ in captured)

    for verbose, expected in cases:
        with LogStdout(open(tmpfile, 'w')):
            cv(
                pool,
                {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"},
                verbose=verbose,
            )
        assert count_lines(tmpfile) == expected

    # One JSON log per train() run: an index is inserted before the last
    # five characters of JSON_LOG_PATH.
    prefix, suffix = JSON_LOG_PATH[:-5], JSON_LOG_PATH[-5:]
    log_files = [prefix + str(i) + suffix for i in range(3)]

    for (verbose, expected), log_file in zip(cases, log_files):
        with LogStdout(open(tmpfile, 'w')):
            train(
                pool,
                {"iterations": 10, "random_seed": 0, "loss_function": "Logloss", "json_log": log_file},
                verbose=verbose,
            )
        assert count_lines(tmpfile) == expected

    return [local_canonical_file(remove_time_from_json(log_file)) for log_file in log_files]
def test_feature_id_fstr():
    """Fit a model, then run 'fstr' with the adult_with_id.cd column description."""
    model_path = yatest.common.test_output_path('adult_model.bin')
    fstr_path = yatest.common.test_output_path('fstr.tsv')

    fit_cmd = (CATBOOST_PATH, 'fit') + (
        '--loss-function', 'Logloss',
        '-f', data_file('adult', 'train_small'),
        '--column-description', data_file('adult', 'train.cd'),
        '-i', '10',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
    )
    yatest.common.execute(fit_cmd)

    fstr_cmd = (CATBOOST_PATH, 'fstr') + (
        '--input-path', data_file('adult', 'train_small'),
        '--column-description', data_file('adult_with_id.cd'),
        '-m', model_path,
        '-o', fstr_path,
    )
    yatest.common.execute(fstr_cmd)

    return local_canonical_file(fstr_path)
def test_class_weight_with_lost_class():
    """Fit MultiClass on cloudness_lost_class with 3 classes and explicit class weights."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')

    data_args = (
        '-f', data_file('cloudness_lost_class', 'train_small'),
        '-t', data_file('cloudness_lost_class', 'test_small'),
        '--column-description', data_file('cloudness_lost_class', 'train.cd'),
    )
    training_args = (
        '-i', '10',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--eval-file', eval_path,
        '--classes-count', '3',
        '--class-weights', '0.5,2,2',
    )
    cmd = (CATBOOST_PATH, 'fit', '--loss-function', 'MultiClass') + data_args + training_args
    yatest.common.execute(cmd)

    canonical = local_canonical_file(eval_path)
    return [canonical]
def test_meta():
    """Run 'fit' with '--name' set and canonize the 'meta.tsv' file."""
    dataset = 'no_split'
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    # Relative path: the file is looked up under this name as is.
    meta_path = 'meta.tsv'

    data_args = (
        '-f', data_file(dataset, 'train_full3'),
        '-t', data_file(dataset, 'test3'),
        '--column-description', data_file(dataset, 'train_full3.cd'),
    )
    training_args = (
        '-i', '10',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--eval-file', eval_path,
        '--name', 'test experiment',
    )
    cmd = (CATBOOST_PATH, 'fit', '--loss-function', 'RMSE') + data_args + training_args
    yatest.common.execute(cmd)

    canonical = local_canonical_file(meta_path)
    return [canonical]
def test_multi_leaf_estimation_method(leaf_estimation_method):
    """Fit MultiClass on cloudness_small with the given leaf estimation method."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')

    data_args = (
        '-f', data_file('cloudness_small', 'train_small'),
        '-t', data_file('cloudness_small', 'test_small'),
        '--column-description', data_file('cloudness_small', 'train.cd'),
    )
    training_args = (
        '-i', '10',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--eval-file', eval_path,
        '--leaf-estimation-method', leaf_estimation_method,
        '--gradient-iterations', '2',
    )
    cmd = (CATBOOST_PATH, 'fit', '--loss-function', 'MultiClass') + data_args + training_args
    yatest.common.execute(cmd)

    canonical = local_canonical_file(eval_path)
    return [canonical]
def test_nan_mode(nan_mode, boosting_type):
    """Fit on the adult_nan data with the given '--nan-mode' and canonize the eval file."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    test_file = data_file('adult_nan', 'test_small')
    cd_file = data_file('adult_nan', 'train.cd')
    train_file = data_file('adult_nan', 'train_small')

    params = {
        '--use-best-model': 'false',
        '-f': train_file,
        '-t': test_file,
        '--column-description': cd_file,
        '--boosting-type': boosting_type,
        '-i': '20',
        '-T': '4',
        '-r': '0',
        '-m': model_path,
        '--nan-mode': nan_mode,
    }
    fit_catboost_gpu(params)
    apply_catboost(model_path, test_file, cd_file, eval_path)

    canonical = local_canonical_file(eval_path)
    return [canonical]
def test_custom_priors(boosting_type):
    """Fit Logloss on the adult data with custom '--ctr' and '--per-feature-ctr' priors."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    test_file = data_file('adult', 'test_small')
    cd_file = data_file('adult', 'train.cd')
    train_file = data_file('adult', 'train_small')

    # CTR descriptions are comma-separated; per-feature groups are ';'-separated.
    ctr_description = (
        'Borders:Prior=-2:Prior=0:Prior=8/3:Prior=1:Prior=-1:Prior=3,'
        'FeatureFreq:Prior=0'
    )
    per_feature_ctr_description = (
        '4:Borders:Prior=0.444,FeatureFreq:Prior=0.444;'
        '6:Borders:Prior=0.666,FeatureFreq:Prior=0.666;'
        '8:Borders:Prior=-0.888:Prior=2/3,FeatureFreq:Prior=-0.888:Prior=0.888'
    )

    data_args = (
        '-f', train_file,
        '-t', test_file,
        '--column-description', cd_file,
    )
    training_args = (
        '--boosting-type', boosting_type,
        '-i', '10',
        '-w', '0.03',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--ctr', ctr_description,
        '--per-feature-ctr', per_feature_ctr_description,
    )
    params = ('--use-best-model', 'false', '--loss-function', 'Logloss') + data_args + training_args

    fit_catboost_gpu(params)
    apply_catboost(model_path, test_file, cd_file, eval_path)

    canonical = local_canonical_file(eval_path)
    return [canonical]
def test_fold_len_mult():
    """Fit Ordered boosting with '--fold-len-multiplier' 1.2 and canonize the eval file."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    cd_file = data_file('adult_not_binarized', 'train.cd')
    test_file = data_file('adult_not_binarized', 'test_small')
    train_file = data_file('adult_not_binarized', 'train_small')

    params = {
        '--use-best-model': 'false',
        '--loss-function': 'Logloss',
        '-f': train_file,
        '-t': test_file,
        '--column-description': cd_file,
        '--boosting-type': 'Ordered',
        '-i': '10',
        '-T': '4',
        '-r': '0',
        # Passed as a float, unlike the other (string) option values.
        '--fold-len-multiplier': 1.2,
        '-m': model_path,
    }
    fit_catboost_gpu(params)
    apply_catboost(model_path, test_file, cd_file, eval_path)

    canonical = local_canonical_file(eval_path)
    return [canonical]
def test_crossentropy(boosting_type):
    """Fit CrossEntropy on the adult_crossentropy data and canonize the eval file."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    cd_file = data_file('adult_crossentropy', 'train.cd')
    test_file = data_file('adult_crossentropy', 'test_proba')
    train_file = data_file('adult_crossentropy', 'train_proba')

    params = {
        '--loss-function': 'CrossEntropy',
        '-f': train_file,
        '-t': test_file,
        '--column-description': cd_file,
        '--boosting-type': boosting_type,
        '-i': '10',
        '-w': '0.03',
        '-T': '4',
        '-r': '0',
        '-m': model_path,
    }
    fit_catboost_gpu(params)
    apply_catboost(model_path, test_file, cd_file, eval_path)

    canonical = local_canonical_file(eval_path)
    return [canonical]
def test_pairlogit_approx_on_full_history():
    """Fit PairLogit on the zen data with '--approx-on-full-history' and canonize the eval file."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')

    data_args = (
        '-f', data_file('zen', 'learn_small.tsv'),
        '-t', data_file('zen', 'test_small.tsv'),
        '--column-description', data_file('zen', 'zen.cd'),
        '--learn-pairs', data_file('zen', 'learn_pairs.tsv'),
        '--test-pairs', data_file('zen', 'test_pairs.tsv'),
    )
    training_args = (
        '--approx-on-full-history',
        '-i', '20',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--eval-file', eval_path,
    )
    cmd = (CATBOOST_PATH, 'fit', '--loss-function', 'PairLogit') + data_args + training_args
    yatest.common.execute(cmd)

    canonical = local_canonical_file(eval_path)
    return [canonical]
def test_custom_priors():
    """Fit Logloss on the adult data with the '--priors' family of options."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')

    data_args = (
        '-f', data_file('adult', 'train_small'),
        '-t', data_file('adult', 'test_small'),
        '--column-description', data_file('adult', 'train.cd'),
    )
    training_args = (
        '-i', '10',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
    )
    prior_args = (
        '--priors', '-2:0:8:1:-1:3',
        '--ctr-priors', '0:0.111,1:0.222',
        '--feature-priors', '4:0.444,6:0.666,8:-0.888:0.888',
        '--feature-ctr-priors', '4:0:0.4040,8:1:0.8181',
    )
    cmd = (
        (CATBOOST_PATH, 'fit', '--loss-function', 'Logloss')
        + data_args
        + training_args
        + prior_args
        + ('--eval-file', eval_path)
    )
    yatest.common.execute(cmd)

    canonical = local_canonical_file(eval_path)
    return [canonical]
def test_shap():
    """Check that per-object SHAP values sum up to the model prediction.

    A one-tree regressor is used first as a smoke check; the second model is
    used for the assertion, and its SHAP values are canonized as text.
    """
    corner_points = [[0, 0], [0, 1], [1, 0], [1, 1]]
    train_pool = Pool(corner_points, [0, 1, 5, 8], cat_features=[])
    test_pool = Pool([[0, 0], [0, 1], [1, 0], [1, 1]])
    model = CatBoostRegressor(iterations=1, random_seed=0, max_ctr_complexity=1, depth=2)
    model.fit(train_pool)
    # This result is overwritten below; the call only has to succeed.
    shap_values = model.get_feature_importance(fstr_type=EFstrType.ShapValues, data=test_pool)

    features = [(0.5, 1.2), (1.6, 0.5), (1.8, 1.0), (0.4, 0.6), (0.3, 1.6), (1.5, 0.2)]
    targets = [1.1, 1.85, 2.3, 0.7, 1.1, 1.6]
    train_pool = Pool(features, targets, cat_features=[])

    model = CatBoost({'iterations': 10, 'random_seed': 0, 'max_ctr_complexity': 1})
    model.fit(train_pool)

    testset = [(0.6, 1.2), (1.4, 0.3), (1.5, 0.8), (1.4, 0.6)]
    predictions = model.predict(testset)
    shap_values = model.get_feature_importance(fstr_type=EFstrType.ShapValues, data=Pool(testset))

    assert len(predictions) == len(shap_values)
    for idx, prediction in enumerate(predictions):
        shap_total = sum(shap_values[idx])
        assert abs(shap_total - prediction) < 1e-9

    np.savetxt(FIMP_TXT_PATH, shap_values)
    return local_canonical_file(FIMP_TXT_PATH)
def test_only_categorical_features():
    """Fit Logloss on the adult data using the adult_all_categorical.cd column description."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')

    data_args = (
        '-f', data_file('adult', 'train_small'),
        '-t', data_file('adult', 'test_small'),
        '--column-description', data_file('adult_all_categorical.cd'),
    )
    training_args = (
        '-i', '100',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--eval-file', eval_path,
        '-x', '1',
        '-n', '8',
        '-w', '0.1',
    )
    cmd = (CATBOOST_PATH, 'fit', '--loss-function', 'Logloss') + data_args + training_args
    yatest.common.execute(cmd)

    canonical = local_canonical_file(eval_path)
    return [canonical]
def test_eval_set():
    """Fit with eval_set given as a (data, labels) tuple and as a list of such tuples."""
    features = [(1, 2, 3, 4), (2, 2, 3, 4), (3, 2, 3, 4), (4, 2, 3, 4)]
    targets = [1, 2, 3, 4]
    learn = Pool(features, targets, cat_features=[0, 3, 2])

    train_params = {'learning_rate': 1, 'loss_function': 'RMSE', 'iterations': 2, 'random_seed': 0}
    model = CatBoost(train_params)

    eval_features = [(5, 6, 6, 6), (6, 6, 6, 6)]
    eval_targets = [5, 6]
    eval_pair = (eval_features, eval_targets)

    # Both accepted forms are exercised: a single pair and a list of pairs.
    for eval_set in (eval_pair, [eval_pair]):
        model.fit(learn, eval_set=eval_set)

    return local_canonical_file(remove_time_from_json(JSON_LOG_PATH))
def test_fstr(fstr_type, boosting_type):
    """Fit a model via fit_catboost_gpu and canonize the output of fstr_catboost_cpu."""
    model_path = yatest.common.test_output_path('adult_model.bin')
    fstr_path = yatest.common.test_output_path('fstr.tsv')

    base_fit_params = (
        '--use-best-model', 'false',
        '--loss-function', 'Logloss',
        '-f', data_file('adult', 'train_small'),
        '--column-description', data_file('adult', 'train.cd'),
        '--boosting-type', boosting_type,
        '-i', '10',
        '-w', '0.03',
        '-T', '4',
        '-r', '0',
        '--one-hot-max-size', '10',
        '-m', model_path,
    )
    # For ShapValues the model is additionally fitted with '--max-ctr-complexity 1'.
    shap_only_params = ('--max-ctr-complexity', '1') if fstr_type == 'ShapValues' else ()
    fit_catboost_gpu(base_fit_params + shap_only_params)

    fstr_params = (
        '--input-path', data_file('adult', 'train_small'),
        '--column-description', data_file('adult', 'train.cd'),
        '-m', model_path,
        '-o', fstr_path,
        '--fstr-type', fstr_type,
    )
    fstr_catboost_cpu(fstr_params)

    return local_canonical_file(fstr_path)
def test_dist_train_many_trees(dev_score_calc_obj_block_size):
    """Run execute_dist_train for 1000 iterations on the higgs data and canonize the eval file."""
    train_path = data_file('higgs', 'train_small')
    test_path = data_file('higgs', 'test_small')
    cd_path = data_file('higgs', 'train.cd')
    eval_path = yatest.common.test_output_path('test.eval')

    data_args = (
        '-f', train_path,
        '-t', test_path,
        '--column-description', cd_path,
    )
    training_args = (
        '-i', '1000',
        '-w', '0.03',
        '-T', '4',
        '--random-strength', '0',
        '--has-time',
        '--bootstrap-type', 'No',
        '--dev-score-calc-obj-block-size', dev_score_calc_obj_block_size,
    )
    cmd = (CATBOOST_PATH, 'fit', '--loss-function', 'Logloss') + data_args + training_args

    # The eval-file option is appended to the base command for the run.
    execute_dist_train(cmd + ('--eval-file', eval_path))

    canonical = local_canonical_file(eval_path)
    return [canonical]
def test_quantized_pool(loss_function, boosting_type):
    """Fit on 'quantized://' train/test pools, then apply the model to the TSV test file."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')

    scheme = 'quantized://'
    quantized_train = scheme + data_file('quantized_adult', 'train.qbin')
    quantized_test = scheme + data_file('quantized_adult', 'test.qbin')

    params = (
        '--use-best-model', 'false',
        '--loss-function', loss_function,
        '-f', quantized_train,
        '-t', quantized_test,
        '--boosting-type', boosting_type,
        '-i', '10',
        '-w', '0.03',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
    )
    fit_catboost_gpu(params)

    # The model is applied to the plain TSV test file with its column description.
    cd_file = data_file('quantized_adult', 'pool.cd')
    test_file = data_file('quantized_adult', 'test_small.tsv')
    apply_catboost(model_path, test_file, cd_file, eval_path)

    canonical = local_canonical_file(eval_path)
    return [canonical]
def test_custom_priors():
    """Fit Logloss on the adult data with custom '--ctr' and '--per-feature-ctr' priors."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')

    # CTR descriptions are comma-separated; per-feature groups are ';'-separated.
    ctr_description = (
        'Borders:Prior=-2:Prior=0:Prior=8:Prior=1:Prior=-1:Prior=3,'
        'Counter:Prior=0'
    )
    per_feature_ctr_description = (
        '4:Borders:Prior=0.444,Counter:Prior=0.444;'
        '6:Borders:Prior=0.666,Counter:Prior=0.666;'
        '8:Borders:Prior=-0.888:Prior=0.888,Counter:Prior=-0.888:Prior=0.888'
    )

    data_args = (
        '-f', data_file('adult', 'train_small'),
        '-t', data_file('adult', 'test_small'),
        '--column-description', data_file('adult', 'train.cd'),
    )
    training_args = (
        '-i', '10',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--ctr', ctr_description,
        '--per-feature-ctr', per_feature_ctr_description,
        '--eval-file', eval_path,
    )
    cmd = (CATBOOST_PATH, 'fit', '--loss-function', 'Logloss') + data_args + training_args
    yatest.common.execute(cmd)

    canonical = local_canonical_file(eval_path)
    return [canonical]
def test_weight_sampling_per_tree():
    """Fit Logloss on the adult data with '--weight-sampling-frequency PerTree'."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    learn_error_path = yatest.common.test_output_path('learn_error.tsv')
    test_error_path = yatest.common.test_output_path('test_error.tsv')

    data_args = (
        '-f', data_file('adult', 'train_small'),
        '-t', data_file('adult', 'test_small'),
        '--column-description', data_file('adult', 'train.cd'),
    )
    training_args = (
        '-i', '10',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--eval-file', eval_path,
        '--learn-err-log', learn_error_path,
        '--test-err-log', test_error_path,
        '--weight-sampling-frequency', 'PerTree',
    )
    cmd = (CATBOOST_PATH, 'fit', '--loss-function', 'Logloss') + data_args + training_args
    yatest.common.execute(cmd)

    # Only the eval file is canonized; a single object (not a list) is returned.
    return local_canonical_file(eval_path)
def compare_canonical_models(*args, **kwargs):
    """Call local_canonical_file with diff_tool fixed to model_diff_tool.

    All positional and keyword arguments are forwarded unchanged.
    """
    canonical = local_canonical_file(*args, diff_tool=model_diff_tool, **kwargs)
    return canonical
def test_python_export_no_cat_features():
    """Save a model trained on the querywise pool in 'python' format and canonize the file."""
    learn = Pool(QUERYWISE_TRAIN_FILE, column_description=QUERYWISE_CD_FILE)

    train_params = {'iterations': 2, 'random_seed': 0, 'loss_function': 'RMSE'}
    exported = CatBoost(train_params)
    exported.fit(learn)
    exported.save_model(OUTPUT_PYTHON_MODEL_PATH, format="python")

    return local_canonical_file(OUTPUT_PYTHON_MODEL_PATH)
def test_feature_importance():
    """Canonize feature_importances_ of a 5-iteration classifier."""
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)

    classifier = CatBoostClassifier(iterations=5, random_seed=0)
    classifier.fit(learn)

    importances = np.array(classifier.feature_importances_)
    np.save(FIMP_PATH, importances)
    return local_canonical_file(FIMP_PATH)
def test_cv_with_not_binarized_target():
    """Run cv() with Logloss on the adult_not_binarized data and canonize the JSON log."""
    dataset = 'adult_not_binarized'
    train_file = data_file(dataset, 'train_small')
    column_description = data_file(dataset, 'train.cd')
    pool = Pool(train_file, column_description=column_description)

    cv_params = {"iterations": 5, "random_seed": 0, "loss_function": "Logloss"}
    cv(pool, cv_params)

    return local_canonical_file(remove_time_from_json(JSON_LOG_PATH))
def test_python_export_with_cat_features():
    """Save a model trained on TRAIN_FILE in 'python' format and canonize the file."""
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)

    train_params = {'iterations': 20, 'random_seed': 0}
    exported = CatBoost(train_params)
    exported.fit(learn)
    exported.save_model(OUTPUT_PYTHON_MODEL_PATH, format="python")

    return local_canonical_file(OUTPUT_PYTHON_MODEL_PATH)
def test_one_doc_feature_importance():
    """Canonize 'Doc' feature importance computed for a single all-ones object."""
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    classifier = CatBoostClassifier(iterations=5, random_seed=0)
    classifier.fit(pool)

    # A single object with every column set to 1, passed with label 0.
    single_object = np.ones(pool.num_col(), dtype=int)
    doc_importance = classifier.get_feature_importance(
        single_object,
        0,
        cat_features=pool.get_cat_feature_indices(),
        fstr_type='Doc',
    )

    np.save(FIMP_PATH, np.array(doc_importance))
    return local_canonical_file(FIMP_PATH)
def test_cv_logging():
    """Run cv() with Logloss for 5 iterations and canonize the JSON log."""
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    cv_params = {"iterations": 5, "random_seed": 0, "loss_function": "Logloss"}
    cv(pool, cv_params)

    return local_canonical_file(remove_time_from_json(JSON_LOG_PATH))
def test_train_on_binarized_equal_train_on_float(boosting_type, qwise_loss):
    """Check that fitting with pre-computed borders gives the same predictions.

    The first fit writes a borders file; the second fit reads that file as
    its input borders. Predictions of both models on the learn and test sets
    must be identical files.
    """
    out_path = yatest.common.test_output_path

    model_path = out_path('model.bin')
    model_path_binarized = out_path('model_binarized.bin')
    test_error_path = out_path('test_error.tsv')
    learn_error_path = out_path('learn_error.tsv')

    borders_file = out_path('borders.tsv')
    borders_file_output = borders_file + '.out'

    predictions_learn = out_path('predictions_learn.tsv')
    predictions_learn_binarized = out_path('predictions_learn_binarized.tsv')
    predictions_test = out_path('predictions_test.tsv')
    predictions_test_binarized = out_path('predictions_test_binarized.tsv')

    learn_file = data_file('querywise', 'train')
    cd_file = data_file('querywise', 'train.cd')
    test_file = data_file('querywise', 'test')

    params = {
        "--loss-function": qwise_loss,
        "-f": learn_file,
        "-t": test_file,
        '--column-description': cd_file,
        '--boosting-type': boosting_type,
        '-i': '100',
        '-T': '4',
        '-r': '0',
        '-m': model_path,
        '--learn-err-log': learn_error_path,
        '--test-err-log': test_error_path,
        '--use-best-model': 'false',
        '--output-borders-file': borders_file_output,
    }

    # Second run: read the borders produced by the first run, write borders
    # and the model to separate files.
    params_binarized = dict(params)
    params_binarized.update({
        '--input-borders-file': borders_file_output,
        '--output-borders-file': borders_file,
        '-m': model_path_binarized,
    })

    fit_catboost_gpu(params)
    apply_catboost(model_path, learn_file, cd_file, predictions_learn)
    apply_catboost(model_path, test_file, cd_file, predictions_test)

    # learn_error_path and test_error_path already exist after the first
    # fit_catboost_gpu() call and would be automatically marked as input_data
    # for the YT operation, which would lead to an error because input files
    # are available only for reading.
    # That's why we explicitly drop the files from input_data and implicitly
    # add them to output_data.
    excluded_inputs = {learn_error_path: None, test_error_path: None}
    fit_catboost_gpu(params_binarized, input_data=excluded_inputs)

    apply_catboost(model_path_binarized, learn_file, cd_file, predictions_learn_binarized)
    apply_catboost(model_path_binarized, test_file, cd_file, predictions_test_binarized)

    assert filecmp.cmp(predictions_learn, predictions_learn_binarized)
    assert filecmp.cmp(predictions_test, predictions_test_binarized)

    canonized_paths = (
        learn_error_path,
        test_error_path,
        predictions_test,
        predictions_learn,
        borders_file,
    )
    return [local_canonical_file(path, diff_tool=diff_tool()) for path in canonized_paths]
def test_shap_feature_importance():
    """Canonize 'ShapValues' feature importance of a 5-iteration classifier."""
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    classifier = CatBoostClassifier(iterations=5, random_seed=0, max_ctr_complexity=1)
    classifier.fit(pool)

    shap_values = classifier.get_feature_importance(pool, fstr_type='ShapValues')
    np.save(FIMP_PATH, np.array(shap_values))
    return local_canonical_file(FIMP_PATH)
def test_interaction_feature_importance():
    """Canonize 'Interaction' feature importance of a 5-iteration classifier."""
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    classifier = CatBoostClassifier(iterations=5, random_seed=0)
    classifier.fit(pool)

    interactions = classifier.get_feature_importance(pool, fstr_type='Interaction')
    np.save(FIMP_PATH, np.array(interactions))
    return local_canonical_file(FIMP_PATH)