예제 #1
0
def with_pairs_with_eval_set():
    """Train a PairLogit model with learn/eval pairs and dump eval predictions.

    Uses the ``querywise`` dataset under ``CATBOOST_TEST_DATA_DIR``. Both pairs
    files are passed with the ``dsv-grouped://`` scheme prefix. Predictions on
    the eval pool are written to ``regression_with_pairs_with_eval_set.json``
    inside ``OUTPUT_DIR``.

    NOTE(review): ``utils.run_dist_train`` is defined elsewhere; it is assumed
    to return a fitted model of ``model_class`` -- confirm against the helper.
    """
    data_path = os.path.join(CATBOOST_TEST_DATA_DIR, "querywise")
    learn_set_path = os.path.join(data_path, "train")
    eval_set_path = os.path.join(data_path, "test")
    cd_path = os.path.join(data_path, "train.cd")

    learn_set_pairs_path_with_scheme = 'dsv-grouped://' + os.path.join(
        data_path, "train.grouped_pairs")
    eval_set_pairs_path_with_scheme = 'dsv-grouped://' + os.path.join(
        data_path, "test.grouped_pairs")

    train_args = [
        '--iterations', '20',
        '--loss-function', 'PairLogit',
        '--learn-set', learn_set_path,
        '--learn-pairs', learn_set_pairs_path_with_scheme,
        '--test-set', eval_set_path,
        '--test-pairs', eval_set_pairs_path_with_scheme,
        '--cd', cd_path,
    ]
    model = utils.run_dist_train(train_args, model_class=cb.CatBoostRegressor)

    eval_pool = cb.Pool(eval_set_path,
                        column_description=cd_path,
                        pairs=eval_set_pairs_path_with_scheme)

    result = {'prediction': model.predict(eval_pool).tolist()}

    # Use a context manager so the output file is flushed and closed
    # deterministically instead of relying on garbage collection.
    output_path = os.path.join(OUTPUT_DIR,
                               'regression_with_pairs_with_eval_set.json')
    with open(output_path, 'w') as output_file:
        json.dump(result, fp=output_file, allow_nan=True, indent=2)
예제 #2
0
def simple_on_dataframe():
    """Train an RMSE model on a tiny in-line numeric dataset and dump predictions.

    Six rows of four float columns are written to a temporary TSV file; the
    column description marks column 3 as ``Target``. Predictions on the
    training pool are written to ``regression_simple_on_dataframe.json`` inside
    ``OUTPUT_DIR``. Temporary files are removed in all cases.

    NOTE(review): ``utils.run_dist_train`` / ``utils.object_list_to_tsv`` are
    defined elsewhere; their exact behaviour is assumed from their names.
    """
    # mkstemp returns an *open* OS-level descriptor together with the path.
    # Close it right away: only the path is used below, and leaving it open
    # leaks a descriptor (and blocks os.remove on Windows).
    learn_set_fd, learn_set_path = tempfile.mkstemp(
        prefix='catboost_learn_set_')
    os.close(learn_set_fd)
    cd_fd, cd_path = tempfile.mkstemp(prefix='catboost_cd_')
    os.close(cd_fd)

    try:
        learn_rows = [
            (0.1, 0.2, 0.11, 0.12),
            (0.97, 0.82, 0.33, 1.1),
            (0.13, 0.22, 0.23, 2.1),
            (0.14, 0.18, 0.1, 0.0),
            (0.9, 0.67, 0.17, -1.0),
            (0.66, 0.1, 0.31, 0.62),
        ]
        utils.object_list_to_tsv(learn_rows, learn_set_path)
        with open(cd_path, 'w') as cd:
            cd.write('3\tTarget')

        model = utils.run_dist_train(
            [
                '--iterations', '20',
                '--loss-function', 'RMSE',
                '--learn-set', learn_set_path,
                '--cd', cd_path,
            ],
            model_class=cb.CatBoostRegressor)
        train_pool = cb.Pool(learn_set_path, column_description=cd_path)

        result = {'prediction': model.predict(train_pool).tolist()}

        # Context manager guarantees the JSON file is flushed and closed.
        output_path = os.path.join(OUTPUT_DIR,
                                   'regression_simple_on_dataframe.json')
        with open(output_path, 'w') as output_file:
            json.dump(result, fp=output_file, allow_nan=True, indent=2)

    finally:
        os.remove(learn_set_path)
        os.remove(cd_path)
예제 #3
0
def simple1():
    """Train an RMSE model on a small dataset with group/weight columns.

    The temporary learn set has seven columns per row; the column description
    assigns column 3 to ``Target``, 4 to ``GroupId``, 5 to ``SubgroupId`` and
    6 to ``Weight``. Predictions on the training pool are written to
    ``regression_simple1.json`` inside ``OUTPUT_DIR``. Temporary files are
    removed in all cases.

    NOTE(review): ``utils.run_dist_train`` / ``utils.object_list_to_tsv`` are
    defined elsewhere; their exact behaviour is assumed from their names.
    """
    # Close the descriptors returned by mkstemp; only the paths are needed and
    # an unclosed descriptor is a resource leak.
    learn_set_fd, learn_set_path = tempfile.mkstemp(
        prefix='catboost_learn_set_')
    os.close(learn_set_fd)
    cd_fd, cd_path = tempfile.mkstemp(prefix='catboost_cd_')
    os.close(cd_fd)

    try:
        learn_rows = [
            (0.13, 0.22, 0.23, "0.34", "query1", "Site9", 1.0),
            (0.1, 0.2, 0.11, "0.12", "query0", "site1", 0.12),
            (0.97, 0.82, 0.33, "0.22", "query0", "site22", 0.18),
            (0.9, 0.67, 0.17, "0.01", "Query 2", "site22", 1.0),
            (0.66, 0.1, 0.31, "0.0", "Query 2", "Site45", 2.0),
            (0.14, 0.18, 0.1, "0.42", "Query 2", "site12", 0.45),
        ]
        utils.object_list_to_tsv(learn_rows, learn_set_path)
        with open(cd_path, 'w') as cd:
            cd.write("3\tTarget\n" + "4\tGroupId\n" + "5\tSubgroupId\n" +
                     "6\tWeight\n")

        model = utils.run_dist_train(
            [
                '--iterations', '20',
                '--loss-function', 'RMSE',
                '--learn-set', learn_set_path,
                '--cd', cd_path,
            ],
            model_class=cb.CatBoostRegressor)
        train_pool = cb.Pool(learn_set_path, column_description=cd_path)

        result = {'prediction': model.predict(train_pool).tolist()}

        # Context manager guarantees the JSON file is flushed and closed.
        output_path = os.path.join(OUTPUT_DIR, 'regression_simple1.json')
        with open(output_path, 'w') as output_file:
            json.dump(result, fp=output_file, allow_nan=True, indent=2)

    finally:
        os.remove(learn_set_path)
        os.remove(cd_path)
예제 #4
0
def overfitting_detector():
    """Train RMSE models with each overfitting-detector type and dump predictions.

    For ``od_type`` in ``IncToDec`` and ``Iter`` a model is trained on the
    ``querywise`` dataset with 200 iterations and the matching detector option
    (``--od-pval 1.0e-2`` for ``IncToDec``, ``--od-wait 20`` for ``Iter``).
    Eval-set predictions are stored under ``prediction_<od_type>`` and written
    to ``regression_overfitting_detector.json`` inside ``OUTPUT_DIR``.

    NOTE(review): ``utils.run_dist_train`` is defined elsewhere; it is assumed
    to return a fitted model of ``model_class``.
    """
    data_path = os.path.join(CATBOOST_TEST_DATA_DIR, "querywise")
    learn_set_path = os.path.join(data_path, "train")
    eval_set_path = os.path.join(data_path, "test")
    cd_path = os.path.join(data_path, "train.cd")

    eval_pool = cb.Pool(eval_set_path, column_description=cd_path)

    result = {}

    for od_type in ['IncToDec', 'Iter']:
        # Each detector type takes its own tuning option.
        if od_type == 'Iter':
            od_params = ['--od-wait', '20']
        else:
            od_params = ['--od-pval', '1.0e-2']

        train_args = [
            '--iterations', '200',
            '--od-type', od_type,
            '--loss-function', 'RMSE',
            '--learn-set', learn_set_path,
            '--test-set', eval_set_path,
            '--cd', cd_path,
        ] + od_params
        model = utils.run_dist_train(train_args,
                                     model_class=cb.CatBoostRegressor)

        result[f'prediction_{od_type}'] = model.predict(eval_pool).tolist()

    # Context manager guarantees the JSON file is flushed and closed.
    output_path = os.path.join(OUTPUT_DIR,
                               'regression_overfitting_detector.json')
    with open(output_path, 'w') as output_file:
        json.dump(result, fp=output_file, allow_nan=True, indent=2)
예제 #5
0
def interaction():
    """Dump feature-interaction importances of a QueryRMSE model to JSON.

    Trains on the ``querywise`` dataset, requests
    ``cb.EFstrType.Interaction`` importances and writes a list of
    ``{"firstFeatureIndex", "secondFeatureIndex", "score"}`` objects to
    ``feature_importance_interaction.json`` inside ``OUTPUT_DIR``.

    NOTE(review): ``utils.run_dist_train`` is defined elsewhere; it is assumed
    to return a fitted model of ``model_class``.
    """
    dataset_dir = os.path.join(CATBOOST_TEST_DATA_DIR, 'querywise')
    learn_set_path = os.path.join(dataset_dir, "train")
    cd_path = os.path.join(dataset_dir, "train.cd")

    model = utils.run_dist_train(
        [
            '--iterations', '20',
            '--loss-function', 'QueryRMSE',
            '--learn-set', learn_set_path,
            '--cd', cd_path,
        ],
        model_class=cb.CatBoostRegressor)

    interactions = model.get_feature_importance(type=cb.EFstrType.Interaction)

    # Each row unpacks to (first index, second index, score); the indices are
    # cast to int so they are serialized as JSON integers.
    result = [
        {
            "firstFeatureIndex": int(first_feature_index),
            "secondFeatureIndex": int(second_feature_index),
            "score": score
        }
        for first_feature_index, second_feature_index, score in interactions
    ]

    # Context manager guarantees the JSON file is flushed and closed.
    output_path = os.path.join(OUTPUT_DIR,
                               'feature_importance_interaction.json')
    with open(output_path, 'w') as output_file:
        json.dump(result, fp=output_file, allow_nan=True, indent=2)
예제 #6
0
def prediction_diff():
    """Dump PredictionDiff feature importances (plain and prettified) to JSON.

    Trains an RMSE model on the ``higgs`` ``train_small`` set and computes
    ``cb.EFstrType.PredictionDiff`` on the first two feature rows of the
    training pool. The plain result is stored under ``simple``; the prettified
    result is converted to a list of ``{"featureName", "importance"}`` objects
    under ``prettified``. Output goes to
    ``feature_importance_prediction_diff.json`` inside ``OUTPUT_DIR``.

    NOTE(review): ``utils.run_dist_train`` is defined elsewhere; it is assumed
    to return a fitted model of ``model_class``.
    """
    dataset_dir = os.path.join(CATBOOST_TEST_DATA_DIR, 'higgs')
    learn_set_path = os.path.join(dataset_dir, "train_small")
    cd_path = os.path.join(dataset_dir, "train.cd")

    model = utils.run_dist_train(
        [
            '--iterations', '20',
            '--loss-function', 'RMSE',
            '--learn-set', learn_set_path,
            '--cd', cd_path,
        ],
        model_class=cb.CatBoostRegressor)
    train_pool = cb.Pool(learn_set_path, column_description=cd_path)

    result = {}

    result['simple'] = model.get_feature_importance(
        type=cb.EFstrType.PredictionDiff,
        data=train_pool.get_features()[:2]).tolist()

    prettified_result = model.get_feature_importance(
        type=cb.EFstrType.PredictionDiff,
        data=train_pool.get_features()[:2],
        prettified=True)

    # Rows are looked up by index label 0..n-1, exactly as before.
    # NOTE(review): assumes the prettified frame has a default integer index.
    feature_ids = prettified_result['Feature Id']
    importances = prettified_result['Importances']
    result['prettified'] = [{
        "featureName": feature_ids[i],
        "importance": importances[i]
    } for i in range(len(prettified_result.index))]

    # Context manager guarantees the JSON file is flushed and closed.
    output_path = os.path.join(OUTPUT_DIR,
                               'feature_importance_prediction_diff.json')
    with open(output_path, 'w') as output_file:
        json.dump(result, fp=output_file, allow_nan=True, indent=2)
예제 #7
0
def with_eval_sets():
    """Train an RMSE model with two eval sets and dump predictions for both.

    The learn set and two eval sets are written to temporary TSV files; the
    column description assigns column 3 to ``Target``, 4 to ``GroupId``, 5 to
    ``SubgroupId`` and 6 to ``Weight``. Predictions for each eval pool are
    stored under ``prediction0`` / ``prediction1`` and written to
    ``regression_with_eval_sets.json`` inside ``OUTPUT_DIR``. Temporary files
    are removed in all cases.

    NOTE(review): ``utils.run_dist_train`` / ``utils.object_list_to_tsv`` are
    defined elsewhere; their exact behaviour is assumed from their names.
    """
    # mkstemp returns an open descriptor plus the path; close each descriptor
    # immediately because only the paths are used (otherwise it leaks).
    learn_set_fd, learn_set_path = tempfile.mkstemp(
        prefix='catboost_learn_set_')
    os.close(learn_set_fd)

    eval_sets_paths = []
    for _ in range(2):
        eval_set_fd, eval_set_tmp_path = tempfile.mkstemp(
            prefix='catboost_eval_set_')
        os.close(eval_set_fd)
        eval_sets_paths.append(eval_set_tmp_path)

    cd_fd, cd_path = tempfile.mkstemp(prefix='catboost_cd_')
    os.close(cd_fd)

    try:
        utils.object_list_to_tsv(
            [(0.13, 0.22, 0.23, "0.34", "query1", "Site9", 1.0),
             (0.1, 0.2, 0.11, "0.12", "query0", "site1", 0.12),
             (0.97, 0.82, 0.33, "0.22", "query0", "site22", 0.18),
             (0.9, 0.67, 0.17, "0.01", "Query 2", "site22", 1.0),
             (0.66, 0.1, 0.31, "0.0", "Query 2", "Site45", 2.0),
             (0.14, 0.18, 0.1, "0.42", "Query 2", "site12", 0.45)],
            learn_set_path)
        utils.object_list_to_tsv(
            [(0.0, 0.33, 1.1, "0.22", "query3", "site1", 0.1),
             (0.02, 0.0, 0.38, "0.11", "query5", "Site9", 1.0),
             (0.86, 0.54, 0.9, "0.48", "query4", "site22", 0.17)],
            eval_sets_paths[0])
        utils.object_list_to_tsv(
            [(0.12, 0.28, 2.2, "0.1", "query3", "site1", 0.11),
             (0.0, 0.0, 0.92, "0.9", "query5", "Site9", 1.1),
             (0.13, 2.1, 0.45, "0.88", "query5", "Site33", 1.2),
             (0.17, 0.11, 0.0, "0.0", "Query12", "site22", 1.0)],
            eval_sets_paths[1])
        with open(cd_path, 'w') as cd:
            cd.write("3\tTarget\n" + "4\tGroupId\n" + "5\tSubgroupId\n" +
                     "6\tWeight\n")

        model = utils.run_dist_train(
            [
                '--iterations', '20',
                '--loss-function', 'RMSE',
                '--learn-set', learn_set_path,
                '--test-set', eval_sets_paths[0],
                '--test-set', eval_sets_paths[1],
                '--cd', cd_path,
            ],
            model_class=cb.CatBoostRegressor)
        eval_pools = [
            cb.Pool(eval_set_path, column_description=cd_path)
            for eval_set_path in eval_sets_paths
        ]

        result = {
            f'prediction{i}': model.predict(eval_pool).tolist()
            for i, eval_pool in enumerate(eval_pools)
        }

        # Context manager guarantees the JSON file is flushed and closed.
        output_path = os.path.join(OUTPUT_DIR,
                                   'regression_with_eval_sets.json')
        with open(output_path, 'w') as output_file:
            json.dump(result, fp=output_file, allow_nan=True, indent=2)

    finally:
        os.remove(learn_set_path)
        # Plain loop: the removals are side effects, not a list to build.
        for eval_set_path in eval_sets_paths:
            os.remove(eval_set_path)
        os.remove(cd_path)
예제 #8
0
def shap_interaction_values():
    """Dump SHAP interaction values for regression, binary and multiclass models.

    For each problem type a model is trained for 20 iterations, saved as
    ``feature_importance_shap_interaction_values.problem_type=<type>.cbm`` in
    ``OUTPUT_DIR``, and ``cb.EFstrType.ShapInteractionValues`` is computed on
    the first five objects of the training pool for every ``shap_mode`` in
    ``Auto``, ``UsePreCalc``, ``NoPreCalc`` (with ``shap_calc_type='Regular'``).
    All results are keyed by
    ``problem_type=<..>,shap_mode=<..>,shap_calc_type=<..>`` and written to
    ``feature_importance_shap_interaction_values.json`` inside ``OUTPUT_DIR``.

    NOTE(review): ``utils.run_dist_train`` is defined elsewhere; it is assumed
    to return a fitted model of ``model_class``.
    """
    # (problem type, dataset dir name, cd file name, loss function, model class)
    # The learn set file is named "train_small" for every dataset used here.
    problem_configs = [
        ('Regression', 'higgs', "train.cd", 'RMSE', cb.CatBoostRegressor),
        ('BinClass', 'higgs', "train.cd", 'Logloss', cb.CatBoostClassifier),
        ('MultiClass', 'cloudness_small', "train_float.cd", 'MultiClass',
         cb.CatBoostClassifier),
    ]

    result = {}
    for (problem_type, dataset_name, cd_name, loss_function,
         model_class) in problem_configs:
        dataset_dir = os.path.join(CATBOOST_TEST_DATA_DIR, dataset_name)
        learn_set_path = os.path.join(dataset_dir, "train_small")
        cd_path = os.path.join(dataset_dir, cd_name)

        model = utils.run_dist_train(
            [
                '--iterations', '20',
                '--loss-function', loss_function,
                '--learn-set', learn_set_path,
                '--cd', cd_path,
            ],
            model_class=model_class)
        model.save_model(
            os.path.join(
                OUTPUT_DIR,
                "feature_importance_shap_interaction_values.problem_type=" +
                problem_type + ".cbm"))

        # Only the first five objects are used for the importance calculation.
        pool_for_feature_importance = cb.Pool(
            learn_set_path, column_description=cd_path).slice([0, 1, 2, 3, 4])

        for shap_mode in ['Auto', 'UsePreCalc', 'NoPreCalc']:
            for shap_calc_type in ['Regular']:
                result_name = ('problem_type=' + problem_type + ',shap_mode=' +
                               shap_mode + ',shap_calc_type=' + shap_calc_type)
                result[result_name] = model.get_feature_importance(
                    type=cb.EFstrType.ShapInteractionValues,
                    data=pool_for_feature_importance,
                    shap_mode=shap_mode,
                    shap_calc_type=shap_calc_type).tolist()

    # Context manager guarantees the JSON file is flushed and closed.
    output_path = os.path.join(
        OUTPUT_DIR, 'feature_importance_shap_interaction_values.json')
    with open(output_path, 'w') as output_file:
        json.dump(result, fp=output_file, allow_nan=True, indent=2)
예제 #9
0
def simple_binary_classification():
    """Train a Logloss classifier on a small dataset and dump its predictions.

    The temporary learn set has eight columns per row; the column description
    assigns column 3 to ``Target``, 4 to ``GroupId``, 5 to ``GroupWeight``,
    6 to ``SubgroupId`` and 7 to ``Weight``. The output JSON
    (``simple_binary_classification.json`` inside ``OUTPUT_DIR``) contains
    ``raw_prediction``, ``probability`` and ``prediction`` for the training
    pool. Temporary files are removed in all cases.

    NOTE(review): ``utils.run_dist_train`` / ``utils.object_list_to_tsv`` are
    defined elsewhere; their exact behaviour is assumed from their names.
    """
    # Close the descriptors returned by mkstemp; only the paths are needed and
    # an unclosed descriptor is a resource leak.
    learn_set_fd, learn_set_path = tempfile.mkstemp(
        prefix='catboost_learn_set_')
    os.close(learn_set_fd)
    cd_fd, cd_path = tempfile.mkstemp(prefix='catboost_cd_')
    os.close(cd_fd)

    try:
        utils.object_list_to_tsv(
            [
                (0.1, 0.2, 0.11, "0", "query0", 1.0, "site1", 0.12),
                (0.97, 0.82, 0.33, "0", "query0", 1.0, "site22", 0.18),
                (0.13, 0.22, 0.23, "1", "query1", 0.0, "Site9", 1.0),
                (0.14, 0.18, 0.1, "1", "Query 2", 0.5, "site12", 0.45),
                (0.9, 0.67, 0.17, "0", "Query 2", 0.5, "site22", 1.0),
                (0.66, 0.1, 0.31, "1", "Query 2", 0.5, "Site45", 2.0)
            ],
            learn_set_path
        )
        with open(cd_path, 'w') as cd:
            cd.write(
                "3\tTarget\n"
                + "4\tGroupId\n"
                + "5\tGroupWeight\n"
                + "6\tSubgroupId\n"
                + "7\tWeight\n"
            )

        model = utils.run_dist_train(
            [
                '--iterations', '20',
                '--loss-function', 'Logloss',
                '--learn-set', learn_set_path,
                '--cd', cd_path,
            ],
            model_class=cb.CatBoostClassifier
        )
        train_pool = cb.Pool(learn_set_path, column_description=cd_path)

        result = {}

        # Raw values become a column vector, then are expanded to two columns
        # holding (-raw / 2, raw / 2) per object.
        # NOTE(review): presumably this mirrors a two-class raw output layout
        # expected by the consumer of the canonical data -- confirm.
        raw_predictions = np.array(
            model.predict(train_pool, prediction_type='RawFormulaVal'),
            ndmin=2
        ).transpose()
        result['raw_prediction'] = np.hstack(
            (np.negative(raw_predictions / 2), raw_predictions / 2)
        ).tolist()
        result['probability'] = model.predict_proba(train_pool).tolist()
        result['prediction'] = model.predict(train_pool).tolist()

        # Context manager guarantees the JSON file is flushed and closed.
        output_path = os.path.join(OUTPUT_DIR,
                                   'simple_binary_classification.json')
        with open(output_path, 'w') as output_file:
            json.dump(result, fp=output_file, allow_nan=True, indent=2)

    finally:
        os.remove(learn_set_path)
        os.remove(cd_path)
def binary_classification_with_class_weights_map():
    """Train a Logloss classifier with ``--class-weights 1,2`` and dump predictions.

    Six rows of three float features plus an integer label are written to a
    temporary TSV file; the column description marks column 3 as ``Target``.
    The output JSON (``binary_classification_with_class_weights_map.json``
    inside ``OUTPUT_DIR``) contains ``raw_prediction``, ``probability`` and
    ``prediction`` for the training pool. Temporary files are removed in all
    cases.

    NOTE(review): ``utils.run_dist_train`` / ``utils.object_list_to_tsv`` are
    defined elsewhere; their exact behaviour is assumed from their names.
    """
    # Close the descriptors returned by mkstemp; only the paths are needed and
    # an unclosed descriptor is a resource leak.
    learn_set_fd, learn_set_path = tempfile.mkstemp(
        prefix='catboost_learn_set_')
    os.close(learn_set_fd)
    cd_fd, cd_path = tempfile.mkstemp(prefix='catboost_cd_')
    os.close(cd_fd)

    try:
        learn_rows = [
            (0.1, 0.2, 0.11, 0),
            (0.97, 0.82, 0.33, 1),
            (0.13, 0.22, 0.23, 1),
            (0.14, 0.18, 0.1, 0),
            (0.9, 0.67, 0.17, 0),
            (0.66, 0.1, 0.31, 0),
        ]
        utils.object_list_to_tsv(learn_rows, learn_set_path)
        with open(cd_path, 'w') as cd:
            cd.write('3\tTarget')

        model = utils.run_dist_train(
            [
                '--iterations', '20',
                '--class-weights', '1,2',
                '--loss-function', 'Logloss',
                '--learn-set', learn_set_path,
                '--cd', cd_path,
            ],
            model_class=cb.CatBoostClassifier)
        train_pool = cb.Pool(learn_set_path, column_description=cd_path)

        result = {}

        # Raw values become a column vector, then are expanded to two columns
        # holding (-raw / 2, raw / 2) per object.
        # NOTE(review): presumably this mirrors a two-class raw output layout
        # expected by the consumer of the canonical data -- confirm.
        raw_predictions = np.array(
            model.predict(train_pool, prediction_type='RawFormulaVal'),
            ndmin=2).transpose()
        result['raw_prediction'] = np.hstack(
            (np.negative(raw_predictions / 2), raw_predictions / 2)).tolist()
        result['probability'] = model.predict_proba(train_pool).tolist()
        result['prediction'] = model.predict(train_pool).tolist()

        # Context manager guarantees the JSON file is flushed and closed.
        output_path = os.path.join(
            OUTPUT_DIR, 'binary_classification_with_class_weights_map.json')
        with open(output_path, 'w') as output_file:
            json.dump(result, fp=output_file, allow_nan=True, indent=2)

    finally:
        os.remove(learn_set_path)
        os.remove(cd_path)
예제 #11
0
def num_and_one_hot_cat_features():
    """Train an RMSE model on numeric plus categorical features and dump predictions.

    The temporary learn set has eleven columns per row. The column description
    declares columns 0-3 as ``Num`` (f1..f4), 4-6 as ``Categ`` (c1..c3),
    7 as ``Target``, 8 as ``GroupId``, 9 as ``SubgroupId`` and 10 as
    ``Weight``. Training passes ``--one-hot-max-size 6``,
    ``--dev-efb-max-buckets 0`` and ``--has-time``. Predictions on the training
    pool are written to ``regression_num_and_one_hot_cat_features.json`` inside
    ``OUTPUT_DIR``. Temporary files are removed in all cases.

    NOTE(review): ``utils.run_dist_train`` / ``utils.object_list_to_tsv`` are
    defined elsewhere; their exact behaviour is assumed from their names.
    """
    # Close the descriptors returned by mkstemp; only the paths are needed and
    # an unclosed descriptor is a resource leak.
    learn_set_fd, learn_set_path = tempfile.mkstemp(
        prefix='catboost_learn_set_')
    os.close(learn_set_fd)
    cd_fd, cd_path = tempfile.mkstemp(prefix='catboost_cd_')
    os.close(cd_fd)

    try:
        utils.object_list_to_tsv([
            (0.13, 0.22, 0.23, 0.72, 0, 0, 0, "0.34", "query1", "Site9", 1.0),
            (0.1, 0.2, 0.11, -0.7, 1, 1, 0, "0.12", "query0", "site1", 0.12),
            (0.97, 0.82, 0.33, 0.18, 0, 2, 1, "0.22", "query0", "site22",
             0.18),
            (0.9, 0.67, 0.17, 0.0, 1, 2, 2, "0.01", "Query 2", "site22", 1.0),
            (0.66, 0.1, 0.31, -0.12, 0, 0, 3, "0.0", "Query 2", "Site45", 2.0),
            (0.14, 0.18, 0.1, 0.0, 0, 0, 4, "0.42", "Query 2", "site12", 0.45),
            (1.0, 0.88, 0.21, 0.0, 1, 3, 5, "0.1", "Query 3", "site1", 1.0)
        ], learn_set_path)
        with open(cd_path, 'w') as cd:
            cd.write("0\tNum\tf1\n" + "1\tNum\tf2\n" + "2\tNum\tf3\n" +
                     "3\tNum\tf4\n" + "4\tCateg\tc1\n" + "5\tCateg\tc2\n" +
                     "6\tCateg\tc3\n" + "7\tTarget\n" + "8\tGroupId\n" +
                     "9\tSubgroupId\n" + "10\tWeight\n")

        model = utils.run_dist_train(
            [
                '--iterations', '20',
                '--loss-function', 'RMSE',
                '--one-hot-max-size', '6',
                '--dev-efb-max-buckets', '0',
                '--has-time',
                '--learn-set', learn_set_path,
                '--cd', cd_path,
            ],
            model_class=cb.CatBoostRegressor)
        train_pool = cb.Pool(learn_set_path, column_description=cd_path)

        result = {'prediction': model.predict(train_pool).tolist()}

        # Context manager guarantees the JSON file is flushed and closed.
        output_path = os.path.join(
            OUTPUT_DIR, 'regression_num_and_one_hot_cat_features.json')
        with open(output_path, 'w') as output_file:
            json.dump(result, fp=output_file, allow_nan=True, indent=2)

    finally:
        os.remove(learn_set_path)
        os.remove(cd_path)
예제 #12
0
def loss_function_change():
    """Dump LossFunctionChange feature importances for each SHAP calc type.

    Trains a QueryRMSE model on the ``querywise`` dataset. For every
    ``calc_type`` in ``Regular``, ``Approximate``, ``Exact`` the plain result
    is stored under ``calc_type_<type>`` and the prettified result, converted
    to a list of ``{"featureName", "importance"}`` objects, under
    ``calc_type_<type>_prettified``. Output goes to
    ``feature_importance_loss_function_change.json`` inside ``OUTPUT_DIR``.

    NOTE(review): ``utils.run_dist_train`` is defined elsewhere; it is assumed
    to return a fitted model of ``model_class``.
    """
    dataset_dir = os.path.join(CATBOOST_TEST_DATA_DIR, 'querywise')
    learn_set_path = os.path.join(dataset_dir, "train")
    cd_path = os.path.join(dataset_dir, "train.cd")

    model = utils.run_dist_train(
        [
            '--iterations', '20',
            '--loss-function', 'QueryRMSE',
            '--learn-set', learn_set_path,
            '--cd', cd_path,
        ],
        model_class=cb.CatBoostRegressor)
    train_pool = cb.Pool(learn_set_path, column_description=cd_path)

    result = {}
    for calc_type in ['Regular', 'Approximate', 'Exact']:
        result['calc_type_' + calc_type] = model.get_feature_importance(
            type=cb.EFstrType.LossFunctionChange,
            data=train_pool,
            shap_calc_type=calc_type).tolist()

        prettified_result = model.get_feature_importance(
            type=cb.EFstrType.LossFunctionChange,
            data=train_pool,
            prettified=True,
            shap_calc_type=calc_type)

        # Rows are looked up by index label 0..n-1, exactly as before.
        # NOTE(review): assumes the prettified frame has a default integer
        # index.
        feature_ids = prettified_result['Feature Id']
        importances = prettified_result['Importances']
        result['calc_type_' + calc_type + '_prettified'] = [{
            "featureName": feature_ids[i],
            "importance": importances[i]
        } for i in range(len(prettified_result.index))]

    # Context manager guarantees the JSON file is flushed and closed.
    output_path = os.path.join(
        OUTPUT_DIR, 'feature_importance_loss_function_change.json')
    with open(output_path, 'w') as output_file:
        json.dump(result, fp=output_file, allow_nan=True, indent=2)
예제 #13
0
def num_and_one_hot_cat_features_with_eval_sets():
    """Train on numeric plus categorical features with two eval sets; dump predictions.

    The learn set and two eval sets are written to temporary TSV files with
    eleven columns per row. The column description declares columns 0-3 as
    ``Num`` (f1..f4), 4-6 as ``Categ`` (c1..c3), 7 as ``Target``, 8 as
    ``GroupId``, 9 as ``SubgroupId`` and 10 as ``Weight``. Training passes
    ``--one-hot-max-size 6``, ``--dev-efb-max-buckets 0`` and ``--has-time``.
    Predictions for each eval pool are stored under ``prediction0`` /
    ``prediction1`` and written to
    ``regression_num_and_one_hot_cat_features_with_eval_sets.json`` inside
    ``OUTPUT_DIR``. Temporary files are removed in all cases.

    NOTE(review): ``utils.run_dist_train`` / ``utils.object_list_to_tsv`` are
    defined elsewhere; their exact behaviour is assumed from their names.
    """
    # mkstemp returns an open descriptor plus the path; close each descriptor
    # immediately because only the paths are used (otherwise it leaks).
    learn_set_fd, learn_set_path = tempfile.mkstemp(
        prefix='catboost_learn_set_')
    os.close(learn_set_fd)

    eval_sets_paths = []
    for _ in range(2):
        eval_set_fd, eval_set_tmp_path = tempfile.mkstemp(
            prefix='catboost_eval_set_')
        os.close(eval_set_fd)
        eval_sets_paths.append(eval_set_tmp_path)

    cd_fd, cd_path = tempfile.mkstemp(prefix='catboost_cd_')
    os.close(cd_fd)

    try:
        utils.object_list_to_tsv([
            (0.13, 0.22, 0.23, 0.72, 0, 0, 0, "0.34", "query1", "Site9", 1.0),
            (0.1, 0.2, 0.11, -0.7, 1, 1, 0, "0.12", "query0", "site1", 0.12),
            (0.97, 0.82, 0.33, 0.18, 0, 2, 1, "0.22", "query0", "site22",
             0.18),
            (0.9, 0.67, 0.17, 0.0, 1, 2, 2, "0.01", "Query 2", "site22", 1.0),
            (0.66, 0.1, 0.31, -0.12, 0, 0, 3, "0.0", "Query 2", "Site45", 2.0),
            (0.14, 0.18, 0.1, 0.0, 0, 0, 4, "0.42", "Query 2", "site12", 0.45),
            (1.0, 0.88, 0.21, 0.0, 1, 3, 5, "0.1", "Query 3", "site1", 1.0)
        ], learn_set_path)
        utils.object_list_to_tsv([
            (0.0, 0.33, 1.1, 0.01, 0, 1, 2, "0.22", "query4", "site1", 0.1),
            (0.02, 0.0, 0.38, -0.3, 1, 2, 3, "0.11", "query5", "Site9", 1.0),
            (0.86, 0.54, 0.9, 0.0, 0, 2, 5, "0.48", "query5", "site22", 0.17)
        ], eval_sets_paths[0])
        utils.object_list_to_tsv([
            (0.12, 0.28, 2.2, -0.12, 1, 3, 3, "0.1", "query6", "site1", 0.11),
            (0.0, 0.0, 0.92, 0.0, 0, 3, 4, "0.9", "query6", "Site9", 1.1),
            (0.13, 2.1, 0.45, 1.0, 1, 2, 5, "0.88", "query6", "Site33", 1.2),
            (0.17, 0.11, 0.0, 2.11, 1, 0, 2, "0.0", "Query12", "site22", 1.0)
        ], eval_sets_paths[1])
        with open(cd_path, 'w') as cd:
            cd.write("0\tNum\tf1\n" + "1\tNum\tf2\n" + "2\tNum\tf3\n" +
                     "3\tNum\tf4\n" + "4\tCateg\tc1\n" + "5\tCateg\tc2\n" +
                     "6\tCateg\tc3\n" + "7\tTarget\n" + "8\tGroupId\n" +
                     "9\tSubgroupId\n" + "10\tWeight\n")

        model = utils.run_dist_train(
            [
                '--iterations', '20',
                '--loss-function', 'RMSE',
                '--one-hot-max-size', '6',
                '--dev-efb-max-buckets', '0',
                '--has-time',
                '--learn-set', learn_set_path,
                '--test-set', eval_sets_paths[0],
                '--test-set', eval_sets_paths[1],
                '--cd', cd_path,
            ],
            model_class=cb.CatBoostRegressor)
        eval_pools = [
            cb.Pool(eval_set_path, column_description=cd_path)
            for eval_set_path in eval_sets_paths
        ]

        result = {
            f'prediction{i}': model.predict(eval_pool).tolist()
            for i, eval_pool in enumerate(eval_pools)
        }

        # Context manager guarantees the JSON file is flushed and closed.
        output_path = os.path.join(
            OUTPUT_DIR,
            'regression_num_and_one_hot_cat_features_with_eval_sets.json')
        with open(output_path, 'w') as output_file:
            json.dump(result, fp=output_file, allow_nan=True, indent=2)

    finally:
        os.remove(learn_set_path)
        # Plain loop: the removals are side effects, not a list to build.
        for eval_set_path in eval_sets_paths:
            os.remove(eval_set_path)
        os.remove(cd_path)