def train_preprocessor(path='.', train='train.csv'):
    """Train the trash classifier and persist it next to its vectorizer.

    The CSV ``train`` inside ``path`` is read and must provide a ``text`` and
    a ``status`` column. A ``CountVectorizer`` is fitted on the training
    texts and a ``CatBoostClassifier`` is fitted on the resulting counts.

    Row split: everything except the last 100 rows is used for training, the
    rows ``[-100:-50]`` form the validation set, and the final 50 rows are
    not used by this function.

    Args:
        path: Directory containing the CSV; also receives the outputs
            ``trash_model`` and ``trash_vectorizer``.
        train: File name of the training CSV inside ``path``.
    """
    print('start train trash preprocessor...')
    df = pd.read_csv(os.path.join(path, train))

    train_data = df[:-100]
    validation_data = df[-100:-50]

    # The vocabulary is learned from the training rows only.
    vectorizer = CountVectorizer()
    x_train_counts = vectorizer.fit_transform(train_data.text)
    x_validation_counts = vectorizer.transform(validation_data.text)

    model = CatBoostClassifier(iterations=250,
                               train_dir=path,
                               logging_level='Silent',
                               allow_writing_files=False)

    # The sparse count matrices are densified before being handed to CatBoost.
    model.fit(X=x_train_counts.toarray(),
              y=train_data.status,
              eval_set=(x_validation_counts.toarray(),
                        validation_data.status),
              use_best_model=True)

    model.save_model(os.path.join(path, 'trash_model'))
    joblib.dump(vectorizer, os.path.join(path, 'trash_vectorizer'))
    # Fixed: the closing message used to announce the "sentiment"
    # preprocessor although this function trains the "trash" one.
    print('end train trash preprocessor...')
Exemplo n.º 2
0
def test_full_history():
    """Fit with ``od_type='Iter'`` and ``approx_on_full_history=True``.

    The fitted model is written to ``OUTPUT_MODEL_PATH`` and the result of
    ``compare_canonical_models`` for that path is returned.
    """
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    eval_pool = Pool(TEST_FILE, column_description=CD_FILE)
    settings = {
        'od_type': 'Iter',
        'od_wait': 20,
        'random_seed': 42,
        'approx_on_full_history': True,
    }
    classifier = CatBoostClassifier(**settings)
    classifier.fit(learn_pool, eval_set=eval_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 3
0
def test_zero_baseline():
    """Fit on a pool whose baseline is a 1-D vector of zeros, one per row.

    Saves the model to ``OUTPUT_MODEL_PATH`` and returns the result of
    ``compare_canonical_models`` for it.
    """
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    learn_pool.set_baseline(np.zeros(learn_pool.num_row()))

    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(learn_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 4
0
def test_predict_sklearn_class():
    """Fit a classifier with ``loss_function='Logloss:border=0.5'``.

    Saves the model to ``OUTPUT_MODEL_PATH`` and returns the result of
    ``compare_canonical_models`` for it.
    """
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    settings = dict(iterations=2,
                    random_seed=0,
                    loss_function='Logloss:border=0.5')
    classifier = CatBoostClassifier(**settings)
    classifier.fit(learn_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 5
0
def test_classification_ctr():
    """Fit a classifier with ``ctr_description=['Borders', 'Counter']``.

    Saves the model to ``OUTPUT_MODEL_PATH`` and returns the result of
    ``compare_canonical_models`` for it.
    """
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    ctr_kinds = ['Borders', 'Counter']
    classifier = CatBoostClassifier(iterations=5,
                                    random_seed=0,
                                    ctr_description=ctr_kinds)
    classifier.fit(learn_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 6
0
def test_zero_baseline():
    """Fit on a pool whose baseline is an all-zero ``(num_row, 2)`` array.

    Saves the model to ``OUTPUT_MODEL_PATH`` and returns the result of
    ``compare_canonical_models`` for it.
    """
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    zero_baseline = np.zeros((learn_pool.num_row(), 2))
    learn_pool.set_baseline(zero_baseline)

    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(learn_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 7
0
def test_non_ones_weight():
    """Fit on a pool whose row weights are ``1, 2, ..., num_row``.

    Saves the model to ``OUTPUT_MODEL_PATH`` and returns the result of
    ``compare_canonical_models`` for it.
    """
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    row_count = learn_pool.num_row()
    learn_pool.set_weight(np.arange(1, row_count + 1))

    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(learn_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 8
0
def test_class_weights():
    """Fit a classifier configured with ``class_weights=[1, 2]``.

    Saves the model to ``OUTPUT_MODEL_PATH`` and returns the result of
    ``compare_canonical_models`` for it.
    """
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    settings = {'iterations': 5, 'random_seed': 0, 'class_weights': [1, 2]}
    classifier = CatBoostClassifier(**settings)
    classifier.fit(learn_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 9
0
def test_zero_baseline():
    """Fit on a pool rebuilt from features and labels plus a zero baseline.

    The baseline is an all-zero ``(num_row, 2)`` array passed to the ``Pool``
    constructor. The model is saved to ``OUTPUT_MODEL_PATH`` and the result
    of ``local_canonical_file`` for that path is returned.
    """
    source_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    zero_baseline = np.zeros((source_pool.num_row(), 2))
    learn_pool = Pool(source_pool.get_features(),
                      source_pool.get_label(),
                      baseline=zero_baseline)

    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(learn_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
Exemplo n.º 10
0
def test_non_ones_weight():
    """Fit on a pool weighted with the increasing sequence ``1..num_row``.

    Saves the model to ``OUTPUT_MODEL_PATH`` and returns the result of
    ``compare_canonical_models`` for it.
    """
    weighted_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    increasing_weights = np.arange(1, weighted_pool.num_row() + 1)
    weighted_pool.set_weight(increasing_weights)

    settings = dict(iterations=2, random_seed=0)
    classifier = CatBoostClassifier(**settings)
    classifier.fit(weighted_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 11
0
def test_ones_weight():
    """Fit on a pool rebuilt with an explicit all-ones weight vector.

    The model is saved to ``OUTPUT_MODEL_PATH`` and the result of
    ``local_canonical_file`` for that path is returned.
    """
    source_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    features = source_pool.get_features()
    labels = source_pool.get_label()
    unit_weights = np.ones(source_pool.num_row())
    weighted_pool = Pool(features, labels, weight=unit_weights)

    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(weighted_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
Exemplo n.º 12
0
def save_catboost_model(catboost_model: CatBoostClassifier,
                        model_name: str,
                        pool_data: Pool) -> None:
    """Write ``catboost_model`` to ``PATH_MODELS / model_name``.

    Args:
        catboost_model: The model to save.
        model_name: File name to use under ``PATH_MODELS``.
        pool_data: ``Pool`` forwarded to ``save_model`` as its ``pool``
            argument; per the original description it holds the features,
            labels and categorical features the model was fitted on.
    """
    destination = str(PATH_MODELS / model_name)
    catboost_model.save_model(destination, pool=pool_data)
Exemplo n.º 13
0
def test_priors():
    """Fit with ``has_time=True`` and ``priors=[0, 0.6, 1, 5]``.

    Saves the model to ``OUTPUT_MODEL_PATH`` and returns the result of
    ``compare_canonical_models`` for it.
    """
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    settings = {
        'iterations': 5,
        'random_seed': 0,
        'has_time': True,
        'priors': [0, 0.6, 1, 5],
    }
    classifier = CatBoostClassifier(**settings)
    classifier.fit(learn_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 14
0
def test_full_history():
    """Fit with ``approx_on_full_history=True`` and an eval set.

    Uses ``od_type='Iter'`` with ``od_wait=20``. The model is saved to
    ``OUTPUT_MODEL_PATH`` and the result of ``compare_canonical_models`` for
    that path is returned.
    """
    fit_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    holdout_pool = Pool(TEST_FILE, column_description=CD_FILE)

    clf = CatBoostClassifier(od_type='Iter', od_wait=20, random_seed=42,
                             approx_on_full_history=True)
    clf.fit(fit_pool, eval_set=holdout_pool)
    clf.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 15
0
def test_multiclass():
    """Round-trip a MultiClass model through save/load and dump its probabilities.

    A classifier is fitted and saved to ``OUTPUT_MODEL_PATH``. A fresh
    classifier loads that file, and its ``predict_proba`` output on the
    training pool is stored with ``np.save`` at ``PREDS_PATH``. Returns the
    result of ``local_canonical_file(PREDS_PATH)``.
    """
    cloud_pool = Pool(CLOUDNESS_TRAIN_FILE,
                      column_description=CLOUDNESS_CD_FILE)
    trained = CatBoostClassifier(iterations=2,
                                 random_seed=0,
                                 loss_function='MultiClass',
                                 thread_count=8)
    trained.fit(cloud_pool)
    trained.save_model(OUTPUT_MODEL_PATH)

    restored = CatBoostClassifier()
    restored.load_model(OUTPUT_MODEL_PATH)
    probabilities = np.array(restored.predict_proba(cloud_pool))
    np.save(PREDS_PATH, probabilities)
    return local_canonical_file(PREDS_PATH)
Exemplo n.º 16
0
def test_serialization_of_numpy_objects_save_model():
    """Save a model as CoreML when parameters are NumPy scalars.

    ``iterations`` and ``random_seed`` are given as ``np.int64`` /
    ``np.int32``, and ``export_parameters`` comes from
    ``get_values_that_json_dumps_breaks_on()``. The test has no explicit
    assertion; it passes when nothing raises.
    """
    features_and_labels = random_xy(10, 5)
    learn_pool = Pool(*features_and_labels)
    classifier = CatBoostClassifier(iterations=np.int64(2),
                                    random_seed=np.int32(0),
                                    loss_function='Logloss')
    classifier.fit(learn_pool)
    export_parameters = get_values_that_json_dumps_breaks_on()
    classifier.save_model(OUTPUT_MODEL_PATH,
                          format='coreml',
                          export_parameters=export_parameters)
Exemplo n.º 17
0
def test_non_zero_bazeline():
    """Fit a second model using the first model's predictions as baseline.

    A MultiClass base model is fitted on the cloudness pool. Its ``predict``
    output becomes the ``baseline`` of a new pool built from the same
    features and labels, on which a default-loss classifier is fitted. The
    model is saved to ``OUTPUT_MODEL_PATH`` and the result of
    ``local_canonical_file`` for that path is returned.
    """
    source_pool = Pool(CLOUDNESS_TRAIN_FILE,
                       column_description=CLOUDNESS_CD_FILE)
    first_stage = CatBoostClassifier(iterations=2,
                                     random_seed=0,
                                     loss_function="MultiClass")
    first_stage.fit(source_pool)

    # NOTE(review): predict() is called with its default prediction type;
    # confirm that this output is what the baseline is meant to carry.
    first_stage_output = np.array(first_stage.predict(source_pool))
    baseline_pool = Pool(source_pool.get_features(),
                         source_pool.get_label(),
                         baseline=first_stage_output)

    second_stage = CatBoostClassifier(iterations=2, random_seed=0)
    second_stage.fit(baseline_pool)
    second_stage.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
Exemplo n.º 18
0
def fit_chain():
    """Fit a depth-3, 100-iteration CPU classifier on the chains data.

    Reads ``.././chains.df`` with column description ``.././chains.cd`` and
    writes the fitted model to ``chain.model`` in the working directory.
    """
    learn_pool = Pool('.././chains.df', column_description='.././chains.cd')
    settings = {
        'depth': 3,
        'iterations': 100,
        'eval_metric': 'F1',
        'task_type': 'CPU',
    }
    classifier = CatBoostClassifier(**settings)
    classifier.fit(learn_pool)
    classifier.save_model('chain.model')
Exemplo n.º 19
0
def test_multiclass():
    """Check MultiClass probabilities after a save/load round trip.

    Fits and saves a classifier, reloads it into a new instance, and stores
    the reloaded model's ``predict_proba`` output at ``PREDS_PATH``. Returns
    the result of ``local_canonical_file(PREDS_PATH)``.
    """
    learn_pool = Pool(CLOUDNESS_TRAIN_FILE,
                      column_description=CLOUDNESS_CD_FILE)
    settings = dict(iterations=2,
                    random_seed=0,
                    loss_function='MultiClass',
                    thread_count=8)
    fitted = CatBoostClassifier(**settings)
    fitted.fit(learn_pool)
    fitted.save_model(OUTPUT_MODEL_PATH)

    reloaded = CatBoostClassifier()
    reloaded.load_model(OUTPUT_MODEL_PATH)
    proba = reloaded.predict_proba(learn_pool)
    np.save(PREDS_PATH, np.array(proba))
    return local_canonical_file(PREDS_PATH)
Exemplo n.º 20
0
def run_ex_gal(exp_name, data_type):
    """Fit and save a MultiClass CatBoost model on masked plus pseudo-labelled rows.

    Training rows are those selected by ``distmod_mask`` together with the
    pseudo rows returned by ``load_data_ex_gal``. Every pseudo row is given
    the (remapped) label of class 90. Labels present under the mask are
    remapped to a dense ``0..k-1`` range. Samples of the class-90 label are
    weighted by ``orig_size / after_size`` so that their total weight equals
    the class count before the pseudo rows were added.

    The iteration count is read from ``../fi/<exp_name>_rounds.pkl`` keyed by
    ``data_type``. The model is written to
    ``../models/<exp_name>_<data_type>.cbm``.
    """
    # Loaded exactly as before; the frame itself is not used below.
    tes_m = feather.read_dataframe('../others/tes_m.feather')
    encoder = load_pickle('../others/label_encoder.pkl')
    targets = encoder.transform(np.load('../others/train_target.npy'))
    distmod_mask = np.load('../others/distmod_mask.npy')
    raw_weights = np.load('../others/W.npy')
    pseudo_idx = np.load('../others/pseudo_idx.npy')
    class_names = [99, 95, 92, 90, 88, 67, 65, 64, 62, 53, 52, 42, 16, 15, 6]
    obj_class = 90

    # Per-encoded-label weights; the first class name / weight is skipped.
    weight_by_label = np.zeros(14)
    weight_by_label[encoder.transform(class_names[1:])] = raw_weights[1:]
    real_weight = get_real_weight(targets, weight_by_label)

    # Dense re-indexing of the labels that occur under the mask.
    masked_targets = targets[distmod_mask]
    ex_gal_labels = np.where(np.bincount(masked_targets) != 0)[0]
    ex_gal_label_map = np.zeros(np.max(ex_gal_labels) + 1, dtype=np.int32)
    ex_gal_label_map[ex_gal_labels] = np.arange(ex_gal_labels.shape[0])

    X, X_pseudo = load_data_ex_gal(exp_name, data_type, pseudo_idx)
    obj_label = ex_gal_label_map[encoder.transform([obj_class])][0]
    y_pseudo = np.full(pseudo_idx.sum(), obj_label)

    selected_weights = real_weight[ex_gal_labels]
    params = dict(
        iterations=10000,
        learning_rate=0.1,
        depth=3,
        loss_function='MultiClass',
        colsample_bylevel=0.7,
        random_seed=0,
        class_weights=selected_weights / selected_weights.sum(),
    )
    # The placeholder iteration count is replaced by the stored value.
    rounds_by_type = load_pickle('../fi/' + exp_name + '_rounds.pkl')
    params['iterations'] = rounds_by_type[data_type]
    print('iteration: ' + str(params['iterations']))

    mapped_targets = ex_gal_label_map[masked_targets]
    orig_size = np.bincount(mapped_targets)[obj_label]
    whole_data = np.concatenate((X[distmod_mask], X_pseudo), axis=0)
    whole_labels = np.concatenate((mapped_targets, y_pseudo), axis=0)
    after_size = np.bincount(whole_labels)[obj_label]

    sample_weight = np.ones(whole_labels.shape[0])
    sample_weight[whole_labels == obj_label] = orig_size / after_size

    model = CatBoostClassifier(**params)
    model.fit(whole_data, whole_labels, sample_weight=sample_weight)
    model.save_model('../models/' + exp_name + '_' + data_type + '.cbm')
Exemplo n.º 21
0
def train(odir, trname, tsname, split_col, rs, train_size):
    """Train gradient boosting model.

    Fits a ``CatBoostClassifier`` on a stratified train/validation split of
    the training CSV, then writes three files into ``odir``: the model
    (``cb-model.cbm``), the validation metrics (``cb-metrics.json``) and
    predictions for the test CSV (``cb-submission.csv``).

    Args:
        odir: Output directory; created if it does not exist.
        trname: Path to the training CSV. It must contain ``PassengerId``
            (used as index), ``Survived`` (the label), ``Age`` and ``Fare``.
        tsname: Path to the test CSV, indexed by ``PassengerId``.
        split_col: Column of the training data used to stratify the split.
        rs: Random state for both the split and the model.
        train_size: ``train_size`` passed to ``StratifiedShuffleSplit``.
    """
    train_df = pd.read_csv(trname, index_col="PassengerId")
    test_df = pd.read_csv(tsname, index_col="PassengerId")

    fts_cols = train_df.columns.drop("Survived")

    # Creating training/validation split
    cv = StratifiedShuffleSplit(n_splits=1,
                                train_size=train_size,
                                random_state=rs)
    tridx, cvidx = next(cv.split(train_df, train_df[split_col]))

    # Fill missing values with means taken from the training fold only, so
    # validation and test rows do not influence the imputation. Selecting
    # the two columns before calling mean() avoids averaging unrelated
    # (possibly non-numeric) columns.
    fill_values = train_df.iloc[tridx][["Age", "Fare"]].mean()
    train_df.fillna(fill_values, inplace=True)
    test_df.fillna(fill_values, inplace=True)

    fit_part = train_df.iloc[tridx]
    valid_part = train_df.iloc[cvidx]

    # Creating the model
    model = CatBoostClassifier(iterations=500,
                               depth=4, rsm=0.75,
                               learning_rate=0.001,
                               early_stopping_rounds=250,
                               random_state=rs,
                               use_best_model=True)
    model.fit(fit_part[fts_cols],
              fit_part["Survived"],
              eval_set=(valid_part[fts_cols], valid_part["Survived"]),
              verbose=50)

    # Measuring performance. The label is addressed by name rather than by
    # column position so a reordered CSV cannot silently score the wrong
    # column.
    cv_predictions = model.predict_proba(valid_part[fts_cols])[:, 1]
    auc = roc_auc_score(valid_part["Survived"], cv_predictions)
    acc = accuracy_score(valid_part["Survived"], cv_predictions > 0.5)

    # Saving the model, metrics file and submission
    odir = pathlib.Path(odir)
    odir.mkdir(parents=True, exist_ok=True)

    model.save_model(odir.joinpath("cb-model.cbm").as_posix())

    with open(odir.joinpath("cb-metrics.json"), "w") as metrics_file:
        metrics_file.write(json.dumps({"AUC": auc, "Accuracy": acc}))

    # Predict on the same feature columns, in the same order, as used for
    # fitting. ``np.int`` was removed from NumPy; the builtin ``int`` is the
    # documented replacement.
    submission = pd.Series(model.predict(test_df[fts_cols]),
                           name="Survived",
                           index=test_df.index,
                           dtype=int)

    submission.to_csv(odir.joinpath("cb-submission.csv"), header=True)
Exemplo n.º 22
0
 def train_catboost(self):
     """Fit a depth-4 MultiClass CatBoost model and save it.

     The training input is produced by ``self.to_ml_input`` from
     ``self.train_pool.pool``. The iteration count comes from
     ``self.args.iter_count`` and the model is saved to
     ``self.args.model_path``.
     """
     train_input = self.to_ml_input(self.train_pool.pool, "train")
     iteration_count = self.args.iter_count
     self.logger.info("train_catboost iterations count={}".format(iteration_count))
     settings = {
         'iterations': iteration_count,
         'depth': 4,
         'logging_level': "Debug",
         'loss_function': 'MultiClass',
     }
     classifier = CatBoostClassifier(**settings)
     classifier.fit(train_input)
     classifier.save_model(self.args.model_path)
Exemplo n.º 23
0
def test_od():
    """Fit with ``od_type='Iter'`` / ``od_wait=20`` against an eval set.

    Saves the model to ``OUTPUT_MODEL_PATH`` and returns the result of
    ``compare_canonical_models`` for it.
    """
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    eval_pool = Pool(TEST_FILE, column_description=CD_FILE)
    settings = {
        'iterations': 1000,
        'learning_rate': 0.03,
        'od_type': 'Iter',
        'od_wait': 20,
        'random_seed': 42,
    }
    classifier = CatBoostClassifier(**settings)
    classifier.fit(learn_pool, eval_set=eval_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 24
0
def test_ignored_features():
    """Check that ``ignored_features=[1, 2, 3]`` changes the predictions.

    Two classifiers that differ only in ``ignored_features`` are fitted on
    the same pool. The test asserts that ``_check_data`` is falsy for their
    test-pool predictions. The restricted model is then saved to
    ``OUTPUT_MODEL_PATH`` and the result of ``compare_canonical_models`` for
    that path is returned.
    """
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    eval_pool = Pool(TEST_FILE, column_description=CD_FILE)

    restricted = CatBoostClassifier(iterations=5,
                                    random_seed=0,
                                    ignored_features=[1, 2, 3])
    unrestricted = CatBoostClassifier(iterations=5, random_seed=0)
    restricted.fit(learn_pool)
    unrestricted.fit(learn_pool)

    restricted_preds = restricted.predict(eval_pool)
    unrestricted_preds = unrestricted.predict(eval_pool)
    assert not _check_data(restricted_preds, unrestricted_preds)

    restricted.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 25
0
def test_ignored_features():
    """Compare a model with ignored features against an unrestricted one.

    Both models use ``iterations=5`` and ``random_seed=0``; only the first
    sets ``ignored_features=[1, 2, 3]``. Asserts that ``_check_data`` is
    falsy for the two prediction sets, saves the first model to
    ``OUTPUT_MODEL_PATH`` and returns the result of
    ``compare_canonical_models`` for it.
    """
    fit_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    holdout_pool = Pool(TEST_FILE, column_description=CD_FILE)
    shared = {'iterations': 5, 'random_seed': 0}

    with_ignored = CatBoostClassifier(ignored_features=[1, 2, 3], **shared)
    without_ignored = CatBoostClassifier(**shared)
    with_ignored.fit(fit_pool)
    without_ignored.fit(fit_pool)

    preds_with_ignored = with_ignored.predict(holdout_pool)
    preds_without_ignored = without_ignored.predict(holdout_pool)
    assert not _check_data(preds_with_ignored, preds_without_ignored)

    with_ignored.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 26
0
def test_priors():
    """Fit with priors given inside the ``ctr_description`` strings.

    Uses ``has_time=True`` and the same four ``Prior=`` values for both the
    ``Borders`` and the ``Counter`` description. The model is saved to
    ``OUTPUT_MODEL_PATH`` and the result of ``compare_canonical_models`` for
    that path is returned.
    """
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    ctr_specs = [
        "Borders:Prior=0:Prior=0.6:Prior=1:Prior=5",
        "Counter:Prior=0:Prior=0.6:Prior=1:Prior=5",
    ]
    classifier = CatBoostClassifier(iterations=5,
                                    random_seed=0,
                                    has_time=True,
                                    ctr_description=ctr_specs)
    classifier.fit(learn_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 27
0
def main() -> None:
    """Train a CatBoost model predicting ``CT-Win`` from the per-game CSVs.

    Every ``../data/*/*.csv`` file (``;``-separated) is read, tagged with its
    path in a ``game_id`` column, and concatenated. Object-typed columns are
    converted to integer category codes. For every such column except
    ``game_id`` the code-to-category mapping is stored with
    ``save_column_mapping``. Rows with ``CT-Win == -1`` and rows with missing
    values are dropped. The feature column order is stored under
    ``'order.json'`` and the fitted model is saved to
    ``./<MODEL_FOLDER>/<MODEL_NAME>``.
    """
    gameid_column = 'game_id'
    target_column = 'CT-Win'

    all_frames = []
    for csv_filepath in glob.glob('../data/*/*.csv'):
        game_frame = pd.read_csv(csv_filepath, index_col=False, sep=';')
        game_frame.insert(0, gameid_column, csv_filepath)
        all_frames.append(game_frame)

    # pd.concat already returns a new object, so no extra copy is needed.
    frame = pd.concat(all_frames, axis=0, ignore_index=True)

    for col in frame.columns[frame.dtypes.eq('object')]:
        frame[col] = frame[col].astype('category')
        # Compare by value: ``is not`` tested object identity, which is not
        # a reliable way to recognise the game-id column.
        if col != gameid_column:
            column_mapping = dict(enumerate(frame[col].cat.categories))
            save_column_mapping(col, column_mapping)
        frame[col] = frame[col].cat.codes

    # Drop rows marked as defective (target == -1).
    frame.drop(frame.index[frame[target_column] == -1], inplace=True)
    # The previous ``frame.dropna(axis='rows')`` discarded its result and
    # therefore removed nothing; drop in place so rows with missing values
    # really are excluded before the integer cast below.
    frame.dropna(axis='rows', inplace=True)

    y = frame[target_column].astype(int)
    X = frame.drop([target_column, gameid_column], axis=1)

    # Record positional index -> feature name for the fitted model.
    save_column_mapping('order.json', dict(enumerate(X.columns)))

    model = CatBoostClassifier(
        loss_function='Logloss',
        eval_metric="AUC",
        border_count=CTB_MODEL_PARAMETERS['border_count'],
        thread_count=CTB_MODEL_PARAMETERS['thread_count'],
        random_seed=CTB_MODEL_PARAMETERS['random_seed'],
        depth=CTB_MODEL_PARAMETERS['depth'],
        od_wait=CTB_MODEL_PARAMETERS['od_wait'],
        l2_leaf_reg=CTB_MODEL_PARAMETERS['l2_leaf_reg'],
        iterations=CTB_MODEL_PARAMETERS['iterations'],
        learning_rate=CTB_MODEL_PARAMETERS['learning_rate'],
        od_type='Iter')
    model.fit(Pool(X, y), verbose=False)

    os.makedirs('./' + MODEL_FOLDER, exist_ok=True)
    model.save_model('./' + MODEL_FOLDER + '/' + MODEL_NAME)
Exemplo n.º 28
0
def test_fit_data():
    """Fit from raw data with weights, baseline and an eval pool.

    A MultiClass base model supplies ``RawFormulaVal`` predictions that are
    used as the baseline for both the training data and the eval pool. The
    second model is fitted on the mapped feature matrix with row weights
    ``1..num_row`` and ``use_best_model=True``. It is saved to
    ``OUTPUT_MODEL_PATH`` and the result of ``compare_canonical_models`` for
    that path is returned.
    """
    learn_pool = Pool(CLOUDNESS_TRAIN_FILE,
                      column_description=CLOUDNESS_CD_FILE)
    eval_pool = Pool(CLOUDNESS_TEST_FILE,
                     column_description=CLOUDNESS_CD_FILE)

    base_model = CatBoostClassifier(iterations=2,
                                    random_seed=0,
                                    loss_function="MultiClass")
    base_model.fit(learn_pool)
    learn_baseline = np.array(
        base_model.predict(learn_pool, prediction_type='RawFormulaVal'))
    eval_baseline = np.array(
        base_model.predict(eval_pool, prediction_type='RawFormulaVal'))
    eval_pool.set_baseline(eval_baseline)

    model = CatBoostClassifier(iterations=2,
                               random_seed=0,
                               loss_function="MultiClass")
    features = map_cat_features(learn_pool.get_features(),
                                learn_pool.get_cat_feature_indices())
    labels = learn_pool.get_label()
    cat_indices = learn_pool.get_cat_feature_indices()
    row_weights = np.arange(1, learn_pool.num_row()+1)
    model.fit(features,
              labels,
              cat_indices,
              sample_weight=row_weights,
              baseline=learn_baseline,
              use_best_model=True,
              eval_set=eval_pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 29
0
def test_fit_data():
    """Exercise ``fit`` with data, labels, cat features, weights and baseline.

    Raw predictions (``RawFormulaVal``) of a two-iteration MultiClass base
    model serve as baseline for the training rows and, via ``set_baseline``,
    for the eval pool. The fitted model is saved to ``OUTPUT_MODEL_PATH`` and
    the result of ``compare_canonical_models`` for that path is returned.
    """
    shared = dict(iterations=2, random_seed=0, loss_function="MultiClass")

    train_pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    test_pool = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)

    seed_model = CatBoostClassifier(**shared)
    seed_model.fit(train_pool)
    train_raw = seed_model.predict(train_pool, prediction_type='RawFormulaVal')
    train_baseline = np.array(train_raw)
    test_raw = seed_model.predict(test_pool, prediction_type='RawFormulaVal')
    test_pool.set_baseline(np.array(test_raw))

    final_model = CatBoostClassifier(**shared)
    mapped = map_cat_features(train_pool.get_features(),
                              train_pool.get_cat_feature_indices())
    final_model.fit(
        mapped,
        train_pool.get_label(),
        train_pool.get_cat_feature_indices(),
        sample_weight=np.arange(1, train_pool.num_row()+1),
        baseline=train_baseline,
        use_best_model=True,
        eval_set=test_pool,
    )
    final_model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 30
0
def test_non_zero_bazeline():
    """Fit on a pool whose baseline is a base model's raw predictions.

    A MultiClass base model is fitted first. Its ``RawFormulaVal``
    predictions are attached to the same pool with ``set_baseline`` before a
    second, default-loss classifier is fitted on it. The model is saved to
    ``OUTPUT_MODEL_PATH`` and the result of ``compare_canonical_models`` for
    that path is returned.
    """
    learn_pool = Pool(CLOUDNESS_TRAIN_FILE,
                      column_description=CLOUDNESS_CD_FILE)
    first_stage = CatBoostClassifier(iterations=2, random_seed=0,
                                     loss_function="MultiClass")
    first_stage.fit(learn_pool)

    raw_scores = first_stage.predict(learn_pool,
                                     prediction_type='RawFormulaVal')
    learn_pool.set_baseline(np.array(raw_scores))

    second_stage = CatBoostClassifier(iterations=2, random_seed=0)
    second_stage.fit(learn_pool)
    second_stage.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 31
0
class CatboostEnsemble(Ensemble):
    """Ensemble backed by a ``CatBoostClassifier``.

    After fitting, the classifier is exported as JSON and each entry of its
    ``oblivious_trees`` list is parsed into a ``CatboostTree``. Predictions
    are computed from those parsed trees, not by calling the classifier.
    """

    def __init__(self, params: dict, dataset: Dataset = None):
        """Create the classifier from ``params`` and set the JSON export path."""
        super().__init__(params, dataset, name='CatboostEnsemble')
        self.clf = CatBoostClassifier(**params)
        # Fixed location used for the JSON export during ``fit``.
        self.tmp_json_path = '/tmp/catboost.model.json'

    def fit(self, dataset: Dataset):
        """Fit the classifier on ``dataset`` and parse its trees.

        The loss is ``'MultiClass'`` when the dataset reports more than two
        classes and ``'Logloss'`` otherwise.
        """
        self.set_dataset(dataset)

        if self.dataset.num_classes() > 2:
            chosen_loss = 'MultiClass'
        else:
            chosen_loss = 'Logloss'
        self.clf.set_params(loss_function=chosen_loss, verbose=False)

        self.clf.fit(self.dataset.X, self.dataset.y)

        # Round-trip through the JSON export to obtain the tree structure.
        self.clf.save_model(self.tmp_json_path, format='json')
        with open(self.tmp_json_path, 'r') as json_file:
            exported = json.load(json_file)

        parsed_trees = []
        for tree_spec in exported['oblivious_trees']:
            parsed_trees.append(CatboostTree.parse(tree_spec, self.dataset))
        self.trees = parsed_trees

    def predict_proba(self, dataset: Dataset) -> np.ndarray:
        """Return ``[1 - p, p]`` rows for a binary model.

        ``p`` is ``expit`` of the per-tree predictions summed over trees.

        Raises:
            ValueError: if there are no parsed trees.
            NotImplementedError: if the classifier has more than two classes.
        """
        if len(self.trees) == 0:
            raise ValueError('There are no trees available')

        encoded = self.encode_dataset(dataset)

        class_count = len(self.clf.classes_)  # pylint: disable=no-member

        # TODO: For single tree this is just [tree.predict(...)]
        per_tree = np.array([tree.predict(encoded.X) for tree in self.trees])
        summed = np.sum(per_tree, axis=0)

        if class_count > 2:
            # https://catboost.ai/docs/concepts/loss-functions-multiclassification.html
            # Link above suggests different equation for this
            # results_proba = softmax(preds, axis=1)
            raise NotImplementedError('Only binary problems are implemented.')

        positive = expit(summed)
        return np.array([[1 - value, value] for value in positive])

    def predict(self, dataset: Dataset) -> np.ndarray:
        """Return, per row, the index of the larger ``predict_proba`` entry."""
        return np.argmax(self.predict_proba(dataset), axis=1)
def _test_influence_vs_tf_derivative(leaf_method):
    """Compare ``CBLeafInfluenceEnsemble`` against ``TFGBApplier`` and CatBoost.

    A two-iteration ``Plain`` CatBoost model is fitted on the first 100
    training documents of ``data/adult/`` and exported to JSON. For 30
    randomly drawn removal indices the test asserts that:

    * the ensemble's predictions are close to both the TF checker's and
      CatBoost's raw predictions (``rtol=1e-3``);
    * the leaf values of the retrained model's influence trees are close to
      the TF checker's derivatives (``rtol=1e-2``);
    * the loss derivative at one random training point is close to the TF
      checker's value (``rtol=1e-2``).

    Args:
        leaf_method: Used as CatBoost's ``leaf_estimation_method`` and passed
            on to the ensemble and the TF checker.
    """
    base_dir = 'data/adult/'

    train_documents, train_targets = read_train_documents_and_one_hot_targets(
        base_dir + 'train_data_catboost_format.tsv'
    )
    train_documents = train_documents[:100]
    # One-hot rows -> class indices, for the first 100 rows only.
    train_targets = np.argmax(train_targets[:100], axis=1)

    test_documents, test_targets = read_train_documents_and_one_hot_targets(
        base_dir + 'test_data_catboost_format.tsv'
    )
    test_targets = np.argmax(test_targets, axis=1)

    train_dir = base_dir + 'ut_tmp/'
    if not isdir(train_dir):
        mkdir(train_dir)

    cbc_params = read_json_params(base_dir + 'catboost_params.json')
    cbc_params.update({
        'iterations': 2,
        'leaf_estimation_method': leaf_method,
        'random_seed': 10,
        'train_dir': train_dir,
    })
    cbc = CatBoostClassifier(**cbc_params)
    cbc.set_params(boosting_type='Plain')
    cbc.fit(train_documents, train_targets)

    model_bin_path = train_dir + 'model.bin'
    model_json_path = train_dir + 'model.json'
    cbc.save_model(model_bin_path, format='cbm')
    export_catboost_to_json(model_bin_path, model_json_path)

    full_model = CBLeafInfluenceEnsemble(
        model_json_path, train_documents, train_targets,
        leaf_method=leaf_method,
        learning_rate=cbc_params['learning_rate'],
        loss_function=BinaryCrossEntropyLoss(),
        update_set='AllPoints',
    )
    retrained_model_our = deepcopy(full_model)
    tf_checker = TFGBApplier(full_model, train_documents, train_targets, leaf_method)

    removal_indices = np.random.randint(len(train_targets), size=30)
    for remove_idx in removal_indices:
        full_model.fit(remove_idx, retrained_model_our)

        pred_ours = full_model(train_documents)
        pred_theirs = tf_checker.get_predicts()
        pred_cbc = cbc.predict(train_documents, prediction_type='RawFormulaVal')
        assert np.allclose(pred_ours, pred_theirs, rtol=1e-3) and np.allclose(pred_ours, pred_cbc, rtol=1e-3), (pred_ours, pred_theirs)

        der_ours = [tree.leaf_values for tree in retrained_model_our.influence_trees]
        der_theirs = tf_checker.get_derivs(remove_idx)
        assert all(np.allclose(ours, theirs, rtol=1e-2) for ours, theirs in zip(der_ours, der_theirs)), (der_ours, der_theirs)

        # Spot-check the loss derivative on one random training point.
        random_train_idx = np.random.randint(len(train_targets))
        single_document = train_documents[[random_train_idx]]
        single_target = train_targets[[random_train_idx]]
        der_pred_ours = retrained_model_our.loss_derivative(single_document, single_target)[0]
        der_pred_theirs = tf_checker.get_train_prediction_deriv(remove_idx, random_train_idx)
        assert np.isclose(der_pred_ours, der_pred_theirs, rtol=1e-2), (der_pred_ours, der_pred_theirs)
Exemplo n.º 33
0
def train_gbm(n_epochs=100):
    """Train a CatBoost classifier on ``./data/df_super.csv`` and save it.

    The dataset built by ``create_gbm_dataset`` is split 90/10. The held-out
    part is passed to ``fit`` as the eval set. The model is written to
    ``./models/gbm.cbm``.

    Args:
        n_epochs: Number of boosting iterations.
    """
    frame = pd.read_csv('./data/df_super.csv')
    features, labels = create_gbm_dataset(frame)
    xtrain, xtest, ytrain, ytest = train_test_split(features, labels,
                                                    train_size=0.9)

    settings = {'iterations': n_epochs, 'learning_rate': 0.01}
    classifier = CatBoostClassifier(**settings)
    classifier.fit(xtrain, ytrain, eval_set=(xtest, ytest))
    classifier.save_model('./models/gbm.cbm')
Exemplo n.º 34
0
def train_call_models():
    """Train one fold-vs-continue classifier per bot name and save each.

    For every entry of the module-level ``names``, the rows of ``y`` with
    that ``private_bot_name`` are selected together with the matching rows
    of ``X``. The action is binarised (``FOLD`` -> 0, ``CALL``/``RAISE`` ->
    1), split 90/10 with ``random_state=1234``, and a classifier is fitted
    with the held-out part as eval set. Models are written to
    ``<model_dir>pool/call/<index>.model``.
    """
    action_codes = {'FOLD': 0, 'CALL': 1, 'RAISE': 1}

    for i, name in enumerate(names):
        print(i, name)
        bot_rows = y[y['private_bot_name'] == name]
        bot_features = X.loc[bot_rows.index]
        bot_labels = bot_rows['private_bot_action'].replace(action_codes)

        X_train, X_test, y_train, y_test = train_test_split(
            bot_features, bot_labels, test_size=0.1, random_state=1234)

        classifier = CatBoostClassifier(iterations=200,
                                        learning_rate=0.1,
                                        depth=8,
                                        thread_count=4,
                                        verbose=True,
                                        use_best_model=True)
        classifier.fit(X_train, y_train, eval_set=(X_test, y_test))

        classifier.save_model(model_dir + 'pool/call/' + str(i) + '.model')
Exemplo n.º 35
0
def generate_ensemble_classification(dataset_name,
                                     params,
                                     alg="sgb",
                                     num_models=10):
    """Train and save ``num_models`` CatBoost Logloss classifiers.

    Each ensemble member differs only in its random seed (``0`` up to
    ``num_models - 1``). Models are saved in ``cbm`` format to
    ``results/models/<dataset_name>_<alg>_<index>``.

    Args:
        dataset_name: Sub-directory of ``datasets`` holding ``full_train``,
            ``test`` and ``pool.cd``.
        params: Mapping with the keys ``'depth'``, ``'lr'`` and ``'sample'``.
        alg: ``'sgb'`` / ``'sgb-fixed'`` for plain Bernoulli-subsampled
            boosting, or ``'sglb'`` / ``'sglb-fixed'`` to additionally set
            ``posterior_sampling=True``.
        num_models: Number of ensemble members to train.

    Raises:
        ValueError: if ``alg`` is not one of the four supported names.
    """
    # Resolve the algorithm up front. An unsupported name used to leave
    # ``model`` unbound and fail later with an UnboundLocalError, after the
    # data had already been loaded.
    if alg in ('sgb', 'sgb-fixed'):
        alg_specific = {}
    elif alg in ('sglb', 'sglb-fixed'):
        alg_specific = {'posterior_sampling': True}
    else:
        raise ValueError(
            "Unsupported alg {!r}; expected one of 'sgb', 'sgb-fixed', "
            "'sglb', 'sglb-fixed'".format(alg))

    # load and prepare data
    data_dir = os.path.join('datasets', dataset_name)
    full_train_file = os.path.join(data_dir, 'full_train')
    test_file = os.path.join(data_dir, 'test')
    cd_file = os.path.join(data_dir, 'pool.cd')

    full_train_pool = Pool(data=full_train_file, column_description=cd_file)
    test_pool = Pool(data=test_file, column_description=cd_file)

    # parameters
    depth = params['depth']
    lr = params['lr']
    sample = params['sample']

    for i in range(num_models):
        # The member index doubles as the seed: a new seed per element.
        model = CatBoostClassifier(loss_function='Logloss',
                                   verbose=False,
                                   learning_rate=lr,
                                   depth=depth,
                                   subsample=sample,
                                   bootstrap_type='Bernoulli',
                                   custom_metric='ZeroOneLoss',
                                   random_seed=i,
                                   **alg_specific)

        # do not use test pool for choosing best iteration
        model.fit(full_train_pool, eval_set=test_pool, use_best_model=False)
        model.save_model("results/models/" + dataset_name + "_" + alg + "_" +
                         str(i),
                         format="cbm")
Exemplo n.º 36
0
def test_metadata():
    """Check that constructor ``metadata`` survives a save/load round trip.

    The model is saved to ``OUTPUT_MODEL_PATH`` and reloaded through
    ``CatBoost(model_file=...)``. Both metadata keys must be present with
    their original values. Returns the result of
    ``compare_canonical_models`` for the saved file.
    """
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    settings = {
        'iterations': 2,
        'random_seed': 0,
        'loss_function': 'Logloss:border=0.5',
        'metadata': {"type": "AAA", "postprocess": "BBB"},
    }
    classifier = CatBoostClassifier(**settings)
    classifier.fit(learn_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)

    reloaded = CatBoost(model_file=OUTPUT_MODEL_PATH)
    assert 'type' in reloaded.metadata_
    assert reloaded.metadata_['type'] == 'AAA'
    assert 'postprocess' in reloaded.metadata_
    assert reloaded.metadata_['postprocess'] == 'BBB'
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 37
0
def main(args):
    """Train a CatBoost classifier on the GBM database and save it.

    The dataset comes from ``get_gbm_database`` using the path, sequence
    length and machine id attributes of ``args``. It is split 90/10 and the
    held-out part is passed to ``fit`` as the eval set. The model is written
    to ``args.checkpoint_path``.
    """
    features, labels = get_gbm_database(
        args.telemetry_path,
        args.maint_path,
        args.machines_path,
        args.errors_path,
        args.failures_path,
        seq_len=args.out_seq_len,
        machine_id=args.machine_id,
    )
    X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                        train_size=0.9)

    settings = {
        'iterations': args.n_iterations,
        'learning_rate': args.gbm_learning_rate,
    }
    classifier = CatBoostClassifier(**settings)
    classifier.fit(X_train, y_train, eval_set=(X_test, y_test))
    classifier.save_model(args.checkpoint_path)
Exemplo n.º 38
0
def test_predict_sklearn_class():
    """Fit a two-iteration classifier with default loss and save it.

    Returns the result of ``compare_canonical_models`` for the file written
    to ``OUTPUT_MODEL_PATH``.
    """
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    settings = {'iterations': 2, 'random_seed': 0}
    classifier = CatBoostClassifier(**settings)
    classifier.fit(learn_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 39
0
def test_priors():
    """Fit with ``has_time=True`` and explicit priors in ``ctr_description``.

    Both the ``Borders`` and the ``Counter`` description carry the same four
    ``Prior=`` values. The model is saved to ``OUTPUT_MODEL_PATH`` and the
    result of ``compare_canonical_models`` for that path is returned.
    """
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    settings = {
        'iterations': 5,
        'random_seed': 0,
        'has_time': True,
        'ctr_description': [
            "Borders:Prior=0:Prior=0.6:Prior=1:Prior=5",
            "Counter:Prior=0:Prior=0.6:Prior=1:Prior=5",
        ],
    }
    classifier = CatBoostClassifier(**settings)
    classifier.fit(learn_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 40
0
def test_class_weights():
    """Fit a five-iteration classifier with ``class_weights=[1, 2]``.

    Returns the result of ``compare_canonical_models`` for the file written
    to ``OUTPUT_MODEL_PATH``.
    """
    weighted_classes = [1, 2]
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    classifier = CatBoostClassifier(iterations=5,
                                    random_seed=0,
                                    class_weights=weighted_classes)
    classifier.fit(learn_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 41
0
def test_classification_ctr():
    """Fit with the ``Borders`` and ``Counter`` CTR descriptions.

    Returns the result of ``compare_canonical_models`` for the file written
    to ``OUTPUT_MODEL_PATH``.
    """
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    settings = {
        'iterations': 5,
        'random_seed': 0,
        'ctr_description': ['Borders', 'Counter'],
    }
    classifier = CatBoostClassifier(**settings)
    classifier.fit(learn_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)