Example No. 1
    def fit_predict(self, X_train, y_train, X_valid, y_valid, X_test,
                    **kwargs):
        clf = CatBoostClassifier(thread_count=30)  # TODO: embedding_features
        if self.params is not None:
            clf.set_params(**self.params)
            # print(clf.get_params())

        # eval_set = [(X_train, y_train), (X_valid, y_valid)]
        self.clf = clf.fit(
            X_train,
            y_train,
            eval_set=(
                X_valid, y_valid
            ),  # CatBoostError: Multiple eval sets are not supported on GPU
            # Only one of parameters ['verbose', 'logging_level', 'verbose_eval', 'silent'] should be set
            verbose=100,
            early_stopping_rounds=100,
            use_best_model=True,
            plot=True,
            **kwargs)
        # evals_result = self.clf.evals_result()

        valid_predict = clf.predict_proba(X_valid)
        test_predict = clf.predict_proba(X_test)
        return valid_predict, test_predict
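
A self-contained sketch of the same fit pattern on synthetic data; the data, iteration count, and thread count below are illustrative assumptions, not part of the original snippet:

import numpy as np
from catboost import CatBoostClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 20))
y = (X[:, 0] + 0.5 * rng.normal(size=1000) > 0).astype(int)
X_train, y_train, X_valid, y_valid = X[:700], y[:700], X[700:], y[700:]

clf = CatBoostClassifier(iterations=500, thread_count=4)
clf.fit(X_train, y_train,
        eval_set=(X_valid, y_valid),  # a single eval set works on CPU and GPU
        verbose=100,                  # log every 100 iterations
        early_stopping_rounds=100,    # stop when the eval metric stalls
        use_best_model=True)          # shrink to the best iteration on eval_set
print(clf.predict_proba(X_valid).shape)  # (300, 2)
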
Example No. 2

def _test_prediction_consistency(leaf_method):
    base_dir = 'data/adult/'
    train_documents, train_targets = read_train_documents_and_one_hot_targets(
        base_dir + 'train_data_catboost_format.tsv'
    )
    train_targets = np.argmax(train_targets, axis=1)

    test_documents, test_targets = read_train_documents_and_one_hot_targets(
        base_dir + 'test_data_catboost_format.tsv'
    )

    train_dir = base_dir + 'ut_tmp/'
    if not isdir(train_dir):
        mkdir(train_dir)
    cbc_params = read_json_params(base_dir + 'catboost_params.json')
    cbc_params['leaf_estimation_method'] = leaf_method
    cbc_params['random_seed'] = 10
    cbc_params['train_dir'] = train_dir
    cbc = CatBoostClassifier(**cbc_params)
    cbc.set_params(boosting_type='Plain')
    cbc.fit(train_documents, train_targets)
    cbc.save_model(train_dir + 'model.bin', format='cbm')
    export_catboost_to_json(train_dir + 'model.bin', train_dir + 'model.json')
    full_model = CBLeafInfluenceEnsemble(train_dir + 'model.json', train_documents, train_targets,
                                         learning_rate=cbc_params['learning_rate'],
                                         loss_function=BinaryCrossEntropyLoss(),
                                         leaf_method=leaf_method,
                                         update_set='AllPoints')
    assert np.allclose(full_model(train_documents), cbc.predict(train_documents, prediction_type='RawFormulaVal'), rtol=0.001),\
        [(a,b)
         for a, b in zip(full_model(train_documents), cbc.predict(train_documents, prediction_type='RawFormulaVal'))
         if not np.allclose(a, b)]
    assert np.allclose(full_model(test_documents), cbc.predict(test_documents, prediction_type='RawFormulaVal'), rtol=0.001)
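
The assertions above compare against prediction_type='RawFormulaVal', which is the raw log-odds score; predict_proba is its sigmoid. A minimal check of that relationship, on illustrative data:

import numpy as np
from catboost import CatBoostClassifier

X = np.random.RandomState(0).normal(size=(200, 5))
y = (X[:, 0] > 0).astype(int)
clf = CatBoostClassifier(iterations=50, verbose=False).fit(X, y)

raw = clf.predict(X, prediction_type='RawFormulaVal')  # raw score F(x)
proba = clf.predict_proba(X)[:, 1]                     # sigmoid(F(x))
assert np.allclose(proba, 1.0 / (1.0 + np.exp(-raw)))
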
Example No. 3
def catboost_fit_predict(train_documents, train_targets, test_documents, prediction_type='RawFormulaVal',
                         **catboost_params):
    # Pull gpu_ram_part out so it can be applied via set_params() after the
    # remaining parameters are passed to the constructor.
    gpu_ram_part = catboost_params.pop('gpu_ram_part', None)
    cbc = CatBoostClassifier(**catboost_params)
    if gpu_ram_part is not None:
        cbc.set_params(gpu_ram_part=gpu_ram_part)
    cbc.fit(train_documents, train_targets)
    return cbc.predict(test_documents, prediction_type)
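
A possible call of this helper on synthetic data (the arrays and keyword values are illustrative):

import numpy as np

X = np.random.RandomState(1).normal(size=(300, 8))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

# **catboost_params is forwarded to CatBoostClassifier
raw_scores = catboost_fit_predict(X[:200], y[:200], X[200:],
                                  iterations=100, verbose=False)
print(raw_scores[:3])  # raw log-odds for the first three test rows
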
Example No. 4

def _test_influence_vs_tf_derivative(leaf_method):
    base_dir = 'data/adult/'
    train_documents, train_targets = read_train_documents_and_one_hot_targets(
        base_dir + 'train_data_catboost_format.tsv'
    )
    train_documents = train_documents[:100]
    train_targets = train_targets[:100]

    train_targets = np.argmax(train_targets, axis=1)

    test_documents, test_targets = read_train_documents_and_one_hot_targets(
        base_dir + 'test_data_catboost_format.tsv'
    )
    test_targets = np.argmax(test_targets, axis=1)

    train_dir = base_dir + 'ut_tmp/'
    if not isdir(train_dir):
        mkdir(train_dir)
    cbc_params = read_json_params(base_dir + 'catboost_params.json')
    cbc_params['iterations'] = 2
    cbc_params['leaf_estimation_method'] = leaf_method
    cbc_params['random_seed'] = 10
    cbc_params['train_dir'] = train_dir
    cbc = CatBoostClassifier(**cbc_params)
    cbc.set_params(boosting_type='Plain')
    cbc.fit(train_documents, train_targets)
    cbc.save_model(train_dir + 'model.bin', format='cbm')
    export_catboost_to_json(train_dir + 'model.bin', train_dir + 'model.json')
    full_model = CBLeafInfluenceEnsemble(train_dir + 'model.json', train_documents, train_targets,
                                         leaf_method=leaf_method,
                                         learning_rate=cbc_params['learning_rate'],
                                         loss_function=BinaryCrossEntropyLoss(),
                                         update_set='AllPoints')
    retrained_model_our = deepcopy(full_model)
    tf_checker = TFGBApplier(full_model, train_documents, train_targets, leaf_method)
    for remove_idx in np.random.randint(len(train_targets), size=30):
        full_model.fit(remove_idx, retrained_model_our)
        pred_ours = full_model(train_documents)
        pred_theirs = tf_checker.get_predicts()
        pred_cbc = cbc.predict(train_documents, prediction_type='RawFormulaVal')
        assert np.allclose(pred_ours, pred_theirs, rtol=1e-3) and np.allclose(pred_ours, pred_cbc, rtol=1e-3), (pred_ours, pred_theirs)

        der_ours = [t.leaf_values for t in retrained_model_our.influence_trees]
        der_theirs = tf_checker.get_derivs(remove_idx)
        assert all(np.allclose(o, t, rtol=1e-2) for o, t in zip(der_ours, der_theirs)), (der_ours, der_theirs)

        random_train_idx = np.random.randint(len(train_targets))
        der_pred_ours = retrained_model_our.loss_derivative(train_documents[[random_train_idx]],
                                                            train_targets[[random_train_idx]])[0]
        der_pred_theirs = tf_checker.get_train_prediction_deriv(remove_idx, random_train_idx)
        assert np.isclose(der_pred_ours, der_pred_theirs, rtol=1e-2), (der_pred_ours, der_pred_theirs)
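
The test validates analytic derivatives against an independent TensorFlow implementation. The same validation idea in miniature is a finite-difference check (entirely illustrative, using a toy loss):

import numpy as np

def loss(w):  # toy quadratic loss standing in for the ensemble's loss
    return np.sum((w - 2.0) ** 2)

w0 = np.array([1.0, 3.0])
analytic = 2.0 * (w0 - 2.0)  # exact gradient
eps = 1e-6
numeric = np.array([(loss(w0 + eps * e) - loss(w0 - eps * e)) / (2 * eps)
                    for e in np.eye(2)])  # central differences
assert np.allclose(analytic, numeric, rtol=1e-2)
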
Example No. 5
class CatboostEnsemble(Ensemble):
    def __init__(self, params: dict, dataset: Dataset = None):
        super().__init__(params, dataset, name='CatboostEnsemble')
        self.clf = CatBoostClassifier(**params)
        self.trees = []  # populated by fit(); predict_proba() guards on this
        self.tmp_json_path = '/tmp/catboost.model.json'

    def fit(self, dataset: Dataset):
        self.set_dataset(dataset)

        loss_function = ('MultiClass'
                         if self.dataset.num_classes() > 2 else 'Logloss')
        self.clf.set_params(loss_function=loss_function, verbose=False)

        self.clf.fit(self.dataset.X, self.dataset.y)

        self.clf.save_model(self.tmp_json_path, format='json')
        with open(self.tmp_json_path, 'r') as fp:
            model = json.load(fp)

        self.trees = [
            CatboostTree.parse(tree, self.dataset)
            for tree in model['oblivious_trees']
        ]

    def predict_proba(self, dataset: Dataset) -> np.ndarray:
        if len(self.trees) == 0:
            raise ValueError('There are no trees available')

        encoded_dataset = self.encode_dataset(dataset)

        n_classes = len(self.clf.classes_)  # pylint: disable=no-member

        # TODO: For single tree this is just [tree.predict(...)]
        preds = np.array(
            [tree.predict(encoded_dataset.X) for tree in self.trees])
        preds = np.sum(preds, axis=0)

        if n_classes > 2:
            # https://catboost.ai/docs/concepts/loss-functions-multiclassification.html
            # Link above suggests different equation for this
            # results_proba = softmax(preds, axis=1)
            raise NotImplementedError('Only binary problems are implemented.')
        else:
            results_proba = np.array([[1 - v, v] for v in expit(preds)])

        return results_proba

    def predict(self, dataset: Dataset) -> np.ndarray:
        results_proba = self.predict_proba(dataset)
        results_cls = np.argmax(results_proba, axis=1)
        return results_cls
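
The JSON export that CatboostTree.parse consumes can be inspected directly; a quick sketch on illustrative data:

import json
import numpy as np
from catboost import CatBoostClassifier

X = np.random.RandomState(0).normal(size=(200, 4))
y = (X[:, 0] > 0).astype(int)
clf = CatBoostClassifier(iterations=10, verbose=False).fit(X, y)

clf.save_model('/tmp/catboost.model.json', format='json')
with open('/tmp/catboost.model.json') as fp:
    model = json.load(fp)
print(len(model['oblivious_trees']))  # one entry per boosted tree -> 10
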
Example No. 6
class CatBoost:
    _verbose = 200
    _train_dir = DATA_CACHE_DIR
    _is_gpu_available = get_gpu_device_count()
    _task_type = "GPU" if _is_gpu_available > 0 else None
    _devices = "0" if _is_gpu_available > 0 else None  # device indices, e.g. "0" or "0:1"

    def __init__(self, model_id, num_input_features, num_output_classes,
                 model_save_path, **aux_params):
        self.model = CatBoostClassifier(loss_function="MultiClass",
                                        task_type=self._task_type,
                                        devices=self._devices,
                                        train_dir=self._train_dir,
                                        random_seed=SEED)
        self.model.set_params(**aux_params)
        self.model_id = model_id

        path = f"{model_save_path}/{model_id}"
        os.makedirs(path, exist_ok=True)
        self.model_path = path
        self.modelfile_save_path = os.path.join(path, STANDARD_MODEL_NAME)

    def load(self):
        self.model.load_model(self.modelfile_save_path)

    def save(self):
        self.model.save_model(self.modelfile_save_path)

    def fit(self, X_train, y_train, X_valid, y_valid):
        self.model.fit(Pool(X_train, y_train),
                       eval_set=(X_valid, y_valid),
                       use_best_model=True,
                       verbose=self._verbose)
        self.save()

    def predict(self, X, load=False):
        if load:
            self.load()
        return self.model.predict_proba(X)

    def explain(self, X_train, y_train, features, classes):
        importances = self.model.get_feature_importance(
            data=Pool(X_train, y_train))
        plot_importance(importances, features, self.model_path, self.model_id)
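
The GPU autodetection used by the class can be exercised on its own; a minimal sketch (the device string "0" is an assumption for a single-GPU machine):

from catboost import CatBoostClassifier
from catboost.utils import get_gpu_device_count

n_gpus = get_gpu_device_count()
task_type = "GPU" if n_gpus > 0 else None  # None falls back to CatBoost's CPU default
devices = "0" if n_gpus > 0 else None      # device indices such as "0" or "0:1"

clf = CatBoostClassifier(loss_function="MultiClass", task_type=task_type,
                         devices=devices, iterations=100)
print(task_type or "CPU")
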
Example No. 7
    def modelCatBoostClassifier(self, trial: optuna.trial.Trial):
        # Only CatBoost-compatible parameters (or their supported aliases) are
        # tuned: LightGBM-only names such as min_child_weight, min_split_gain,
        # subsample_freq, and reg_alpha are not valid CatBoost parameters and
        # make fit() raise.
        opt_params = dict(
            num_leaves=trial.suggest_int("num_leaves", 2, 2**8),
            learning_rate=trial.suggest_float('learning_rate', 0.001, 1,
                                              step=0.001),
            n_estimators=trial.suggest_int("n_estimators", 2, 2**10, log=True),
            min_child_samples=trial.suggest_int('min_child_samples', 2, 2**8),
            subsample=trial.suggest_float('subsample', 0.4, 1),
            colsample_bylevel=trial.suggest_float('colsample_bylevel', 0.4, 1),
            reg_lambda=trial.suggest_float('reg_lambda', 1e-8, 10, log=True),
        )
        # num_leaves and min_child_samples require a non-symmetric grow policy
        clf = CatBoostClassifier(grow_policy='Lossguide')
        clf.set_params(**{**opt_params, **self.params})
        return clf
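
Factories like this are consumed inside an Optuna objective; a compact self-contained sketch of that loop (data, search space, and trial budget are illustrative):

import numpy as np
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score

X = np.random.RandomState(0).normal(size=(400, 10))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

def objective(trial):
    clf = CatBoostClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 300, log=True),
        learning_rate=trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        depth=trial.suggest_int('depth', 3, 8),
        verbose=False)
    return cross_val_score(clf, X, y, cv=3, scoring='roc_auc').mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)
print(study.best_params)
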
Example No. 8
class CatBoostClassifierModel(BaseModel):
    def __init__(self, categorical_features_indices, params):
        super().__init__(params)
        self.categorical_features_indices = categorical_features_indices
        self.name = 'CatBoostClassifier'
        self.cv = StratifiedKFold(5, shuffle=True, random_state=1)
        self.metrics = {
            'QWK': qwk_score,
        }

    def fit(self, X, y):
        train_pool = Pool(X, y, cat_features=self.categorical_features_indices)

        self.model = CatBoostClassifier()
        self.model.set_params(**self.params)
        self.model.fit(train_pool)

        return self.model

    def extract_shap_values(self, X):
        explainer = shap.TreeExplainer(self.model)
        shap_values = explainer.shap_values(
            Pool(X, cat_features=self.categorical_features_indices))
        return shap_values
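
CatBoost can also compute SHAP values natively, without the shap package; a sketch of the equivalent call on illustrative data:

import numpy as np
from catboost import CatBoostClassifier, Pool

X = np.random.RandomState(0).normal(size=(200, 6))
y = (X[:, 0] > 0).astype(int)
clf = CatBoostClassifier(iterations=50, verbose=False).fit(X, y)

shap_vals = clf.get_feature_importance(Pool(X, y), type='ShapValues')
print(shap_vals.shape)  # (n_samples, n_features + 1); the last column is the bias
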
Example No. 9
model_catboost_val = CatBoostClassifier(
          eval_metric='AUC',
          iterations=20000, # Very high value, to find the optimum
          od_type='Iter', # Overfitting detector set to "iterations" or number of trees
          random_seed=RS, # Random seed for reproducibility
          verbose=100) # Shows train/test metric every "verbose" trees

# "Technical" parameters of the model:
params = {'objective': 'Logloss',
          'learning_rate': 0.01, # Learning rate (lower -> slower but better prediction)
          'depth': 5, # Depth of the trees (between 5 and 10; higher -> more overfitting)
          'l2_leaf_reg': 10, # L2 regularization (between 3 and 20; higher -> less overfitting)
          'rsm': 0.7, # Fraction of features considered per split (lower -> faster, less overfitting)
          'bootstrap_type': 'Bayesian'} # Bayesian bootstrap for sampling observation weights

model_catboost_val.set_params(**params)

print('\nCatboost Fit (Validation)...\n')
model_catboost_val.fit(X=pool_tr,
                       eval_set=pool_val,
                       early_stopping_rounds=esr)

# 2) Cross-Validation (Catboost)
################################################################################

# 2.1) k-Fold Cross-Validation Function
################################################################################
from sklearn.model_selection import StratifiedKFold

def Model_cv(MODEL, k, X_train, X_test, y, RE, makepred=True, CatPos=None):
    # Create the k folds
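
The excerpt ends just as the folds are created. A minimal sketch of how such a loop typically continues; this is an assumption about the truncated function, not the original author's code:

import numpy as np
from sklearn.model_selection import StratifiedKFold

def model_cv_sketch(model, k, X_train, y, seed):
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
    oof = np.zeros(len(y))
    for tr_idx, va_idx in skf.split(X_train, y):
        m = model.copy()  # CatBoost models expose copy() for a fresh clone
        m.fit(X_train[tr_idx], y[tr_idx],
              eval_set=(X_train[va_idx], y[va_idx]))
        oof[va_idx] = m.predict_proba(X_train[va_idx])[:, 1]
    return oof
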
Example No. 10
class CatBoostWrapper:

    params: Dict[str, Any]

    def __init__(self):
        self.base_params = dict()
        self.base_params["iterations"] = 400
        #        self.base_params["used_ram_limit"] = '512mb'
        self.base_params["one_hot_max_size"] = 10
        self.base_params["nan_mode"] = 'Min'
        self.base_params["depth"] = 5
        self.base_params["learning_rate"] = 0.01
        self.base_params["random_strength"] = 1.5
        self.base_params["bagging_temperature"] = 1.5

        self.params = deepcopy(self.base_params)

        max_depth_list = (list(range(2, 11)))

        self.param_space = {
            'depth': max_depth_list,
            'learning_rate': (0.005, 0.01, 0.05, 0.1, 0.3)
        }
        self.param_rules = {}

        self.hyperopt_param_space = {
            # 'num_leaves': hp.choice('num_leaves', [5,10,20,30,50,70,100]),
            # 'subsample': hp.choice('subsample', [0.7,0.8,0.9,1]),
            # 'colsample_bytree': hp.choice('colsample_bytree', [0.5,0.6,0.7,0.8,0.9,1]),
            # 'min_child_weight': hp.choice('min_child_weight', [5,10,15,20,30,50]),
            # 'learning_rate': hp.choice('learning_rate', [0.02,0.03,0.05,0.07,0.1,0.2])
            'depth': hp.choice('depth', max_depth_list),
            'learning_rate': hp.loguniform('learning_rate', np.log(0.005),
                                           np.log(0.3))

        self.estimator = None
        self.mode = None
        self.category_indices = None

    def get_regressor(self, category_indices):
        self.params["loss_function"] = "RMSE"
        self.mode = 'regression'
        self.estimator = CatBoostRegressor(**self.params)
        return self.get_model(category_indices)

    def get_classifier(self, category_indices):
        self.mode = 'classification'
        self.estimator = CatBoostClassifier(**self.params)
        return self.get_model(category_indices)

    def get_model(self, category_indices):
        # self.estimator.set_params(**params)
        self.category_indices = category_indices
        return Model(self, self.param_space, self.param_rules)

    def set_params(self, params):
        for key in params.keys():
            self.params[key] = params[key]
        self.estimator.set_params(**self.params)

    def set_final_params(self):
        pass
        # self.set_params({'learning_rate': 0.001})
        # self.num_iterations = 600
        # self.set_params({'learning_rate': 0.01})

    def fit(self, x, y=None):
        if self.mode == 'classification':
            pos_weight = x[y < 0.5].shape[0] / x[y > 0.5].shape[0]
            self.set_params({"scale_pos_weight": pos_weight})

        # use_best_model requires an eval_set, which this fit() does not take,
        # so the flag is omitted here.
        self.estimator.fit(x, y, logging_level='Silent')

    def predict(self, x):
        return self.estimator.predict(x)
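
The scale_pos_weight heuristic in fit() is simply the negative/positive class ratio; in isolation, on illustrative data:

import numpy as np
from catboost import CatBoostClassifier

y = np.array([0] * 90 + [1] * 10)
X = np.random.RandomState(0).normal(size=(100, 3))

pos_weight = (y < 0.5).sum() / (y > 0.5).sum()  # 90 / 10 = 9.0
clf = CatBoostClassifier(iterations=50, scale_pos_weight=pos_weight,
                         verbose=False).fit(X, y)
print(pos_weight)
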
Example No. 11
class CatBoostClassifierCV(object):
    """cross_val_predict"""
    def __init__(self, params=None, cv=5, random_state=None, n_repeats=None):
        self.clf = CatBoostClassifier()
        if params:
            self.clf.set_params(**params)
        if n_repeats:
            self._kf = RepeatedStratifiedKFold(n_splits=cv,
                                               n_repeats=n_repeats,
                                               random_state=random_state)
            self._num_preds = cv * n_repeats
        else:
            self._kf = StratifiedKFold(cv, shuffle=True,
                                       random_state=random_state)
            self._num_preds = cv

    def fit(self,
            X,
            y,
            X_test,
            feval=roc_auc_score,
            cat_features=None,
            sample_weight=None,
            verbose=100,
            early_stopping_rounds=100,
            plot=False,
            silent=None,
            logging_level=None,
            column_description=None,
            save_snapshot=None,
            snapshot_file='/fds/data' if cloudml else None,  # `cloudml` is a flag defined elsewhere in the source module
            snapshot_interval=None,
            init_model=None):
        """输入数组"""

        self.oof_train = np.zeros(len(X))
        self.oof_test = np.zeros((len(X_test), self._num_preds))
        for n_fold, (train_index,
                     valid_index) in enumerate(self._kf.split(X, y)):
            if verbose:
                print("\033[94mFold %s started at %s\033[0m" %
                      (n_fold + 1, time.ctime()))
            X_train, y_train = X[train_index], y[train_index]
            X_valid, y_valid = X[valid_index], y[valid_index]
            # eval_set = [(X_train, y_train), (X_valid, y_valid)]

            ########################################################################
            self.clf.fit(X_train,
                         y_train,
                         cat_features=cat_features,
                         sample_weight=sample_weight,
                         use_best_model=True,
                         eval_set=(X_valid, y_valid),
                         verbose=verbose,
                         logging_level=logging_level,
                         plot=plot,
                         column_description=column_description,
                         silent=silent,
                         early_stopping_rounds=early_stopping_rounds,
                         save_snapshot=save_snapshot,
                         snapshot_file=snapshot_file,
                         snapshot_interval=snapshot_interval,
                         init_model=init_model)

            self.oof_train[valid_index] = self.clf.predict_proba(X_valid)[:, 1]
            self.oof_test[:, n_fold] = self.clf.predict_proba(X_test)[:, 1]
            ########################################################################

        # Test-set OOF output
        self.oof_test_rank = pd.DataFrame(self.oof_test).rank().mean(1) / len(
            self.oof_test)
        self.oof_test = self.oof_test.mean(1)

        # Compute the train-set OOF score
        if feval:
            score = feval(y, self.oof_train)
            print(
                f"\n\033[94mCV Score: {score} ended at {time.ctime()}\033[0m")
            return score

    def oof_save(self, file='./oof_train_and_test.csv'):
        assert isinstance(file, str)
        oof = np.append(self.oof_train, self.oof_test)
        pd.DataFrame(oof, columns=['oof_train_and_test']).to_csv(file,
                                                                 index=False)
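
Typical use of the class, assuming NumPy arrays (data and parameters are illustrative):

import numpy as np
from sklearn.metrics import roc_auc_score

X = np.random.RandomState(0).normal(size=(500, 10))
y = (X[:, 0] + X[:, 1] > 0).astype(int)
X_test = np.random.RandomState(1).normal(size=(100, 10))

cv_clf = CatBoostClassifierCV(params={'iterations': 100}, cv=5, random_state=42)
score = cv_clf.fit(X, y, X_test, feval=roc_auc_score, verbose=False)
print(score, cv_clf.oof_test.shape)  # CV AUC and the (100,) mean test OOF
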
Example No. 12
    study.optimize(objective, timeout=60 * 60 * 12)
    joblib.dump(study, 'study_{}.pkl'.format(BOOSTING))
    best_params = study.best_params

else:

    best_params = {
        'loss_function': 'Logloss',
        'custom_loss': ['AUC'],
        'logging_level': 'Silent',
        'early_stopping_rounds': 100
    }

# %%
model.set_params(**best_params)

# %%
seed_everything(RANDOM_STATE)
xx = cross_val_score_auc(model,
                         X_train,
                         y_train,
                         n_fold=N_FOLD,
                         random_state=RANDOM_STATE,
                         predict=True,
                         X_test=X_test,
                         shuffle=True,
                         split_type='stratifiedkfold',
                         return_to_stack=True,
                         submission=sample_submission)
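
The study save/load round-trip via joblib can be checked in isolation (the study below is illustrative):

import joblib
import optuna

study = optuna.create_study(direction='maximize')
study.optimize(lambda t: t.suggest_float('x', 0, 1), n_trials=5)
joblib.dump(study, 'study_demo.pkl')

restored = joblib.load('study_demo.pkl')
print(restored.best_params)
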
Example No. 13
class CatBoost:
    def __init__(self,
                 target,
                 features,
                 weight=None,
                 mode='Regressor',
                 objective='RMSE',
                 logs=True):
        self.model = None
        self.target = target
        self.features = features
        self.mode = mode
        self.weight = weight
        self.logs = logs

        self.model_params = dict(
            thread_count=8,
            iterations=2000,
            loss_function=objective,
            # learning_rate=0.05
        )

        self.training_params = dict(use_best_model=True,
                                    early_stopping_rounds=100,
                                    verbose=100)

    def _set_model_(self):
        if self.mode == 'Regressor':
            self.model = CatBoostRegressor()
        elif self.mode == 'Classifier':
            self.model = CatBoostClassifier()
        else:
            raise ValueError('Unknown mode %s' % self.mode)
        self.model.set_params(**self.model_params)

    def train_with_valid(self, XY):
        X_train, Y_train = XY.train[self.features], XY.train[self.target]
        X_valid, Y_valid = XY.valid[self.features], XY.valid[self.target]
        if self.weight is None:
            train_pool = Pool(data=X_train, label=Y_train)
            val_pool = Pool(data=X_valid, label=Y_valid)
        else:
            W_train, W_valid = XY.train[self.weight], XY.valid[self.weight]
            train_pool = Pool(data=X_train, label=Y_train, weight=W_train)
            val_pool = Pool(data=X_valid, label=Y_valid, weight=W_valid)
        # logging
        print('Training Model CatBoost with validation')
        print('X_train = %s Y_train = %s' % (X_train.shape, Y_train.shape))
        print('X_valid = %s Y_valid = %s' % (X_valid.shape, Y_valid.shape))
        print()
        # training
        self._set_model_()
        self.model = self.model.fit(train_pool,
                                    eval_set=val_pool,
                                    **self.training_params)
        # feature importances
        if self.logs:
            self._logging_feature_importance_(train_pool)

    def predict(self, X):
        X = X[self.features]
        if self.model is None:
            raise RuntimeError('Train the model before calling predict')
        print('Predicting Model CatBoost')
        print('X = %s' % (X.shape, ))
        print()
        data_pool = Pool(data=X)
        # predict
        if self.mode == 'Regressor':
            prediction = self.model.predict(data_pool)
        elif self.mode == 'Classifier':
            prediction = self.model.predict(data_pool,
                                            prediction_type='Probability')
            prediction = prediction[:, 1]
        prediction = pd.DataFrame(prediction,
                                  index=X.index,
                                  columns=[self.target])
        return prediction

    def _logging_feature_importance_(self, train_pool):
        if self.model is None:
            raise RuntimeError('Train the model before logging feature '
                               'importance')
        print('Top features')
        feature_importance = self.model.get_feature_importance(train_pool)
        feature_names = train_pool.get_feature_names()
        for score, name in sorted(zip(feature_importance, feature_names),
                                  reverse=True):
            print('{}: {}'.format(name, score))
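
An illustrative driver for the wrapper; the XY container below is a stand-in for whatever train/valid holder the original code uses:

from collections import namedtuple
import numpy as np
import pandas as pd

XY = namedtuple('XY', ['train', 'valid'])
rng = np.random.RandomState(0)
df = pd.DataFrame(rng.normal(size=(500, 4)), columns=['f1', 'f2', 'f3', 'y'])
df['y'] = (df['y'] > 0).astype(int)
xy = XY(train=df.iloc[:400], valid=df.iloc[400:])

model = CatBoost(target='y', features=['f1', 'f2', 'f3'],
                 mode='Classifier', objective='Logloss', logs=False)
model.train_with_valid(xy)
print(model.predict(xy.valid).head())
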