Пример #1
0
def test_verbose_int(verbose):
    """Check how many lines ``cv`` and ``train`` log for a given ``verbose``.

    The captured output of each call must contain 3 lines for ``verbose=5``,
    0 lines for ``False`` and 10 lines for ``True``.
    """
    line_counts = {5: 3, False: 0, True: 10}
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    tmpfile = 'test_data_dumps'

    # Same scenario for both entry points: first cross-validation, then training.
    for run in (cv, train):
        settings = {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"}
        with LogStdout(open(tmpfile, 'w')):
            run(pool, settings, verbose=verbose)
        with open(tmpfile, 'r') as captured:
            logged = sum(1 for _ in captured)
            assert(logged == line_counts[verbose])

    return local_canonical_file(remove_time_from_json(JSON_LOG_PATH))
Пример #2
0
def test_cv_query():
    """Cross-validate with QueryRMSE and require a strictly decreasing train metric."""
    metric = "train-QueryRMSE-mean"
    settings = {"iterations": 5, "random_seed": 0, "loss_function": "QueryRMSE"}
    results = cv(Pool(QUERYWISE_TRAIN_FILE, column_description=QUERYWISE_CD_FILE), settings)
    assert metric in results

    # Each recorded value has to be strictly below the one before it.
    previous = results[metric][0]
    for current in results[metric][1:]:
        assert current < previous
        previous = current
Пример #3
0
def test_cv_pairs():
    """Cross-validate with PairLogit on a pool with pairs; train metric must strictly decrease."""
    metric = "train-PairLogit-mean"
    pool = Pool(
        QUERYWISE_TRAIN_FILE,
        column_description=QUERYWISE_CD_FILE,
        pairs=QUERYWISE_TRAIN_PAIRS_FILE,
    )
    settings = {"iterations": 5, "random_seed": 8, "loss_function": "PairLogit"}
    results = cv(pool, settings)
    assert metric in results

    # Each recorded value has to be strictly below the one before it.
    previous = results[metric][0]
    for current in results[metric][1:]:
        assert current < previous
        previous = current
Пример #4
0
def test_cv():
    """Cross-validate with Logloss and require a strictly decreasing train metric."""
    metric = "train-Logloss-mean"
    settings = {"iterations": 5, "random_seed": 0, "loss_function": "Logloss"}
    results = cv(Pool(TRAIN_FILE, column_description=CD_FILE), settings)
    assert metric in results

    # Each recorded value has to be strictly below the one before it.
    previous = results[metric][0]
    for current in results[metric][1:]:
        assert current < previous
        previous = current
Пример #5
0
def test_cv():
    """Call ``cv(params, pool)`` and check the dict result has a strictly decreasing train Logloss."""
    metric = "Logloss_train_avg"
    settings = {"iterations": 5, "random_seed": 0, "loss_function": "Logloss"}
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    # This snippet passes the parameters first and the pool second.
    results = cv(settings, pool)
    assert isinstance(results, dict)
    assert metric in results

    previous = results[metric][0]
    for current in results[metric][1:]:
        assert current < previous
        previous = current
Пример #6
0
def test_cv_pairs():
    """Run PairLogit cross-validation on the pairs pool and check the train metric keeps falling."""
    column = "train-PairLogit-mean"
    pool = Pool(QUERYWISE_TRAIN_FILE, column_description=QUERYWISE_CD_FILE, pairs=QUERYWISE_TRAIN_PAIRS_FILE)
    results = cv(pool, {"iterations": 5, "random_seed": 8, "loss_function": "PairLogit"})
    assert column in results

    # Walk the history once, comparing every value with its predecessor.
    last_seen = results[column][0]
    for score in results[column][1:]:
        assert score < last_seen
        last_seen = score
Пример #7
0
    def fit(self, X_train, y_train):
        """Choose an iteration count from cross-validation, then fit a classifier.

        ``self.params['iterations']`` is overwritten with the chosen count and
        the fitted ``CatBoostClassifier`` is stored in ``self.model``.
        """
        cv_history = cv(Pool(X_train, y_train), self.params)

        metric_column = 'test-{}-mean'.format(self.metric)
        # Index of the largest mean test metric, scaled by 1.5, plus one.
        best_rounds = int(cv_history[metric_column].idxmax() * 1.5) + 1
        print('Best Iteration: {}'.format(best_rounds))

        self.params['iterations'] = best_rounds
        self.model = CatBoostClassifier(**self.params)
        self.model.fit(X_train, y_train)
Пример #8
0
def cv_model(params: dict, positions: tuple, date: pd.Timestamp,
             data_pool_func):
    """Cross-validate the model by RMSE normalised by the labels' standard deviation.

    Checks that the iteration cap was not reached and returns RMSE, R2 and the
    model parameters with the best iteration count, shaped as a hyperopt
    objective result.

    Parameters
    ----------
    params
        Dictionary of parameters: key 'data' holds the data parameters,
        key 'model' holds the model parameters
    positions
        Tuple of tickers to cross-validate
    date
        Date to cross-validate for
    data_pool_func
        Function returning a catboost.Pool with the data

    Returns
    -------
    dict
        Result in hyperopt format:
        key 'loss' - cross-validation RMSE divided by the label std (for hyperopt),
        key 'status' - success status (for hyperopt),
        key 'std' - cross-validation RMSE,
        key 'r2' - 1 minus the squared normalised RMSE,
        key 'data' - data parameters,
        key 'model' - model parameters with the best cross-validation
        iteration count written into 'iterations'

    Raises
    ------
    ValueError
        If the cross-validation result has exactly MAX_ITERATIONS rows
    """
    data_params = params["data"]
    pool = data_pool_func(positions, date, **data_params)
    label_std = np.array(pool.get_label()).std()
    model_params = make_model_params(params)
    scores = catboost.cv(pool=pool, params=model_params, fold_count=FOLDS_COUNT)

    # The message below asks the user to increase MAX_ITERATIONS.
    if len(scores) == MAX_ITERATIONS:
        raise ValueError(
            f"Необходимо увеличить MAX_ITERATIONS = {MAX_ITERATIONS}")

    best_index = scores["test-RMSE-mean"].idxmin()
    model_params["iterations"] = best_index + 1
    best_rmse = scores.loc[best_index, "test-RMSE-mean"]
    return {
        "loss": best_rmse / label_std,
        "status": hyperopt.STATUS_OK,
        "std": best_rmse,
        "r2": 1 - (best_rmse / label_std)**2,
        "data": data_params,
        "model": model_params,
    }
Пример #9
0
    def objective(space):
        """Hyperopt objective: score one hyper-parameter sample and return its loss.

        ``space`` is unpacked into a dict and merged over the base parameters.
        ``how``, ``metric_name``, ``random_state`` and ``pools`` are not defined
        in this function; they are read from an outer scope.

        Returns a dict with ``loss`` (``1.0 - score``) and ``status``.
        Raises ``co.TennisAbortError`` when, on every fifth trial,
        ``is_quit_pressed()`` is truthy, and ``Exception`` for an unknown ``how``.
        """
        global best_score, trials_count
        #       if os.path.isdir('./catboost_info'):
        #           shutil.rmtree('./catboost_info', ignore_errors=True)
        trials_count += 1
        # Poll for a user abort only on every fifth trial.
        if (trials_count % 5) == 0 and is_quit_pressed():
            raise co.TennisAbortError
        args_dct = dict(**space)
        params = {
            "eval_metric": metric_name,
            # 'eval_metric': 'Logloss',
            "random_seed": random_state,
            "logging_level": "Silent",
        }
        # Sampled values override the base settings above.
        params.update(args_dct)
        if how == "cv":
            # Score is the maximum of the mean test metric over the CV history.
            cv_data = cv(pools.train, params, stratified=True)
            scr_val = np.max(cv_data[f"test-{metric_name}-mean"])
        elif how == "sklearn":
            # Fit on the train pool and score with ROC AUC on the eval pool.
            mdl = CatBoostClassifier(**params)
            mdl.fit(pools.train)
            pred = mdl.predict_proba(pools.eval)[:, 1]
            scr_val = roc_auc_score(pools.eval.y, pred)
        elif how == "native":
            mdl = CatBoost(params)
            mdl.fit(
                pools.train,
                eval_set=None,  # pools.eval if pools.eval else None,
                silent=True,
            )  # eval_set=pools.eval
            pred = mdl.predict(pools.eval, prediction_type="Probability")[:, 1]
            scr_val = roc_auc_score(pools.eval.get_label(), pred)
        else:
            raise Exception("bad how arg {}".format(how))

        #       pred = mdl.predict(data.X_test)
        #       scr_val = precision_score(data.y_test, pred)

        # Track and report the best score seen so far across trials.
        if scr_val > best_score:
            if how == "cv":
                cco.out("achieved best {} at {}".format(scr_val, params))
            else:
                cco.out("achieved best {} at {} lrate: {} ntrees: {}".format(
                    scr_val, mdl.get_params(), mdl.learning_rate_,
                    mdl.tree_count_))
            best_score = scr_val
        # The score is maximised, so the loss handed back is its complement.
        return {"loss": 1.0 - scr_val, "status": STATUS_OK}
Пример #10
0
 def evaluate_model(self):
     """Cross-validate ``self.model``'s parameters and return the smallest value of the third result column.

     The full cross-validation result is stored in ``self.scores``.
     """
     train_pool = catboost.Pool(
         self.X_train,
         self.y_train,
         cat_features=self.categorical_columns_indices,
     )
     model_params = self.model.get_params()
     scores = catboost.cv(
         train_pool,
         model_params,
         nfold=self.n_fold,
         stratified=self.is_stratified,
         seed=self.seed,
         early_stopping_rounds=self.early_stopping_rounds,
         shuffle=self.is_shuffle,
         plot=False,
     )
     self.scores = scores
     # Column position 2 is used as the test-score column.
     return scores.iloc[:, 2].min()
Пример #11
0
def hyperopt_objective(params):
    """Return ``1 - best mean test F1`` from cross-validating the sampled settings.

    ``params`` supplies ``l2_leaf_reg`` (cast to int) and ``learning_rate``;
    the remaining classifier settings are fixed here.
    """
    classifier_settings = dict(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=1000,
        eval_metric='F1',
        random_seed=42,
        verbose=False,
        loss_function='Logloss',
    )
    model = CatBoostClassifier(**classifier_settings)

    train_pool = Pool(data, data_label, cat_features=categorical_features_indices)
    cv_data = cv(train_pool, model.get_params())

    # F1 is maximised, so its complement is returned as the value to minimise.
    return 1 - np.max(cv_data['test-F1-mean'])
def cv_catboost(data):
    """Run 5-fold CatBoost cross-validation with an RMSE loss.

    ``data`` is converted to a ``DataFrame``; column ``3`` is used as the
    label and all remaining columns as features.

    Returns whatever ``cv`` returns for the run.
    """
    data = pd.DataFrame(data)
    X_tr = data.drop(3, axis=1)
    y_tr = data[3]

    params = {
        "iterations": 200,
        "depth": 2,
        "loss_function": "RMSE",
        "verbose": True,
    }

    cv_dataset = Pool(data=X_tr, label=y_tr)

    # ``plot`` is a boolean flag; the string "True" only worked by being truthy.
    scores = cv(cv_dataset, params, fold_count=5, plot=True)

    return scores
Пример #13
0
    def eval_train(self):
        """Cross-validate the model's parameters with Logloss and print the best mean test accuracy."""
        cv_params = self.model.get_params()
        cv_params['loss_function'] = 'Logloss'
        cv_data = cv(Pool(self.X, self.y), cv_params)

        accuracy_mean = cv_data['test-Accuracy-mean']
        best_accuracy = np.max(accuracy_mean)
        best_step = np.argmax(accuracy_mean)
        accuracy_std = cv_data['test-Accuracy-std'][best_step]

        summary = 'Best validation accuracy score: {:.2f}±{:.2f} on step {}'
        print(summary.format(best_accuracy, accuracy_std, best_step))

        print('Precise validation accuracy score: {}'.format(best_accuracy))
Пример #14
0
    def _model_cross_validation(self):
        """Cross-validate the model's parameters, keep the result in ``self.cv_data`` and print the best accuracy."""
        train_pool = Pool(self.X,
                          self.y,
                          cat_features=self.categorical_features_indices)
        self.cv_data = cv(train_pool, self.model.get_params(), plot=False)

        # The result holds the metric at every boosting step, aggregated
        # over the cross-validation folds.
        accuracy_mean = self.cv_data['test-Accuracy-mean']
        best_accuracy = np.max(accuracy_mean)
        best_step = np.argmax(accuracy_mean)
        accuracy_std = self.cv_data['test-Accuracy-std'][best_step]

        summary = 'Best validation accuracy score: {:.2f}±{:.2f} on step {}'
        print(summary.format(best_accuracy, accuracy_std, best_step))

        print('Precise validation accuracy score: {}'.format(best_accuracy))
Пример #15
0
        def objective(space_params):
            """Hyperopt objective: clean up one sampled parameter dict and cross-validate it.

            ``space_params`` is modified in place: integer parameters are cast,
            the nested ``bootstrap_type`` / ``grow_policy`` choices are
            flattened, and two values are clamped to their lower bounds.
            ``integer_params``, ``train`` and ``N_FOLDS`` are read from an
            outer scope.

            Returns a dict with ``loss`` (last value of the mean test MAE) and
            ``status``.
            """

            # Cast integer params from float to int.
            for param in integer_params:
                space_params[param] = int(space_params[param])

            # Extract nested conditional parameters: each choice is a dict
            # holding the selected option plus its option-specific settings.
            if space_params['bootstrap_type']['bootstrap_type'] == 'Bayesian':
                bagging_temp = space_params['bootstrap_type'].get(
                    'bagging_temperature')
                space_params['bagging_temperature'] = bagging_temp

            if space_params['grow_policy']['grow_policy'] == 'LossGuide':
                max_leaves = space_params['grow_policy'].get('max_leaves')
                space_params['max_leaves'] = int(max_leaves)

            # Replace each nested dict with the plain option name.
            space_params['bootstrap_type'] = space_params['bootstrap_type'][
                'bootstrap_type']
            space_params['grow_policy'] = space_params['grow_policy'][
                'grow_policy']

            # random_strength cannot be < 0.
            space_params['random_strength'] = max(
                space_params['random_strength'], 0)
            # fold_len_multiplier cannot be < 1.
            space_params['fold_len_multiplier'] = max(
                space_params['fold_len_multiplier'], 1)

            # For classification set stratified=True.
            cv_results = cb.cv(train,
                               space_params,
                               fold_count=N_FOLDS,
                               early_stopping_rounds=25,
                               stratified=False,
                               partition_random_seed=42)

            best_loss = cv_results['test-MAE-mean'].iloc[
                -1]  # use 'test-RMSE-mean' for RMSE
            # For classification, comment out the line above and uncomment the line below:
            #best_loss = cv_results['test-Logloss-mean'].iloc[-1]
            # If necessary, replace 'test-Logloss-mean' with 'test-[your-preferred-metric]-mean'.

            return {'loss': best_loss, 'status': STATUS_OK}
Пример #16
0
def CAT(data, label):
    """Time a CatBoost multi-class cross-validation run and print summary statistics.

    Prints one minus the lowest, highest and final mean test
    MultiClassOneVsAll value, the index of the lowest value, and the elapsed
    wall-clock time. ``num_class``, ``boost_rounds`` and ``cv_fold`` are read
    from module scope.
    """
    pool = cat.Pool(data, label, has_header=False)
    params = {
        "loss_function": 'MultiClassOneVsAll',
        "eval_metric": 'MultiClassOneVsAll',
        "max_depth": 7,
        "learning_rate": 0.2,
        "classes_count": num_class,
        "task_type": 'CPU',
        "thread_count": 6,
        "verbose_eval": False,
    }

    started = datetime.datetime.now()
    results = cat.cv(pool=pool, params=params, num_boost_round=boost_rounds,
                     fold_count=cv_fold, shuffle=True, stratified=True,
                     verbose=False)
    finished = datetime.datetime.now()

    test_mean = results['test-MultiClassOneVsAll-mean']
    lowest = min(test_mean)
    # Index label of the first row that attains the lowest value.
    lowest_index = test_mean[test_mean == lowest].index[0]

    print("CatBoost")
    print("najlepsi priemer: " + str(1 - lowest))
    print("index najlepsieho: " + str(lowest_index))
    print("najhorsi priemer: " + str(1 - max(test_mean)))
    print("finalny priemer: " + str(1 - test_mean.iloc[-1]))
    print("cas: " + str(finished - started))
    print('\n')
Пример #17
0
def cross_validation(
    clf,
    pool: Pool,
    metric_name,
    fold_count=5,
    stratified=True,
    early_stopping_rounds=None,
    plot=False,
):
    """Cross-validate ``clf``'s parameters with a Logloss loss and print two summaries.

    metric_name: 'AUC', 'Precision', 'Accuracy'... (a higher-is-better
    metric; its best value is taken as the maximum of the mean test column).

    Output has the form:
    Best test <metric_name> score: <mean>+-<std> on step <step>
    Best validation Logloss score: <mean>+-<std> on step <step>

    Logloss is lower-is-better, so its best score is the minimum of the mean
    test Logloss and the step at which that minimum occurs.
    """
    assert isinstance(pool, Pool)
    print(f"cross_validation start with metric {metric_name} "
          f"fold_count {fold_count} stratified {stratified} "
          f"early_stopping {early_stopping_rounds}")
    cv_params = clf.get_params()
    cv_params.update({"loss_function": "Logloss"})
    cv_data = cv(
        pool,
        cv_params,
        fold_count=fold_count,
        stratified=stratified,
        plot=plot,
        early_stopping_rounds=early_stopping_rounds,
    )

    test_metric_pref = "test-" + metric_name
    metric_mean = cv_data[test_metric_pref + "-mean"]
    best_metric_step = np.argmax(metric_mean)
    print("Best test {} score: {:.2f}+-{:.2f} on step {}".format(
        metric_name,
        np.max(metric_mean),
        cv_data[test_metric_pref + "-std"][best_metric_step],
        best_metric_step,
    ))

    # Using max/argmax here would report the worst Logloss, not the best.
    logloss_mean = cv_data["test-Logloss-mean"]
    best_logloss_step = np.argmin(logloss_mean)
    print("Best validation Logloss score: {:.2f}+-{:.2f} on step {}".format(
        np.min(logloss_mean),
        cv_data["test-Logloss-std"][best_logloss_step],
        best_logloss_step,
    ))
Пример #18
0
def make_catboost(conf: dict, df, log_path):
    """Select the k best features, cross-validate CatBoost on them and return the best mean test F1.

    ``conf['feature_selection']['k_best_features']`` is passed to the feature
    selector; ``df['X']`` and ``df['y']`` supply features and labels;
    ``log_path`` parts are joined with '->' in the log messages.
    """
    run_name = '->'.join(log_path)
    X_new = univariate_feature_selection(
        df['X'], df['y'], conf['feature_selection']['k_best_features'])

    logger.info('{} is running. X shape = {}'.format(run_name, X_new.shape))

    params = {
        "loss_function": "Logloss",
        "early_stopping_rounds": 30,
        "verbose": True,
        "custom_metric": ["Accuracy", "F1", "Recall", "Precision"],
        "eval_metric": 'F1',
    }
    scores = cv(Pool(data=X_new, label=df['y']), params, fold_count=5, verbose=False)

    best_f1 = scores['test-F1-mean'].max()
    logger.info('{} is done. output = {}'.format(run_name, best_f1))
    return best_f1
Пример #19
0
    def train_all_save_catboost(self, X, y, categorical_features_indices):
        """Cross-validate, fit on the whole data set and save the model for later predictions.

        Prints a cross-validation summary and the prettified feature
        importances, then writes the fitted model to 'catboost_model.dump'.
        """
        model = CatBoostClassifier(loss_function='MultiClass',
                                   eval_metric='TotalF1',
                                   random_seed=42,
                                   leaf_estimation_method='Newton')
        # Cross-validation uses the unfitted model's parameters.
        cv_data = cv(Pool(X, y, cat_features=categorical_features_indices),
                     model.get_params())
        # NOTE(review): np.max is applied to the whole CV result rather than a
        # single metric column, so this may not print one score - confirm intent.
        print("precise validation accuracy score:{}".format(np.max(cv_data)))
        model.fit(X, y, cat_features=categorical_features_indices)

        # Feature importance
        print(model.get_feature_importance(prettified=True))
        # train = Pool(X, y, cat_features=categorical_features_indices)
        # feature_importances = model.get_feature_importance(train)
        # feature_names = X.columns
        # for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
        #     print('{}: {}'.format(name, score))

        model.save_model('catboost_model.dump')
        print("Catboost model has been saved!")
Пример #20
0
def train_test_dataset(X, y, cat_features=None, auto_class_weights=None,
                       loss_function='Logloss', iterations=2000, metrics=['AUC']):
    """Run 5-fold CatBoost cross-validation on ``X``/``y`` and return its result.

    ``metrics`` is passed as ``custom_metric``; the other keyword arguments
    are forwarded into the parameter dict under their own names. Plotting is
    enabled and the logging level is 'Info'.
    """
    # NOTE(review): ``metrics`` has a mutable list default shared across
    # calls; this function itself never mutates it.
    dataset = Pool(data=X, label=y, cat_features=cat_features)
    cv_params = dict(
        task_type='CPU',
        auto_class_weights=auto_class_weights,
        custom_metric=metrics,
        verbose=False,
        loss_function=loss_function,
        iterations=iterations,
    )
    return cv(dataset, cv_params, fold_count=5, plot=True, logging_level='Info')
Пример #21
0
 def model_cv(self, model, folds):
     '''Run model cross-validation and print the best mean test RMSE.

     ``model.get_params()`` supplies the parameters and ``folds`` the fold
     count; ``self.pool`` is the data. The printed value, its standard
     deviation and its step all refer to the minimum of 'test-RMSE-mean'.
     '''
     cv_params = model.get_params()

     print('{}\nStart model crossvalidation proccess'.format(split_line))
     cv_data = cv(
         self.pool,
         cv_params,
         fold_count=folds,
         verbose=200,
         early_stopping_rounds=20,
     )

     rmse_mean = cv_data['test-RMSE-mean']
     # RMSE is lower-is-better: std and step must come from the row holding
     # the minimum, not from argmax, which points at the worst row.
     best_step = np.argmin(rmse_mean)
     print('Best validation accuracy score: {:.4f}±{:.4f} on step {}'.format(
         np.min(rmse_mean),
         cv_data['test-RMSE-std'][best_step],
         best_step))
Пример #22
0
    def cv(self,
           params_model=None,
           nfold=5,
           num_boost_round=10000,
           early_stopping_rounds=100,
           **kwargs):
        """Cross-validate on the training set and return the evaluation history.

        When ``params_model`` is ``None`` the stored ``self.params_best_fit``
        is used instead. Extra keyword arguments are forwarded to ``cgb.cv``.
        """
        chosen_params = self.params_best_fit if params_model is None else params_model
        train_pool = self.get_train_set(as_cgb_pool=True)

        return cgb.cv(params=chosen_params,
                      dtrain=train_pool,
                      nfold=nfold,
                      verbose_eval=True,
                      num_boost_round=num_boost_round,
                      early_stopping_rounds=early_stopping_rounds,
                      **kwargs)
Пример #23
0
        def hyperopt_objective(params):
            """Return the lowest mean test RMSE from cross-validating the sampled settings.

            ``params`` supplies ``l2_leaf_reg`` (cast to int), ``learning_rate``
            and ``tree_depth``; data comes from the enclosing ``self``.
            """
            model = CatBoostRegressor(
                l2_leaf_reg=int(params['l2_leaf_reg']),
                learning_rate=params['learning_rate'],
                depth=params['tree_depth'],
                eval_metric='RMSE',
                random_seed=42,
                logging_level='Silent',
            )

            model_params = model.get_params()
            train_pool = Pool(self.X_train,
                              self.y_train,
                              cat_features=self.cat_index)
            cv_data = cv(params=model_params, pool=train_pool)

            # hyperopt minimises, and lower RMSE is better, so return it as is.
            best_rmse = np.min(cv_data['test-RMSE-mean'])
            print('params is', params, 'rmse is ', best_rmse)
            return best_rmse
def hyperopt_objective(params):
    """Return ``1 - best mean test accuracy`` from cross-validating the sampled settings.

    ``params`` supplies ``l2_leaf_reg`` (cast to int) and ``depth``; the data
    comes from the module-level ``train_pool``.
    """
    classifier_settings = dict(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        depth=params['depth'],
        iterations=500,
        eval_metric='Accuracy',
        od_type='Iter',
        od_wait=40,
        random_seed=42,
        logging_level='Silent',
        allow_writing_files=False,
    )
    model = CatBoostClassifier(**classifier_settings)

    cv_data = cv(train_pool, model.get_params())
    best_accuracy = np.max(cv_data['test-Accuracy-mean'])

    print(params, best_accuracy)
    # hyperopt minimises, so hand back the complement of the accuracy.
    return 1 - best_accuracy
Пример #25
0
    def ctb_crossval(self, params, optim_type):
        '''Cross-validate CatBoost with the given hyper-parameters.

        Parameters
        ----------
        params: hyper-parameters as a dict, coming from one of the optimisation methods
        optim_type: one of Optuna, Hyperopt, RandomSearch (not used in this method's body)

        Returns
        -------
        loss, params, n_estimators, run_time

        Also updates ``self.estimator`` and ``self.loss`` when the new loss is
        lower than the stored one.
        '''
        started = timer()
        print('trial using : ', params)

        cv_results = cb.cv(self.train_set,
                           params,
                           fold_count=N_FOLDS,
                           num_boost_round=NUM_BOOST_ROUNDS,
                           early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                           stratified=True,
                           partition_random_seed=SEED,
                           verbose_eval=True,
                           plot=False)
        # Only the cross-validation call itself is timed.
        run_time = timer() - started

        f1_mean = cv_results['test-F1-mean']
        # F1 is maximised, so the loss to minimise is its complement.
        loss = 1 - np.max(f1_mean)
        # Row position of the best score, converted to a 1-based round count.
        n_estimators = int(np.argmax(f1_mean) + 1)

        if loss < self.loss:
            self.estimator = n_estimators
            self.loss = loss

        return loss, params, n_estimators, run_time
Пример #26
0
    def cross_val(self,
                  nfold=3,
                  shuffle=True,
                  stratified=None,
                  plot=True,
                  partition_random_seed: int = 14):
        """Cross-validate ``self._params`` on the data processor's input and print the best score.

        :param nfold: forwarded to ``cv`` as ``nfold``
        :param shuffle: forwarded to ``cv`` as ``shuffle``
        :param stratified: forwarded to ``cv`` as ``stratified``
        :param plot: forwarded to ``cv`` as ``plot``
        :param partition_random_seed: forwarded to ``cv`` as ``partition_random_seed``
        :return: the cross-validation result returned by ``cv``; the columns
            ``test-<custom_metric>-mean`` and ``test-<custom_metric>-std`` are
            read from it for the printed summary
        """
        from catboost import Pool, cv
        import numpy as np

        features, labels, cat_cols = self._data_processor.cv_input_fn()
        cv_result = cv(Pool(data=features, label=labels, cat_features=cat_cols),
                       self._params,
                       nfold=nfold,
                       shuffle=shuffle,
                       stratified=stratified,
                       plot=plot,
                       partition_random_seed=partition_random_seed)

        metric = self._params['custom_metric']
        mean_scores = cv_result[f'test-{metric}-mean']
        best_score = np.max(mean_scores)
        best_step = np.argmax(mean_scores)
        score_std = cv_result[f'test-{metric}-std'][best_step]

        print('Best validation {} score: {:.2f}±{:.2f} on step {}'.format(
            metric, best_score, score_std, best_step))
        print('Precise validation {} score: {}'.format(metric, best_score))
        return cv_result
def catboosttrainer(X, y, features, initparam, modelname, modelpath, docpath, cvfold=5):
    """Pick an iteration count by cross-validation, fit a classifier, export importances and pickle it.

    ``initparam`` is modified in place: 'iterations' is set from the length of
    the cross-validation result, and 'od_wait' and 'od_type' are removed.
    Feature importances are written to ``<docpath>/<modelname>.xlsx`` and the
    fitted classifier is pickled to ``<modelpath>/<modelname>``.

    Returns ``(cvresult, clf, initparam, dfimp)``.
    """
    print("searching for optimal iteration count...")
    train_pool = cat.Pool(X[features], y)
    cv_history = cat.cv(params=initparam, fold_count=cvfold, pool=train_pool, stratified=True)

    # Length of the CV history minus (od_wait + 1).
    initparam['iterations'] = len(cv_history) - (initparam['od_wait'] + 1)
    for detector_key in ('od_wait', 'od_type'):
        del initparam[detector_key]
    print("optimal iteration count is ", initparam['iterations'])

    print("fitting model...")
    classifier = cat.CatBoostClassifier(**initparam)
    classifier.fit(train_pool)

    importances = classifier.get_feature_importance(train_pool, fstr_type='FeatureImportance')
    importance_frame = pd.DataFrame(importances, columns=['CatBoostImportance'])
    importance_frame.insert(0, column='Feature', value=features)
    importance_frame = importance_frame.sort_values(
        ['CatBoostImportance', 'Feature'], ascending=False)
    importance_frame.to_excel(os.path.join(docpath, modelname + ".xlsx"))

    print("pickling model...")
    with open(os.path.join(modelpath, modelname), 'wb') as fout:
        pickle.dump(classifier, fout)
    return cv_history, classifier, initparam, importance_frame
Пример #28
0
        def objetive(trial):
            """Optuna-style objective: sample hyper-parameters, cross-validate and return the final score.

            ``params``, ``d_train``, ``folds``, ``study`` and ``save_study_as``
            are not defined here; they are read from an outer scope. The
            outer ``params`` dict is updated in place on every call.
            """

            # Overwrite the tunable entries with this trial's suggestions.
            params.update({
                "boosting_type":
                trial.suggest_categorical("boosting_type",
                                          ['Ordered', 'Plain']),
                "learning_rate":
                trial.suggest_loguniform("learning_rate", 0.005, 0.1),
                "max_depth":
                trial.suggest_int("max_depth", 4, 12),
                "l2_leaf_reg":
                trial.suggest_loguniform("l2_leaf_reg", 1e-4, 1e4),
                "border_count":
                trial.suggest_int('border_count', 1, 255),
                "random_strength":
                trial.suggest_loguniform("random_strength", 1e-4, 1e4),
                "bagging_temperature":
                trial.suggest_loguniform("bagging_temperature", 1e-4, 1e4),
            })

            cv_results = catboost.cv(params=params,
                                     pool=d_train,
                                     iterations=10000,
                                     early_stopping_rounds=50,
                                     folds=folds,
                                     verbose_eval=None,
                                     as_pandas=False)

            # The second key of the result is used as the metric to report.
            # NOTE(review): relies on the key order of the returned mapping - confirm.
            rmetric_name = list(cv_results.keys())[1]
            # Score is the value at the last recorded iteration, not the minimum.
            score = cv_results[rmetric_name][
                -1]  # np.min(cv_results[rmetric_name])

            print("Num_boost_round: " + str(len(cv_results[rmetric_name])))

            # Persist the study after every trial when a path was given.
            if save_study_as is not None:
                joblib.dump(study, save_study_as)

            return score
Пример #29
0
def cgb_fit(config, X_train, y_train):
    """Train a model (optionally with cross-validation) and return it with its best result.

    Args:
        config: settings object; this function reads ``params``,
            ``max_round``, ``cv_folds``, ``seed`` and ``save_model_path``.
        X_train: array-like, shape = n_sample * n_feature
        y_train: shape = n_sample * 1

    Returns:
        best_model: the trained model.
        best_auc: with cross-validation, the maximum of 'AUC_test_avg';
            otherwise ``best_model.best_score``.
        best_round: with cross-validation, the position of that maximum;
            otherwise ``best_model.best_iteration``.
        cv_result: the cross-validation result, or ``None`` when
            ``config.cv_folds`` is ``None``.
    """
    params = config.params
    max_round = config.max_round
    cv_folds = config.cv_folds
    seed = config.seed
    save_model_path = config.save_model_path
    if cv_folds is not None:
        dtrain = cgb.Pool(X_train, label=y_train)
        # Pool and params are passed by keyword so their positional order
        # cannot be mixed up.
        cv_result = cgb.cv(pool=dtrain, params=params, num_boost_round=max_round,
                           nfold=cv_folds, seed=seed, logging_level='Verbose')
        # Best score and the position at which it occurs.
        auc_test_avg = cv_result['AUC_test_avg']
        # NOTE(review): argmax is a 0-based position; confirm whether the
        # retrain below should use best_round + 1 iterations.
        best_round = np.argmax(auc_test_avg)
        best_auc = np.max(auc_test_avg)
        best_model = cgb.train(pool=dtrain, params=params, num_boost_round=best_round)
    else:
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train, y_train, test_size=0.2, random_state=100)
        dtrain = cgb.Pool(X_train, label=y_train)
        dvalid = cgb.Pool(X_valid, label=y_valid)
        # Same keyword form as the branch above; the two branches previously
        # disagreed on the positional order of pool and params.
        best_model = cgb.train(pool=dtrain, params=params,
                               num_boost_round=max_round, eval_set=dvalid)
        best_round = best_model.best_iteration
        best_auc = best_model.best_score
        cv_result = None
    if save_model_path:
        check_path(save_model_path)
        # Close the file deterministically instead of leaking the handle.
        with open(save_model_path, 'wb') as model_file:
            pickle.dump(best_model, model_file)
    return best_model, best_auc, best_round, cv_result
def cbfunc(border_count, l2_leaf_reg, depth, learning_rate):
    """Target function for a maximising optimiser: ``1 / (lowest mean test MAE)``.

    ``border_count`` and ``depth`` arrive as floats and are rounded to the
    nearest integer; ``l2_leaf_reg`` and ``learning_rate`` are used as given.
    ``esrounds``, ``brounds``, ``xtrain``, ``ytrain`` and ``cat_features``
    are read from module scope.
    """
    params = {
        'eval_metric': 'MAE',  # using MAE here, could also be RMSE or MSE
        'early_stopping_rounds': esrounds,
        'num_boost_round': brounds,
        'use_best_model': True,
        'task_type': "GPU"
    }

    # round(x, 0) returns a float; these two settings are integer-valued,
    # so convert explicitly after rounding.
    params['border_count'] = int(round(border_count))
    params['l2_leaf_reg'] = l2_leaf_reg
    params['depth'] = int(round(depth))
    params['learning_rate'] = learning_rate

    # Cross validation
    cv_results = cb.cv(cb.Pool(xtrain, ytrain, cat_features=cat_features),
                       params=params,
                       fold_count=3,
                       inverted=False,
                       partition_random_seed=5,
                       shuffle=True,
                       logging_level='Silent')
    # The optimiser maximises, so 1/MAE is returned to minimise MAE.
    return 1 / cv_results['test-MAE-mean'].min()
Пример #31
0
 def do_cv(learning_rate, depth, l2_leaf_reg):
     """Cross-validate one MAE configuration, log a CSV line and return the result list.

     The list holds: last mean test MAE, last test MAE std-dev, history
     length, learning rate, depth and l2_leaf_reg.
     """
     param = dict(
         iterations=3000,
         od_type='Iter',
         od_wait=50,
         learning_rate=learning_rate,
         depth=depth,
         l2_leaf_reg=l2_leaf_reg,
         loss_function='MAE',
         eval_metric='MAE',
     )
     train_pool = catboost.Pool(train_x, train_y, cat_inds)
     # Positional call: parameters, pool, then 5.
     eval_hist = catboost.cv(param, train_pool, 5)

     mae_avg = eval_hist['MAE_test_avg']
     mae_std = eval_hist['MAE_test_stddev']
     res_list = [
         mae_avg[-1],
         mae_std[-1],
         len(mae_avg),
         param['learning_rate'],
         param['depth'],
         param['l2_leaf_reg'],
     ]
     write_to_file('%.7f,%.7f,%.0f,%.6f,%.0f,%.0f' % tuple(res_list))
     return res_list
Пример #32
0
    def cv(self, data, clf=None):
        """Cross-validate ``self.params`` on ``data`` and return the CV history.

        The label column is 'validRevenue' when ``clf`` is truthy and
        'totals_transactionRevenue' otherwise; features are the
        ``self.features_name`` columns.
        """
        setseed(self.seed)

        label_column = 'validRevenue' if clf else 'totals_transactionRevenue'
        train_X = data[self.features_name].values
        train_y = data[label_column].values

        train_pool = cat.Pool(data=train_X,
                              label=train_y,
                              feature_names=self.features_name,
                              cat_features=self.categorical_feature)

        return cat.cv(pool=train_pool,
                      params=self.params,
                      nfold=self.nfold,
                      seed=self.seed)
Пример #33
0
def test_verbose_int():
    """Check how many stdout lines ``cv`` and ``train`` emit per ``verbose`` value.

    For each of ``verbose=5``, ``verbose=False`` and ``verbose=True`` the
    captured output must contain 2, 0 and 10 lines respectively. Each
    ``train`` run writes to its own JSON log, and the time-stripped logs are
    returned as canonical files.
    """
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    tmpfile = 'test_data_dumps'
    # (verbose argument, expected number of captured stdout lines)
    expectations = ((5, 2), (False, 0), (True, 10))

    for verbosity, line_count in expectations:
        with LogStdout(open(tmpfile, 'w')):
            cv(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"}, verbose=verbosity)
        with open(tmpfile, 'r') as output:
            assert sum(1 for _ in output) == line_count

    # One JSON log per train run: the run index is inserted before the last
    # five characters of JSON_LOG_PATH.
    log_files = [JSON_LOG_PATH[:-5] + str(i) + JSON_LOG_PATH[-5:] for i in range(3)]

    for (verbosity, line_count), log_file in zip(expectations, log_files):
        train_params = {"iterations": 10, "random_seed": 0, "loss_function": "Logloss", "json_log": log_file}
        with LogStdout(open(tmpfile, 'w')):
            train(pool, train_params, verbose=verbosity)
        with open(tmpfile, 'r') as output:
            assert sum(1 for _ in output) == line_count

    return [local_canonical_file(remove_time_from_json(log_file)) for log_file in log_files]
Пример #34
0
# Predict on the held-out pool with the already-fitted model.
y_pred = model.predict(test_pool)

# Model evaluation
print(f'R2: {r2_score(y_test, y_pred)}')

# Feature importance
print(f'feature_importance = {model.get_feature_importance(train_pool)}')

# Save the model
model.save_model('regression.cbm', format="cbm")

# Calculate the RMSE metric for the objects in the given dataset.
# NOTE(review): which metric score() returns depends on the model class and
# catboost version — confirm it is RMSE here.
print(f'score: {model.score(X_train, y_train)}')

# Cross-validation
# Drop 'calc_feature_importance' from config (if present) before config is
# passed on as the cv params; the removed value is kept in `args`.
args = config.pop('calc_feature_importance', None)
print(config)
# Keyword arguments for cv(), unpacked in the call below.
config1 = {
    'pool': train_pool,
    'params': config,
    'iterations': 1000,
    'fold_count': 10,
    'partition_random_seed': 120,
    'logging_level': 'Verbose',
    'stratified': False,
    'as_pandas': True
}

scores = cv(**config1)
print(f'CV result is: {scores}')
Пример #35
0
# Select the feature columns and replace missing values with empty strings.
X_train = train[features].fillna('')
y_train = train['Survived']
X_test = test[features].fillna('')

model = catboost.CatBoostClassifier(one_hot_max_size=4,
                                    iterations=100,
                                    random_seed=0,
                                    verbose=False,
                                    eval_metric='Accuracy')

# Feature columns at positions 0 and 2 are declared categorical.
pool = catboost.Pool(X_train, y_train, cat_features=[0, 2])
print(
    'To see the Catboost plots, fork this kernel and run it in the editing mode.'
)
# 10-fold cross-validation using the classifier's own parameters.
cv_scores = catboost.cv(pool, model.get_params(), fold_count=10, plot=True)
# Report the last entry of the 'test-Accuracy-mean' column.
print('CV score: {:.5f}'.format(cv_scores['test-Accuracy-mean'].values[-1]))

# You can check yourself the public LB score of this model (0.77990) by submitting the file `submission2.csv` from the section Output of this kernel.

# In[ ]:

# Fit on the full training pool, then predict integer labels for the test set.
model.fit(pool)
pred = model.predict(X_test).astype('int')
# Place PassengerId and the predicted Survived column side by side.
output = pd.concat(
    [test['PassengerId'],
     pd.DataFrame(pred, columns=['Survived'])], axis=1)
output.to_csv('submission2.csv', index=False)

# Next I consider the generated feature `Boy`. It takes the value 1 if the title in `Name` is "Master" and the value 0 otherwise.
#
Пример #36
0
def test_cv_with_not_binarized_target():
    """Run a 5-iteration Logloss cv on the 'adult_not_binarized' data.

    Returns the canonical file built from the JSON log with time removed.
    """
    dataset = 'adult_not_binarized'
    pool = Pool(
        data_file(dataset, 'train_small'),
        column_description=data_file(dataset, 'train.cd'),
    )
    cv_params = {"iterations": 5, "random_seed": 0, "loss_function": "Logloss"}
    cv(pool, cv_params)
    cleaned_log = remove_time_from_json(JSON_LOG_PATH)
    return local_canonical_file(cleaned_log)
Пример #37
0
def test_bad_params_in_cv():
    """Expect a ``UserWarning`` when cv is given params containing ``use_best_model``."""
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    bad_params = {
        "iterations": 5,
        "random_seed": 0,
        "loss_function": "Logloss",
        "use_best_model": True,
    }
    # Argument order (params first, pool second) is kept exactly as in the
    # original call.
    with pytest.warns(UserWarning):
        cv(bad_params, pool)
Пример #38
0
def test_cv_with_not_binarized_target():
    """Cross-validate with Logloss on the 'adult_not_binarized' sample.

    The pool is loaded from the 'train_small' file with the 'train.cd' column
    description; the time-stripped JSON log is returned as a canonical file.
    """
    folder = 'adult_not_binarized'
    train_path = data_file(folder, 'train_small')
    cd_path = data_file(folder, 'train.cd')
    params = {"iterations": 5, "random_seed": 0, "loss_function": "Logloss"}

    cv(Pool(train_path, column_description=cd_path), params)

    return local_canonical_file(remove_time_from_json(JSON_LOG_PATH))
Пример #39
0
def test_cv_logging():
    """Run a short Logloss cv and return its time-stripped JSON log as a canonical file."""
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    cv_params = {"iterations": 5, "random_seed": 0, "loss_function": "Logloss"}
    cv(pool, cv_params)
    cleaned_log = remove_time_from_json(JSON_LOG_PATH)
    return local_canonical_file(cleaned_log)