Example #1
def test_silent():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    tmpfile = 'test_data_dumps'

    with LogStdout(open(tmpfile, 'w')):
        model = CatBoost(dict(iterations=10, silent=True))
        model.fit(pool)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 0)

    with LogStdout(open(tmpfile, 'w')):
        model = CatBoost(dict(iterations=10, silent=True))
        model.fit(pool, silent=False)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 10)

    with LogStdout(open(tmpfile, 'w')):
        train(pool, {'silent': True})
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 0)

    with LogStdout(open(tmpfile, 'w')):
        model = CatBoost(dict(iterations=10, silent=False))
        model.fit(pool, silent=True)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 0)

    with LogStdout(open(tmpfile, 'w')):
        model = CatBoost(dict(iterations=10, verbose=5))
        model.fit(pool, silent=True)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 0)
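
LogStdout is a helper from CatBoost's own test utilities, not part of the public API. A minimal stand-in with the same shape, built on contextlib.redirect_stdout (a sketch under that assumption, not the real implementation):

import contextlib

class LogStdout:
    """Redirect stdout to a file object for the duration of a with-block,
    closing the file on exit -- mimicking how the tests above use it."""
    def __init__(self, stream):
        self._stream = stream
        self._redirect = contextlib.redirect_stdout(stream)

    def __enter__(self):
        return self._redirect.__enter__()

    def __exit__(self, *exc):
        result = self._redirect.__exit__(*exc)
        self._stream.close()
        return result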
Example #2
def cat_cv(train, test, params, fit_params, cat_features, feature_names, nfold,
           seed):
    train.Pred = pd.DataFrame({
        'id': train['样本id'],
        'true': train['收率'],
        'pred': np.zeros(len(train))
    })
    test.Pred = pd.DataFrame({'id': test['样本id'], 'pred': np.zeros(len(test))})
    kfolder = KFold(n_splits=nfold, shuffle=True, random_state=seed)
    cat_tst = cat.Pool(data=test[feature_names],
                       cat_features=cat_features,
                       feature_names=feature_names)
    for fold_id, (trn_idx, val_idx) in enumerate(kfolder.split(train['收率'])):
        print(f'\nFold_{fold_id} Training ================================\n')
        cat_trn = cat.Pool(data=train.iloc[trn_idx][feature_names],
                           label=train.iloc[trn_idx]['收率'],
                           cat_features=cat_features,
                           feature_names=feature_names)
        cat_val = cat.Pool(data=train.iloc[val_idx][feature_names],
                           label=train.iloc[val_idx]['收率'],
                           cat_features=cat_features,
                           feature_names=feature_names)
        model = cat.train(params=params, pool=cat_trn, **fit_params,
                          eval_set=cat_val)
        val_pred = model.predict(train.iloc[val_idx][feature_names])
        train.Pred.loc[val_idx, 'pred'] = val_pred
        print(f'Fold_{fold_id}', mse(train.iloc[val_idx]['收率'], val_pred))
        test.Pred['pred'] += model.predict(cat_tst) / nfold
    print('\n\nCV LOSS:', mse(train.Pred['true'], train.Pred['pred']))
    return test.Pred
Example #3
def test_verbose_int(verbose):
    expected_line_count = {5: 2, False: 0, True: 10}
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    tmpfile = 'test_data_dumps'

    with LogStdout(open(tmpfile, 'w')):
        cv(pool, {
            "iterations": 10,
            "random_seed": 0,
            "loss_function": "Logloss"
        },
           verbose=verbose)
    with open(tmpfile, 'r') as output:
        assert (sum(1 for line in output) == expected_line_count[verbose])

    with LogStdout(open(tmpfile, 'w')):
        train(pool, {
            "iterations": 10,
            "random_seed": 0,
            "loss_function": "Logloss"
        },
              verbose=verbose)
    with open(tmpfile, 'r') as output:
        assert (sum(1 for line in output) == expected_line_count[verbose])

    return local_canonical_file(remove_time_from_json(JSON_LOG_PATH))
Example #4
def test_catboost_numerical_validation():
    ds = vaex.ml.datasets.load_iris()
    features = ['sepal_width', 'petal_length', 'sepal_length', 'petal_width']

    # Vanilla catboost
    dtrain = cb.Pool(ds[features].values, label=ds.data.class_)
    cb_bst = cb.train(params=params_multiclass,
                      dtrain=dtrain,
                      num_boost_round=3)
    cb_pred = cb_bst.predict(dtrain, prediction_type='Probability')

    # catboost through vaex
    booster = vaex.ml.catboost.CatBoostModel(features=features,
                                             params=params_multiclass,
                                             num_boost_round=3)
    booster.fit(ds, ds.class_)
    vaex_pred = booster.predict(ds)

    # Compare the predictions of pure catboost vs those of vaex.ml
    np.testing.assert_equal(
        vaex_pred,
        cb_pred,
        verbose=True,
        err_msg='The predictions of vaex.ml.catboost do not match those of pure catboost')
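
params_multiclass is a fixture defined elsewhere in the test module; a plausible stand-in (our assumption, not the fixture's actual contents):

# Hypothetical stand-in for the params_multiclass fixture referenced above.
params_multiclass = {
    'loss_function': 'MultiClass',
    'learning_rate': 0.1,
    'random_seed': 42,
}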
Example #5
def cgb_cv(df, features, categorical_features, n_folds, param):
    kf = GroupKFold(n_splits=n_folds)
    group_map = dict(zip(np.arange(1, 13),
                         pd.cut(np.arange(1, 13), n_folds, labels=np.arange(n_folds))))
    group = df.timestamp.dt.month.map(group_map)

    models = []
    train_scores = []
    valid_scores = []

    for train_index, val_index in kf.split(df, df['building_id'], groups=group):
        train_X, train_y = df[features].iloc[train_index], df['meter_reading'].iloc[train_index]
        val_X, val_y = df[features].iloc[val_index], df['meter_reading'].iloc[val_index]

        cgb_train = cgb.Pool(train_X, train_y, cat_features=categorical_features)
        cgb_eval = cgb.Pool(val_X, val_y, cat_features=categorical_features)
        gbm = cgb.train(cgb_train, param, eval_set=cgb_eval, verbose=20)

        train_preds = gbm.predict(train_X)
        if use_log1p_target:
            train_preds = np.expm1(train_preds)
            train_y = np.expm1(train_y)
        train_scores.append(rmsle(train_y, train_preds))

        valid_preds = gbm.predict(val_X)
        if use_log1p_target:
            valid_preds = np.expm1(valid_preds)
            val_y = np.expm1(val_y)
        valid_scores.append(rmsle(val_y, valid_preds))

        models.append(gbm)
    return train_scores, valid_scores, models
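
use_log1p_target and rmsle are assumed to exist at module level; minimal stand-ins consistent with how cgb_cv uses them (the flag value and the metric definition are our reading of the snippet, not the original code):

import numpy as np

use_log1p_target = True  # assumed flag: targets were log1p-transformed upstream

def rmsle(y_true, y_pred):
    # Root mean squared logarithmic error; clip negatives so log1p stays defined.
    y_pred = np.clip(y_pred, 0, None)
    return np.sqrt(np.mean((np.log1p(y_true) - np.log1p(y_pred)) ** 2))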
Example #6
    def fit(self, data, clf=None):
        setseed(self.seed)
        train_df = data[data['visitId'].apply(lambda x: x.date())
                        >= datetime.date(2016, 9, 30)]
        val_df = data[data['visitId'].apply(lambda x: x.date())
                      < datetime.date(2016, 9, 30)]
        if clf:
            train_X, train_y = (train_df[self.features_name].values,
                                train_df['validRevenue'].values)
            val_X, val_y = (val_df[self.features_name].values,
                            val_df['validRevenue'].values)
        else:
            train_X, train_y = (train_df[self.features_name].values,
                                train_df['totals_transactionRevenue'].values)
            val_X, val_y = (val_df[self.features_name].values,
                            val_df['totals_transactionRevenue'].values)

        cat_train = cat.Pool(data=train_X,
                             label=train_y,
                             feature_names=self.features_name,
                             cat_features=self.categorical_feature)

        cat_eval = cat.Pool(data=val_X,
                            label=val_y,
                            feature_names=self.features_name,
                            cat_features=self.categorical_feature)

        model_param = (self.params['params_clf'] if clf
                       else self.params['params_reg'])

        self.estimator = cat.train(params=model_param,
                                   pool=cat_train,
                                   eval_set=cat_eval)

        return self
Example #7
def cat_cv_train(X_train, y_train, params, kfold):
    '''
    Train a model with K-fold cross-validation on the given feature and target
    datasets; return a list of classifiers, one fitted model per fold.
    '''
    from catboost import train, Pool
    from sklearn.model_selection import train_test_split, KFold
    
    features = [feature for feature in X_train.columns if feature not in ['target', 'card_id', 'first_active_month']]
    categorical_features = [feature for feature in features if 'feature_' in feature]
    folds = KFold(n_splits=kfold, shuffle=True, random_state=42)
    clf_list = []
    
    for train_idxs, val_idxs in folds.split(X_train.values, y_train.values):
        
        # training set
        train_set = Pool(data=X_train.iloc[train_idxs][features],
                         label=y_train.iloc[train_idxs],
                         cat_features=categorical_features)
        # validation set
        valid_set = Pool(data=X_train.iloc[val_idxs][features],
                         label=y_train.iloc[val_idxs],
                         cat_features=categorical_features)

        # train clf
        clf = train(pool=train_set,
                    params=params,
                    verbose=100,
                    iterations=10000,
                    eval_set=valid_set)
        
        # add current clf to clf_list
        clf_list.append(clf)
        
    return clf_list
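
A short usage sketch for cat_cv_train; X_train, y_train and X_test here are hypothetical DataFrames and the parameter values are illustrative:

import numpy as np

params = {'loss_function': 'RMSE', 'learning_rate': 0.02, 'random_seed': 42}
clf_list = cat_cv_train(X_train, y_train, params, kfold=5)
# Average the per-fold predictions (apply the same feature selection as above).
pred = np.mean([clf.predict(X_test) for clf in clf_list], axis=0)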
Example #8
def main():
    print("load train test datasets")
    train, test = load_train_test()

    y_train_all = train['orderType']
    id_test = test['userid']
    del train['orderType']

    df_columns = train.columns.values
    print('===> feature count: {}'.format(len(df_columns)))

    cat_params = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'learning_rate': 0.1,
        'l2_leaf_reg': 5,  # L2 regularization coefficient.
        'subsample': 0.9,
        'depth': 8,  # Depth of the tree
        'border_count': 255,  # The number of splits for numerical features
        'thread_count': 6,
        'train_dir': 'catboost_train_logs',
        'bootstrap_type': 'Bernoulli',
        'use_best_model': True,
        'random_seed': 42
    }

    pool = Pool(train, y_train_all)

    # cat.train() has no nfold/stratified arguments; cross-validation is cat.cv().
    cv_results = cat.cv(pool=pool,
                        params=cat_params,
                        num_boost_round=4000,
                        nfold=5,
                        seed=42,
                        stratified=True)
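
    # Sketch: recent catboost releases return the cv metrics as a DataFrame
    # with 'test-<metric>-mean' columns (older releases returned a dict).
    # Under that assumption, the best boosting round can be read off directly:
    best_round = int(cv_results['test-AUC-mean'].idxmax())
    print('best round: {}, mean AUC: {:.5f}'.format(
        best_round, cv_results['test-AUC-mean'].max()))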
Example #9
    def _cv(self, dataset: tk.data.Dataset,
            folds: tk.validation.FoldsType) -> None:
        import catboost

        assert isinstance(dataset.data, pd.DataFrame)

        self.train_pool_ = catboost.Pool(
            data=dataset.data,
            label=dataset.labels,
            group_id=dataset.groups,
            feature_names=dataset.data.columns.values.tolist(),
            cat_features=dataset.data.select_dtypes("object").columns.values,
        )

        self.gbms_, score_list = [], []
        for fold, (train_indices, val_indices) in enumerate(folds):
            with tk.log.trace(f"fold{fold}"):
                gbm = catboost.train(
                    params=self.params,
                    pool=self.train_pool_.slice(train_indices),
                    eval_set=self.train_pool_.slice(val_indices),
                    **(self.cv_params or {}),
                )
                self.gbms_.append(gbm)
                score_list.append(gbm.get_best_score()["validation"])

        cv_weights = [len(val_indices) for _, val_indices in folds]
        evals: tk.evaluations.EvalsType = {}
        for k in score_list[0]:
            score = [s[k] for s in score_list]
            score = np.float32(np.average(score, weights=cv_weights))
            evals[k] = score
        logger.info(f"cv: {tk.evaluations.to_str(evals)}")
Example #10
def test_verbose_int(verbose):
    expected_line_count = {5: 3, False: 0, True: 10}
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    tmpfile = 'test_data_dumps'

    with LogStdout(open(tmpfile, 'w')):
        cv(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"}, verbose=verbose)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == expected_line_count[verbose])

    with LogStdout(open(tmpfile, 'w')):
        train(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"}, verbose=verbose)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == expected_line_count[verbose])

    return local_canonical_file(remove_time_from_json(JSON_LOG_PATH))
Example #11
    def generate_submit(self, num_boost_round=None, from_model_saved=False):
        assert num_boost_round is not None

        if not from_model_saved:
            dtrain = self.get_train_set(as_cgb_pool=True)

            booster = cgb.train(
                dtrain=dtrain,
                params=self.params_best_fit,
                num_boost_round=num_boost_round)

            self.save_model(booster)

        else:
            # CatBoost() takes no model_file argument; load the file explicitly.
            booster = cgb.CatBoost()
            booster.load_model(from_model_saved)

        dftest = self.get_test_set(as_cgb_pool=True)

        with Timer("Predicting"):
            probas = booster.predict(dftest, prediction_type="Probability")
            dfpred = pd.DataFrame(probas)[[1]]  # keep the class-one probability column
            dfpred = dfpred.rename(columns={1: 'target'})

        now = pd.Timestamp.now(tz='CET').strftime("%d-%Hh-%Mm")

        fpath = RESULT_DIR / "catboost_submit_{}.csv".format(now)

        with Timer('Storing in {}'.format(fpath)):
            dfpred.to_csv(fpath, index=False)
Example #12
def train_model(
    df_train,
    df_valid,
    model_params,
    general_params,
):
    train_pool = catboost.Pool(df_train["text"].values,
                               label=df_train["label"].values,
                               text_features=[0])
    valid_pool = catboost.Pool(df_valid["text"].values,
                               label=df_valid["label"].values,
                               text_features=[0])

    model_params = copy.deepcopy(model_params)
    model_params.update({"train_dir": general_params["logdir"]})

    model = catboost.train(
        pool=train_pool,
        eval_set=valid_pool,
        params=model_params,
        verbose=False,
        plot=False,
    )

    model.save_model(os.path.join(general_params["logdir"], "model.cbm"))
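
The saved model can be reloaded for inference with the standard CatBoost loader (a sketch; general_params and valid_pool are the same objects used in train_model above):

import os

import catboost

model = catboost.CatBoost()
model.load_model(os.path.join(general_params["logdir"], "model.cbm"))
probabilities = model.predict(valid_pool, prediction_type="Probability")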
Example #13
def test_shap_complex_ctr():
    pool = Pool([[0, 0, 0], [0, 1, 0], [1, 0, 1], [1, 1, 2]], [0, 0, 5, 8], cat_features=[0, 1, 2])
    model = train(pool, {'random_seed': 12302113, 'iterations': 100})
    shap_values = model.get_feature_importance(fstr_type=EFstrType.ShapValues, data=pool)
    predictions = model.predict(pool)
    assert(len(predictions) == len(shap_values))
    for pred_idx in range(len(predictions)):
        assert(abs(sum(shap_values[pred_idx]) - predictions[pred_idx]) < 1e-9)

    np.savetxt(FIMP_TXT_PATH, shap_values)
    return local_canonical_file(FIMP_TXT_PATH)
Example #14
    def train(self, train_df, eval_df, params=None, use_best_eval=True):
        self.best_round = None
        dtrain = cat.Pool(data=train_df.drop(columns=[self.label]),
                          label=train_df[self.label])
        deval = cat.Pool(data=eval_df.drop(columns=[self.label]),
                         label=eval_df[self.label])
        if params is None:
            use_params = deepcopy(self.opt_params)
        else:
            use_params = deepcopy(params)
        num_round = use_params.pop('num_round')
        if use_best_eval:
            with io.StringIO() as buf, redirect_stdout(buf):
                self.clf = cat.train(params=use_params,
                                     pool=dtrain,
                                     evals=deval,
                                     num_boost_round=num_round)

                output = buf.getvalue().split("\n")
            min_error = np.inf
            min_index = 0

            for idx in range(1, num_round + 1):
                if len(output[idx].split("\t")) == 6:
                    temp = 1 - float(output[idx].split("\t")[2].split(":")[1])
                    if min_error > temp:
                        min_error = temp
                        min_index = int(output[idx].split("\t")[0][:-1])
            print("The minimum is attained in round %d" % (min_index + 1))
            self.best_round = min_index + 1
            return output
        else:
            with io.StringIO() as buf, redirect_stdout(buf):
                self.clf = cat.train(params=use_params,
                                     pool=dtrain,
                                     evals=deval,
                                     num_boost_round=num_round)
                output = buf.getvalue().split("\n")
            self.best_round = num_round
            return output
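
Parsing captured stdout for the best round is fragile; the fitted model exposes the same information directly. A sketch over the same inputs (get_best_iteration() requires an eval set and counts from zero):

clf = cat.train(params=use_params, pool=dtrain, evals=deval,
                num_boost_round=num_round)
best_round = clf.get_best_iteration() + 1  # None if no eval set was given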
Example #15
    def train(self, X_train, X_valid, y_train, y_valid):
        """ ccc """

        # Convert data to CatBoost Pool format.
        ds_train = ctb.Pool(X_train, y_train)
        ds_valid = ctb.Pool(X_valid, y_valid)

        # Set context dependent CatBoost parameters.
        self.params['dtrain'] = ds_train
        self.params['eval_set'] = ds_valid

        # Train using parameters sent by the user.
        return ctb.train(**self.params)
Example #16
    def cv_helper(one_hot_max_size,
                  depth,
                  l2_leaf_reg,
                  random_strength,
                  bagging_temperature):

        # entire dataset, used as the evaluation set when training each fold
        all_data = Pool(data=X_train[features],
                        label=y_train,
                        cat_features=categorical_features)
        # validation RMSE
        RMSE = []
        
        for train_idxs, val_idxs in folds.split(X_train.values, y_train.values):
            
            # training set
            train_data = Pool(data=X_train.iloc[train_idxs][features],
                              label=y_train.iloc[train_idxs],
                              cat_features=categorical_features)

            # validation set
            val_data = Pool(data=X_train.iloc[val_idxs][features],
                            label=y_train.iloc[val_idxs],
                            cat_features=categorical_features)
            # hyperparameters
            params = {
                'eval_metric': 'RMSE',
                'use_best_model': True,
                'loss_function': 'RMSE',
                'learning_rate': 0.02,
                'early_stopping_rounds': 400,
                'border_count': 254,
                'task_type': 'GPU',
                'one_hot_max_size': int(one_hot_max_size),
                'depth': int(depth),
                'l2_leaf_reg': l2_leaf_reg,
                'random_strength': random_strength,
                'bagging_temperature': bagging_temperature
            }
            
            # classifier; note the eval set is the full dataset, so the
            # reported RMSE also covers the training rows and is optimistic
            clf = train(pool=train_data,
                        params=params,
                        verbose=200,
                        iterations=10000,
                        eval_set=all_data)
            
            # add current fold RMSE on all_data
            RMSE.append(clf.best_score_['validation_0']['RMSE'])
            
        return -np.mean(np.array(RMSE))
Example #17
    def validate(self, save_model=True, **kwargs):
        dtrain, dtest = self.get_train_valid_set(as_cgb_pool=True)
        watchlist = [dtrain, dtest]

        booster = cgb.train(
            dtrain=dtrain,
            params=self.params_best_fit,
            eval_set=watchlist,
            **kwargs,
        )

        if save_model:
            self.save_model(booster)
        return booster
Example #18
    def _train_model(self, data):
        print(self.params)
        dtrain = cb.Pool(data.X_train, data.y_train)
        if data.task == 'Ranking':
            dtrain.set_group_id(data.groups)

        start = time.time()
        self.model = cb.train(
            pool=dtrain,
            params=self.params,
        )
        elapsed = time.time() - start

        return elapsed
Example #19
def cgb_fit(config, X_train, y_train):
    """模型(交叉验证)训练,并返回最优迭代次数和最优的结果。
    Args:
        config: xgb 模型参数 {params, max_round, cv_folds, early_stop_round, seed, save_model_path}
        X_train:array like, shape = n_sample * n_feature
        y_train:  shape = n_sample * 1

    Returns:
        best_model: 训练好的最优模型
        best_auc: float, 在测试集上面的 AUC 值。
        best_round: int, 最优迭代次数。
    """
    params = config.params
    max_round = config.max_round
    cv_folds = config.cv_folds
    seed = config.seed
    save_model_path = config.save_model_path
    if cv_folds is not None:
        dtrain = cgb.Pool(X_train, label=y_train)
        cv_result = cgb.cv(dtrain, params, num_boost_round=max_round, nfold=cv_folds, seed=seed, logging_level='Verbose')
        # Best model and best number of iterations
        auc_test_avg = cv_result['AUC_test_avg']
        best_round = np.argmax(auc_test_avg)
        best_auc = np.max(auc_test_avg)  # best AUC value
        best_model = cgb.train(dtrain, params, num_boost_round=best_round)
    else:
        X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=100)
        dtrain = cgb.Pool(X_train, label=y_train)
        dvalid = cgb.Pool(X_valid, label=y_valid)
        # cgb.train takes the pool first, then params.
        best_model = cgb.train(dtrain, params, num_boost_round=max_round, eval_set=dvalid)
        best_round = best_model.get_best_iteration()
        best_auc = best_model.get_best_score()
        cv_result = None
    if save_model_path:
        check_path(save_model_path)
        pickle.dump(best_model, open(save_model_path, 'wb'))
    return best_model, best_auc, best_round, cv_result
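
A model pickled by cgb_fit can be restored the same way it was saved:

import pickle

with open(save_model_path, 'rb') as f:
    best_model = pickle.load(f)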
Example #20
    def fit(self,
            df,
            target,
            evals=None,
            early_stopping_rounds=None,
            verbose_eval=None,
            plot=False,
            **kwargs):
        '''Fit the CatBoostModel model given a DataFrame.
        This method accepts all keyword arguments for the catboost.train method.

        :param df: A vaex DataFrame containing the training features.
        :param target: The column name of the target variable.
        :param evals: A list of DataFrames to be evaluated during training.
            This allows the user to watch performance on the validation sets.
        :param int early_stopping_rounds: Activates early stopping.
        :param bool verbose_eval: Requires at least one item in *evals*.
            If *verbose_eval* is True then the evaluation metric on the validation set is printed at each boosting stage.
        :param bool plot: if True, display an interactive widget in the Jupyter
            notebook of how the train and validation sets score on each boosting iteration.
        '''
        # Ensure strings
        target = vaex.dataframe._ensure_string_from_expression(target)

        data = df[self.features].values
        target_data = df[target].values
        dtrain = catboost.Pool(data=data,
                               label=target_data,
                               **self.pool_params)
        if evals is not None:
            for i, item in enumerate(evals):
                data = item[self.features].values
                target_data = item[target].values
                evals[i] = catboost.Pool(data=data,
                                         label=target_data,
                                         **self.pool_params)

        # This does the actual training/fitting of the catboost model
        self.booster = catboost.train(
            params=self.params,
            dtrain=dtrain,
            num_boost_round=self.num_boost_round,
            evals=evals,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=verbose_eval,
            plot=plot,
            **kwargs)
Example #21
    def test(self, X_test, y_test):
        """Evaluate the model trained with the best parameters and estimators.
        Parameters
        ----------
        X_test: test set; y_test: test labels"""
        self.cat = cb.train(params=self.params, pool=self.train_set)
        self.predictions = self.cat.predict(X_test,
                                            prediction_type="Probability")
        self.predictions = self.predictions[:, 1]
        self.y_test = y_test
        self.X_test = X_test
        print(
            "Model will be trained with best parameters obtained from your choice of optimization model ... \n\n\n"
        )
        print(
            "Model trained with {} estimators on the following parameters: \n{}"
            .format(self.estimator, self.params))
Example #22
    def cat_stacking(self, target=None):

        X = self.train.drop(self.cols_to_drop, axis=1)
        y = self.train[target].values

        cat_models = []
        predict_train = []
        predict_test = []
        predict_val = []

        for fold, (train_ids, val_ids) in enumerate(self.folds.split(X, y)):
            dtrain = cat.Pool(X.iloc[train_ids], y[train_ids])
            dval = cat.Pool(X.iloc[val_ids], y[val_ids])
            model = cat.train(params=self.cat_params,
                              dtrain=dtrain,
                              eval_set=dval,
                              verbose=200,
                              early_stopping_rounds=100)

            pred = model.predict(cat.Pool(X.iloc[val_ids]))
            predict_train = np.concatenate([pred, predict_train])
            if self.mode == 'val':

                predict_val.append(
                    model.predict(
                        cat.Pool(self.val.drop(self.cols_to_drop, axis=1))))

            if self.mode == 'test':
                predict_test.append(
                    model.predict(
                        cat.Pool(self.test.drop(self.drop_columns_test,
                                                axis=1))))

            cat_models.append(model)
        clear_output()
        if self.mode == 'val':
            return predict_train, np.asarray(predict_val).mean(axis=0)
        if self.mode == 'test':
            return predict_train, np.asarray(predict_test).mean(axis=0)
Example #23
    def train_for_threshold(self, features, target='label', num=35000):
        train_df = self.train_[self.train_.ID < num]
        val_df = self.train_[self.train_.ID >= num]

        X_train, y_train = train_df[features].values, train_df[target].values.astype('uint8')
        X_eval, y_eval = val_df[features].values, val_df[target].values.astype('uint8')

        cat_train = Pool(X_train, y_train)
        cat_eval = Pool(X_eval, y_eval)

        cat_model = catboost.train(cat_train, self.params, iterations=10000,
                              eval_set=cat_eval,
                              early_stopping_rounds=200,
                              verbose=500)
        y_pred = cat_model.predict(cat_eval, prediction_type='Probability')[:,1]
        # Get the ground-truth entities of the validation set, plus the predicted probabilities and their corresponding words, in sorted order
        gt_ent, pred_words, pred_proba = sort_val(val_df, y_pred)
        # Get the threshold found by the search
        self.threshold, _ = find_threshold(gt_ent, pred_words, pred_proba)

        return self.threshold
Example #24
    def train_and_predict(self, features, target='label', save=True):
        self.fe = features
        X_train, y_train = self.train_[features].values, self.train_[target].values.astype('uint8')
        X_test = self.test_[self.fe].values

        cat_all = Pool(X_train, y_train)

        model = catboost.train(
            cat_all,
            self.params,
            iterations=10000,
            early_stopping_rounds=200,
            verbose=1000
        )

        self.model = model

        if save:
            save_pkl(model, os.path.join(self.opt['model_train'], 'cat.pkl'))

        self.pred = model.predict(X_test, prediction_type='Probability')[:, 1]

        return self.pred
Example #25
def test_verbose_int():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    tmpfile = 'test_data_dumps'

    with LogStdout(open(tmpfile, 'w')):
        cv(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"}, verbose=5)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 2)
    with LogStdout(open(tmpfile, 'w')):
        cv(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"}, verbose=False)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 0)
    with LogStdout(open(tmpfile, 'w')):
        cv(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"}, verbose=True)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 10)

    log_files = []
    for i in range(3):
        log_files.append(JSON_LOG_PATH[:-5]+str(i)+JSON_LOG_PATH[-5:])

    with LogStdout(open(tmpfile, 'w')):
        train(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss", "json_log": log_files[0]}, verbose=5)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 2)
    with LogStdout(open(tmpfile, 'w')):
        train(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss", "json_log": log_files[1]}, verbose=False)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 0)
    with LogStdout(open(tmpfile, 'w')):
        train(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss", "json_log": log_files[2]}, verbose=True)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 10)

    canonical_files = []

    for log_file in log_files:
        canonical_files.append(local_canonical_file(remove_time_from_json(log_file)))
    return canonical_files
Example #26
print('All Iterations')
display(gs_summary)
print('Best parameters: ')
best_cv = gs_results.loc[gs_results['result'].idxmax()]
display(best_cv)

profile.End()
print('Time elapsed: %s mins' % str(profile.ElapsedMinutes))

# Save CV process
gs_summary.to_csv('../AllData_v2_CATBOOST_GS.csv')

# Generate model by best iteration
model = catb.train(params=params,
                   pool=cat_train,
                   num_boost_round=best_cv[1],
                   logging_level="Verbose")

# Save model for possible coded ensemble
model.save_model('../AllData_v2_CATBOOST_Model')

# Generate train prediction for future ensemble
train_preds = model.predict(train_X, prediction_type="Probability")
data = pd.read_csv('../input/application_train.csv')

data['preds'] = train_preds[:, 1]
data = data[['SK_ID_CURR', 'preds']]
data.to_csv('../AllData_v2_CATBOOST_TrainPreds.csv', index=False)

# Generate sub prediction for Kaggle
sub_preds = model.predict(test_X, prediction_type="Probability")
Example #27
def blending_test(train_x, train_y, val_x, val_y, test_x, weights=(0.38, 0.18, 0.14, 0.1, 0.20)):
    import random
    import lightgbm as lgb
    import xgboost as xgb
    import catboost as cat
    from sklearn.ensemble import RandomForestClassifier as rf
    from sklearn.linear_model import LogisticRegression as lr
    import gc
    res = 0
    lgb_params = {
            # "max_bin": 512,
            "learning_rate": 0.01,
            "boosting_type": "gbdt",
            "objective": "binary",
            "metric": "auc",
            "num_leaves": 31,
            "max_depth": -1,
            "verbose": 200,
            "subsample": 0.8,
            "colsample_bytree": 0.9,
            "subsample_freq": 1,
            "reg_alpha": 0,
            "min_child_weight": 25,
            "random_state": random.randint(1,1000),
            "reg_lambda": 1,
            "n_jobs": -1,
        }
    d_train = lgb.Dataset(train_x, label=train_y)
    d_test = lgb.Dataset(val_x, label=val_y)
    clf_lgb = lgb.train(lgb_params, d_train, 3000, valid_sets=[d_train, d_test], early_stopping_rounds=100,
                        verbose_eval=200)
    # temp_score_val = clf_lgb.best_score["valid_1"]["auc"]
    # temp_score_train = clf_lgb.best_score["training"]["auc"]
    # weight = 2*(temp_score_train*temp_score_val)/(temp_score_train+temp_score_val)
    temp_predict_lgb = clf_lgb.predict(test_x, num_iteration=clf_lgb.best_iteration)
    res+=temp_predict_lgb*weights[0]
    # weight = (temp_score_train + 2 * temp_score_val)/3
    # res_temp_lgb = temp_predict_lgb * temp_predict_lgb * weight
    del d_test,d_train,clf_lgb,lgb_params

    cat_params = {
        "verbose": 200,
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "iterations": 1000,
        "random_seed": random.randint(1,1000),
        "learning_rate": 0.03,
        "depth": 6,
        "thread_count": 16,
        "use_best_model": True,
        "od_type": 'Iter',
        "od_wait": 30,
    }
    # cat.train needs a Pool for training, not a raw (X, y) tuple.
    clf_cat = cat.train(pool=cat.Pool(train_x, label=train_y),
                        params=cat_params,
                        eval_set=[(train_x, train_y), (val_x, val_y)],
                        verbose_eval=200)
    # temp_score_val = clf_cat.get_test_eval()[1]
    # temp_score_train = clf_cat.get_test_eval()[0]
    temp_predict_cat = clf_cat.predict(test_x,prediction_type="Probability")[:,1]
    res+=temp_predict_cat*weights[1]
    # weight = (temp_score_train + 2 * temp_score_val)/3
    # res_temp_cat = temp_predict_cat * temp_predict_cat * weight
    del clf_cat,cat_params
    gc.collect()
    # xgb_params = {
    #     "objective": "binary:logistic",
    #     "eval_metric": "auc",
    #     "n_estimators": 3000,
    #     "booster":"gbtree",
    #     "learning_rate": 0.05,
    #     "max_depth": 6,
    #     "n_jobs": -1,
    #     "colsample_bytree": 0.9,
    #     "subsample": 0.8,
    #     "min_child_weight": 1,
    #     "reg_lambda": 1,
    # }
    # clf_xgb = xgb.XGBClassifier(**xgb_params)
    # clf_xgb.fit(X=train_x, y=train_y, eval_set=[(train_x,train_y),(val_x,val_y)],eval_metric="auc",verbose=False,early_stopping_rounds=100)
    # temp_score_val = clf_xgb.best_score["valid_1"]["auc"]
    # temp_score_train = clf_xgb.best_score["training"]["auc"]
    # # weight = 2*(temp_score_train*temp_score_val)/(temp_score_train+temp_score_val)
    # temp_predict_xgb = clf_xgb.predict(test_x,ntree_limit=clf_xgb.best_ntree_limit)
    # weight = (temp_score_train + 2 * temp_score_val)/3
    # res_temp_xgb = temp_predict_xgb * temp_predict_xgb * weight
    # del clf_xgb,temp_score_val,temp_score_train,temp_predict_xgb,xgb_params
    rf_ = rf(
        n_estimators=3000,
        criterion='gini',
        max_depth=6,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=0.6,
        min_impurity_decrease=0.00001,
        n_jobs=-1,
        verbose=200,
        random_state=random.randint(1,1000),
        warm_start=True,
    )
    clf_rf = stop_early_wrapper(rf_,train_x,train_y,val_x, val_y,n_min_iterations=100,scale=1.0001,eval_metric="auc")
    temp_predict_rf = clf_rf.predict_proba(test_x)[:,1]
    res+=temp_predict_rf*weights[2]
    # weight = clf_rf.best_score
    # res_temp_rf = temp_predict_rf * temp_predict_rf * weight
    del clf_rf,rf_,
    gc.collect()
    for f in test_x.columns:
        train_x[f] = (train_x[f] - train_x[f].min()) / (train_x[f].max() - train_x[f].min())
        val_x[f] = (val_x[f] - val_x[f].min()) / (val_x[f].max() - val_x[f].min())
        test_x[f] = (test_x[f] - test_x[f].min()) / (test_x[f].max() - test_x[f].min())
    clf_lr = lr(penalty='l2', dual=False, tol=0.00001, C=0.06,
                random_state=random.randint(1, 1000),
                solver='liblinear', max_iter=2000,
                verbose=200)
    temp_train = pd.concat([train_x,val_x],axis=0)
    temp_y = pd.concat([train_y,val_y],axis=0)
    clf_lr.fit(temp_train,temp_y)
    print("validation score of lr",clf_lr.score(val_x,val_y))
    # val_y_hat = clf_lr.predict_proba(test_x)[:, 1]
    # from sklearn.metrics import roc_auc_score
    # weight = roc_auc_score(val_y, val_y_hat)
    temp_predict_lr = clf_lr.predict_proba(test_x)[:, 1]  # clf_rf was deleted above
    res+=temp_predict_lr*weights[3]
    # res_temp_lr = temp_predict_lr * temp_predict_lr * weight
    del clf_lr,temp_train,temp_y
    gc.collect()

    from keras.callbacks import ModelCheckpoint
    from keras.callbacks import EarlyStopping
    clf_nn = KerasClassifier_wrapper(train_x.shape[1])
    model_path = "../input/keras_model.h5"
    callbacks = [
        EarlyStopping(
            monitor='val_auc',
            patience=20,
            mode='max',
            verbose=100),
        ModelCheckpoint(
            model_path,
            monitor='val_auc',
            save_best_only=True,
            mode='max',
            verbose=100)
    ]
    # fit estimator
    history = clf_nn.fit(
        train_x,
        train_y,
        epochs=1000,
        batch_size=1024,
        validation_data=(val_x, val_y),
        verbose=1,
        callbacks=callbacks,
        shuffle=True
    )
    print(history.history.keys())
    import matplotlib.pyplot as plt
    # summarize history for R^2
    fig_acc = plt.figure(figsize=(10, 10))
    plt.plot(history.history['auc'])
    plt.plot(history.history['val_auc'])
    plt.title('model auc')
    plt.ylabel('auc')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    fig_acc.savefig("model_auc.png")

    # summarize history for loss
    fig_loss = plt.figure(figsize=(10, 10))
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    fig_loss.savefig("model_loss.png")
    temp_predict_nn = clf_nn.predict_proba(test_x)[:,1]
    res+=temp_predict_nn*weights[4]
    del history,clf_nn,train_x,train_y,val_x,val_y,test_x
    gc.collect()
    return res
Example #28
    def fit(self,
            df,
            evals=None,
            early_stopping_rounds=None,
            verbose_eval=None,
            plot=False,
            progress=None,
            **kwargs):
        '''Fit the CatBoostModel model given a DataFrame.
        This method accepts all key word arguments for the catboost.train method.

        :param df: A vaex DataFrame containing the features and target on which to train the model.
        :param evals: A list of DataFrames to be evaluated during training.
            This allows user to watch performance on the validation sets.
        :param int early_stopping_rounds: Activates early stopping.
        :param bool verbose_eval: Requires at least one item in *evals*.
            If *verbose_eval* is True then the evaluation metric on the validation set is printed at each boosting stage.
        :param bool plot: if True, display an interactive widget in the Jupyter
            notebook of how the train and validation sets score on each boosting iteration.
        :param progress: If True display a progressbar when the training is done in batches.
        '''
        self.pool_params['feature_names'] = self.features
        if evals is not None:
            for i, item in enumerate(evals):
                data = item[self.features].values
                target_data = item[self.target].to_numpy()
                evals[i] = catboost.Pool(data=data,
                                         label=target_data,
                                         **self.pool_params)

        # This does the actual training/fitting of the catboost model
        if self.batch_size is None:
            data = df[self.features].values
            target_data = df[self.target].to_numpy()
            dtrain = catboost.Pool(data=data,
                                   label=target_data,
                                   **self.pool_params)
            model = catboost.train(params=self.params,
                                   dtrain=dtrain,
                                   num_boost_round=self.num_boost_round,
                                   evals=evals,
                                   early_stopping_rounds=early_stopping_rounds,
                                   verbose_eval=verbose_eval,
                                   plot=plot,
                                   **kwargs)
            self.booster = model
            self.evals_result_ = [model.evals_result_]
            self.feature_importances_ = list(model.feature_importances_)
        else:
            models = []

            # Set up progressbar
            n_samples = len(df)
            progressbar = vaex.utils.progressbars(progress)

            column_names = self.features + [self.target]
            iterator = df[column_names].to_pandas_df(
                chunk_size=self.batch_size)
            for i1, i2, chunk in iterator:
                progressbar(i1 / n_samples)
                data = chunk[self.features].values
                target_data = chunk[self.target].values
                dtrain = catboost.Pool(data=data,
                                       label=target_data,
                                       **self.pool_params)
                model = catboost.train(
                    params=self.params,
                    dtrain=dtrain,
                    num_boost_round=self.num_boost_round,
                    evals=evals,
                    early_stopping_rounds=early_stopping_rounds,
                    verbose_eval=verbose_eval,
                    plot=plot,
                    **kwargs)
                self.evals_result_.append(model.evals_result_)
                models.append(model)
            progressbar(1.0)

            # Weights are key when summing models
            if self.batch_weights is None or len(self.batch_weights) == 0:
                batch_weights = [1 / len(models)] * len(models)
            elif len(self.batch_weights) != len(models):
                raise ValueError(
                    "'batch_weights' must be the same length as the number of models."
                )
            else:
                batch_weights = self.batch_weights

            # Sum the models
            self.booster = catboost.sum_models(
                models,
                weights=batch_weights,
                ctr_merge_policy=self.ctr_merge_policy)
Example #29
    'max_depth': 10,
    'num_rounds': 200,
}  # score: 0.62228 @ 0.01, split = 0.9, random_state=7
# score: 0.62213 @ 0.01, split = 0.8, random_state=7
# score: 0.62113 @ 0.01, split = 0.7, random_state=7
model_xgb = xgb.train(params_xgb,
                      d_train_xgb,
                      105,
                      watchlist_xgb,
                      early_stopping_rounds=20,
                      maximize=True,
                      verbose_eval=10)

model_lgb = lgb.train(params_lgb, d_train_lgb, 100, valid_sets_lgb)

model_cat = cat.train(pool, params=params_cat, logging_level='Verbose')

p_train_xgb = model_xgb.predict(d_valid_xgb)
p_train_lgb = model_lgb.predict(X_test)
p_train_cat = model_cat.predict(X_test)

p_test_xgb = model_xgb.predict(d_test_xgb)
p_test_lgb = model_lgb.predict(real_test_data)
p_test_cat = model_cat.predict(real_test_data)

final_train = p_train_xgb.reshape(-1, 1)
final_test = p_test_xgb.reshape(-1, 1)
final_train = np.concatenate((final_train, p_train_lgb.reshape(-1, 1)), axis=1)
final_test = np.concatenate((final_test, p_test_lgb.reshape(-1, 1)), axis=1)
final_train = np.concatenate((final_train, p_train_cat.reshape(-1, 1)), axis=1)
final_test = np.concatenate((final_test, p_test_cat.reshape(-1, 1)), axis=1)
Example #30
def main(options):
    print("load train test datasets")
    train_all, y_train_all, id_train, test, id_test = pre_train()

    cat_params = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'learning_rate': options.learning_rate,
        'l2_leaf_reg': options.l2_leaf_reg,  # L2 regularization coefficient.
        'subsample': options.subsample,
        'depth': options.depth,  # Depth of the tree
        'border_count': 255,  # The number of splits for numerical features
        'thread_count': 6,
        'train_dir': 'catboost_train_logs',
        'bootstrap_type': 'Bernoulli',
        'use_best_model': True,
        'random_seed': options.seed
    }

    roof_flod = options.roof_flod
    kf = StratifiedKFold(n_splits=roof_flod,
                         shuffle=True,
                         random_state=options.seed)

    pred_train_full = np.zeros(train_all.shape[0])
    pred_test_full = 0
    cv_scores = []

    predict_feature = 'catboost_predict_roof_fold{}_lr{}_l2_leaf_reg{}_subsample{}_depth{}_seed{}'.format(
        options.roof_flod, options.learning_rate, options.l2_leaf_reg,
        options.subsample, options.depth, options.seed)

    print('params info:', predict_feature)

    for i, (dev_index,
            val_index) in enumerate(kf.split(train_all, y_train_all)):
        print(
            '========== perform fold {}, train size: {}, validate size: {} =========='
            .format(i, len(dev_index), len(val_index)))
        # .ix is gone from pandas; kf.split yields positional indices, so use .iloc.
        train_x, val_x = train_all.iloc[dev_index], train_all.iloc[val_index]
        train_y, val_y = y_train_all[dev_index], y_train_all[val_index]

        model = cat.train(pool=Pool(train_x, train_y),
                          params=cat_params,
                          iterations=460,
                          eval_set=(val_x, val_y),
                          verbose=False)

        # predict validate
        predict_valid = model.predict(val_x.values,
                                      prediction_type='Probability')[:, 1]
        valid_auc = evaluate_score(predict_valid, val_y)
        # predict test
        predict_test = model.predict(test.values,
                                     prediction_type='Probability')[:, 1]

        print('valid_auc = {}'.format(valid_auc))
        cv_scores.append(valid_auc)

        # run-out-of-fold predict
        pred_train_full[val_index] = predict_valid
        pred_test_full += predict_test

    mean_cv_scores = np.mean(cv_scores)
    print('Mean cv auc:', mean_cv_scores)

    print("saving train predictions for ensemble")
    train_pred_df = pd.DataFrame({'userid': id_train})
    train_pred_df[predict_feature] = pred_train_full
    train_pred_df.to_csv(
        "./ensemble/train/catboost/hl_cat_roof{}_predict_train_cv{}_{}.csv".
        format(roof_flod, mean_cv_scores, predict_feature),
        index=False,
        columns=['userid', predict_feature])

    print("saving test predictions for ensemble")
    pred_test_full = pred_test_full / float(roof_flod)
    test_pred_df = pd.DataFrame({'userid': id_test})
    test_pred_df[predict_feature] = pred_test_full
    test_pred_df.to_csv(
        "./ensemble/test/catboost/hl_cat_roof{}_predict_test_cv{}_{}.csv".
        format(roof_flod, mean_cv_scores, predict_feature),
        index=False,
        columns=['userid', predict_feature])
Example #31
def get_feature_importance(data, target, clf='lightgbm', shuffle=False):
    '''
    Parameters
    ----------
    data: input dataset, type of dataframe
    
    target: input target dataset, type of series
    
    clf: the name of model want to use, type of string
    
    shuffle: whether to shuffle target dataset (for getting null importance)
    
    Return
    ------
    importance_df: importance of each features, type of dataframe, shape(n_feature, n_importance)
    '''
    
    # feature list
    train_features = [feature for feature in data.columns.values if feature not in ['target', 'card_id', 'first_active_month']]
    categorical_features = [feature for feature in train_features if 'feature_' in feature]
    
    # shuffle the data
    y = target.copy().sample(frac=1.0) if shuffle else target.copy()
    
    # using lightgbm
    if clf == 'lightgbm':
        import lightgbm as lgb
        import pandas as pd
    
        # construct training data
        train_data = lgb.Dataset(data=data[train_features],
                                 label=y,
                                 free_raw_data=False)
    
        # model hyperparameters
        lgb_params = {
            'num_leaves': 129,
            'min_data_in_leaf': 148,
            'objective': 'regression',
            'max_depth': 9,
            'learning_rate': 0.005,
            'min_child_samples': 24,
            'boosting': 'gbdt',
            'feature_fraction': 0.7202,
            'bagging_freq': 1,
            'bagging_fraction': 0.8125,
            'bagging_seed': 11,
            'metric': 'rmse',
            'lambda_l1': 0.3468,
            'random_state': 133,
            'verbosity': -1
        }
    
        # training the model
        clf_lgb = lgb.train(params=lgb_params,
                            train_set=train_data,
                            num_boost_round=850)
        
        # calculate importance
        importance_df = pd.DataFrame()
        importance_df['feature'] = list(train_features)
        importance_df['importance_gain'] = \
            clf_lgb.feature_importance(importance_type='gain')
        importance_df['importance_split'] = \
            clf_lgb.feature_importance(importance_type='split')

        return importance_df
    
    if clf == 'catboost':
        from catboost import train, Pool, EFstrType
        import pandas as pd
        
        # construct training data
        train_data = Pool(data=data[train_features],
                          label=y)
        
        # model hyperparameters
        cat_params = {
            'loss_function': 'RMSE',
            'learning_rate': 0.02,
            'early_stopping_rounds': 400,
            'border_count': 254,
            'task_type': 'GPU',
            'one_hot_max_size': 6,
            'depth': 11,
            'l2_leaf_reg': 1.0,
            'random_strength': 1.9574,
            'bagging_temperature': 20.9049
        }
        
        # training the model
        clf_cat = train(pool=train_data,
                        params=cat_params,
                        verbose=False,
                        iterations=1000)
        
        # calculate feature importance
        importance_df = pd.DataFrame()
        importance_df['feature'] = list(train_features)
        importance_df['PredictionValuesChange'] = \
            clf_cat.get_feature_importance(type='PredictionValuesChange')
        
        return importance_df
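
A hedged sketch of how shuffle=True is typically combined with the actual importances for a null-importance test; the driver below and its names are ours, not part of the original code:

import pandas as pd

actual_imp = get_feature_importance(data, target, clf='lightgbm', shuffle=False)
# Build a null distribution by refitting on shuffled targets a few times.
null_imp = pd.concat(
    [get_feature_importance(data, target, clf='lightgbm', shuffle=True)
     for _ in range(20)],
    ignore_index=True)
# Features whose actual gain sits below most of their null gains are suspect.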