Example #1
    def fit(self,
            train_df,
            valid_df=None,
            train_dir='.',
            niter=10000,
            seed=123):
        if self.model is None:
            params = {
                'loss_function': 'RMSE',
                'task_type': 'GPU',
                'iterations': niter,
                'verbose': True,
                'train_dir': train_dir,
                'random_seed': seed
            }
            self.model = catboost.CatBoost(params)
            init_model = None
        else:
            init_model = self.model
        train_features, train_labels = get_feature_label(train_df)
        train_pool = catboost.Pool(data=train_features, label=train_labels)
        if valid_df is not None:
            valid_features, valid_labels = get_feature_label(valid_df)
            dev_pool = catboost.Pool(data=valid_features, label=valid_labels)
        else:
            dev_pool = None
        self.model.fit(train_pool, eval_set=dev_pool, init_model=init_model)
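Note: because init_model is passed, every fit() call after the first continues boosting from the existing model instead of starting over. The get_feature_label helper is not shown in this example; a minimal sketch of what it might look like, assuming the label sits in the last DataFrame column:

import pandas as pd

# Hypothetical helper (not part of the original example): split a DataFrame
# into a feature matrix and a label vector, assuming the label is last.
def get_feature_label(df: pd.DataFrame):
    return df.iloc[:, :-1], df.iloc[:, -1]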
Example #2
def cat_cv(train, test, params, fit_params, cat_features, feature_names, nfold,
           seed):
    train_pred = pd.DataFrame({
        'id': train['样本id'],
        'true': train['收率'],
        'pred': np.zeros(len(train))
    })
    test_pred = pd.DataFrame({'id': test['样本id'], 'pred': np.zeros(len(test))})
    kfolder = KFold(n_splits=nfold, shuffle=True, random_state=seed)
    cat_tst = cat.Pool(data=test[feature_names],
                       cat_features=cat_features,
                       feature_names=feature_names)
    for fold_id, (trn_idx, val_idx) in enumerate(kfolder.split(train['收率'])):
        print(f'\nFold_{fold_id} Training ================================\n')
        cat_trn = cat.Pool(data=train.iloc[trn_idx][feature_names],
                           label=train.iloc[trn_idx]['收率'],
                           cat_features=cat_features,
                           feature_names=feature_names)
        cat_val = cat.Pool(data=train.iloc[val_idx][feature_names],
                           label=train.iloc[val_idx]['收率'],
                           cat_features=cat_features,
                           feature_names=feature_names)
        # cat.train returns the fitted model; it must be captured to predict.
        model = cat.train(params=params, pool=cat_trn, **fit_params,
                          eval_set=cat_val)
        val_pred = model.predict(cat_val)
        train_pred.loc[val_idx, 'pred'] = val_pred
        print(f'Fold_{fold_id}', mse(train.iloc[val_idx]['收率'], val_pred))
        test_pred['pred'] += model.predict(cat_tst) / nfold
    print('\n\nCV LOSS:', mse(train_pred['true'], train_pred['pred']))
    return test_pred
Example #3
def train_and_predict_catboost():
    (train_X, train_y), (test_X, test_y) = get_data(scaler='none',
                                                    one_hot=False,
                                                    convt_cat=False)

    train_y = train_y.flatten()
    test_y = test_y.flatten()

    cat_features = [1, 2, 4, 5, 6]
    train_pool = catboost.Pool(data=train_X,
                               label=train_y,
                               cat_features=cat_features)
    test_pool = catboost.Pool(data=test_X,
                              label=test_y,
                              cat_features=cat_features)

    model = catboost.CatBoostClassifier(loss_function='MultiClass',
                                        depth=None,
                                        random_seed=42,
                                        cat_features=cat_features,
                                        silent=False)
    model.fit(train_pool)
    y_pred = model.predict(test_pool)
    y_true = test_y

    print(y_pred.shape)
    print(np.unique(y_pred))
    helpers.evaluate_clf(y_true, y_pred)

    # Predict on the Pools so the categorical feature declaration is reused.
    pred_for_train = model.predict(train_pool)
    pred_for_test = model.predict(test_pool)
    return pred_for_train, pred_for_test
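Declaring the categorical features on the Pool is sufficient; the estimator picks them up from there. A minimal, self-contained sketch on synthetic data (all data below is illustrative):

import numpy as np
import pandas as pd
import catboost

rng = np.random.default_rng(0)
X = pd.DataFrame({
    'num': rng.normal(size=100),
    'color': rng.choice(['red', 'green', 'blue'], size=100),
})
y = rng.integers(0, 2, size=100)

# cat_features is declared once, on the Pool.
pool = catboost.Pool(X, label=y, cat_features=['color'])
clf = catboost.CatBoostClassifier(iterations=10, verbose=False)
clf.fit(pool)
print(clf.predict(pool)[:5])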
Example #4
    def fit(self, data, clf=None):
        setseed(self.seed)
        train_df = data[data['visitId'].apply(lambda x: x.date()) >=
                        datetime.date(2016, 9, 30)]
        val_df = data[data['visitId'].apply(lambda x: x.date()) <
                      datetime.date(2016, 9, 30)]
        # Classification targets validRevenue; regression targets raw revenue.
        target = 'validRevenue' if clf else 'totals_transactionRevenue'
        train_X = train_df[self.features_name].values
        train_y = train_df[target].values
        val_X = val_df[self.features_name].values
        val_y = val_df[target].values

        cat_train = cat.Pool(data=train_X,
                             label=train_y,
                             feature_names=self.features_name,
                             cat_features=self.categorical_feature)

        cat_eval = cat.Pool(data=val_X,
                            label=val_y,
                            feature_names=self.features_name,
                            cat_features=self.categorical_feature)

        model_param = self.params['params_clf'] if clf else self.params['params_reg']

        self.estimator = cat.train(params=model_param,
                                   pool=cat_train,
                                   eval_set=cat_eval)

        return self
Example #5
    def __init__(self, data, task, metric, use_gpu):
        Learner.__init__(self)

        params = {
            'devices': [0],
            'logging_level': 'Verbose',
            'use_best_model': False,
            'bootstrap_type': 'Bernoulli',
            'random_seed': RANDOM_SEED
        }

        if use_gpu:
            params['task_type'] = 'GPU'

        if task == 'regression':
            params['loss_function'] = 'RMSE'
        elif task == 'binclass':
            params['loss_function'] = 'Logloss'
        elif task == 'multiclass':
            params['loss_function'] = 'MultiClass'

        if metric == 'Accuracy':
            params['custom_metric'] = 'Accuracy'

        self.train = cat.Pool(data.X_train, data.y_train)
        self.test = cat.Pool(data.X_test, data.y_test)

        self.default_params = params
Example #6
def train_model(
    df_train,
    df_valid,
    model_params,
    general_params,
):
    train_pool = catboost.Pool(df_train["text"].values,
                               label=df_train["label"].values,
                               text_features=[0])
    valid_pool = catboost.Pool(df_valid["text"].values,
                               label=df_valid["label"].values,
                               text_features=[0])

    model_params = copy.deepcopy(model_params)
    model_params.update({"train_dir": general_params["logdir"]})

    model = catboost.train(
        pool=train_pool,
        eval_set=valid_pool,
        params=model_params,
        verbose=False,
        plot=False,
    )

    model.save_model(os.path.join(general_params["logdir"], "model.cbm"))
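The text_features argument routes raw strings through CatBoost's built-in text processing (supported in recent CatBoost versions). A minimal, self-contained sketch with toy data:

import catboost

texts = ['good movie', 'bad movie', 'great film', 'terrible film'] * 10
labels = [1, 0, 1, 0] * 10

# Each row holds a single text column; text_features=[0] marks it as text.
train_pool = catboost.Pool(data=[[t] for t in texts],
                           label=labels,
                           text_features=[0])
model = catboost.train(pool=train_pool,
                       params={'loss_function': 'Logloss',
                               'iterations': 10,
                               'verbose': False})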
Example #7
    def _train(self, params=None, predict=False):
        oof_cat = np.zeros(len(self.train_X))
        prediction = np.zeros(len(self.test))
        feature_importance_df = pd.DataFrame()
        if self.split_method == 'KFold':
            kfold = KFold(n_splits=self.n_splits, shuffle=self.shuffle,
                          random_state=self.random_state)
            iterator = enumerate(kfold.split(self.train_X))
        elif self.split_method == 'StratifiedKFold':
            kfold = StratifiedKFold(n_splits=self.n_splits, shuffle=self.shuffle,
                                    random_state=self.random_state)
            iterator = enumerate(kfold.split(self.train_X, self.train_Y.values))

        for fold_, (train_index, val_index) in iterator:
            print('cat fold_{}'.format(fold_ + 1))

            model_cat = CatBoostRegressor(**params)
            if self.contain_cate:
                # Indices must be positions within train_features (the columns
                # actually fed to the Pool), not within the full train_X frame.
                cate_feature_indices = [
                    list(self.train_features).index(col)
                    for col in self.cate_features
                ]
                trn_data = cb.Pool(self.train_X.iloc[train_index][self.train_features],
                                   self.train_Y.iloc[train_index],
                                   cat_features=cate_feature_indices)
                val_data = cb.Pool(self.train_X.iloc[val_index][self.train_features],
                                   self.train_Y.iloc[val_index],
                                   cat_features=cate_feature_indices)
            else:
                trn_data = cb.Pool(self.train_X.iloc[train_index][self.train_features],
                                   self.train_Y.iloc[train_index])
                val_data = cb.Pool(self.train_X.iloc[val_index][self.train_features],
                                   self.train_Y.iloc[val_index])

            model_cat.fit(trn_data, verbose_eval=400, eval_set=val_data)

            # Predict on the validation Pool so categorical features stay declared.
            oof_cat[val_index] = model_cat.predict(val_data)
            # Only when predict is True: compute the test-set prediction.
            if predict:
                prediction += model_cat.predict(self.test[self.train_features]) / kfold.n_splits
        print('CV score: {:<8.5f}'.format(mean_squared_error(self.train_Y, oof_cat) ** 0.5))

        return mean_squared_error(self.train_Y, oof_cat) ** 0.5, oof_cat, prediction
Example #8
def Catboost_train(config, data, param_0, fold_n):
    num_rounds = param_0['n_estimators']
    X_train, y_train = data.X_train, data.y_train
    X_valid, y_valid = data.X_valid, data.y_valid
    X_test, y_test = data.X_test, data.y_test
    params = {
        'logging_level': 'Info',
        'random_seed': 42,
        'n_estimators': num_rounds,
    }
    if data.problem() == "classification":
        # Accuracy is only meaningful for classification, so set it here
        # rather than unconditionally.
        params['custom_metric'] = 'Accuracy'
        if data.nClasses == 2:
            params['loss_function'] = 'Logloss'
        else:
            params['loss_function'] = 'MultiClass'
        model = cat.CatBoostClassifier(**params)
    else:
        params['loss_function'] = 'RMSE'
        model = cat.CatBoostRegressor(**params)
    train_pool = cat.Pool(X_train, y_train)
    valid_pool = cat.Pool(X_valid, y_valid)
    model.fit(train_pool, eval_set=valid_pool)

    return model, None
Example #9
def cgb_cv(df, features, categorical_features, n_folds, param):
    kf = GroupKFold(n_splits=n_folds)
    group_map = dict(zip(np.arange(1, 13),
                         pd.cut(np.arange(1, 13), n_folds, labels=np.arange(n_folds))))
    group = df.timestamp.dt.month.map(group_map)

    models = []
    train_scores = []
    valid_scores = []

    for train_index, val_index in kf.split(df, df['building_id'], groups=group):
        train_X, train_y = df[features].iloc[train_index], df['meter_reading'].iloc[train_index]
        val_X, val_y = df[features].iloc[val_index], df['meter_reading'].iloc[val_index]

        cgb_train = cgb.Pool(train_X, train_y, cat_features=categorical_features)
        cgb_eval = cgb.Pool(val_X, val_y, cat_features=categorical_features)
        gbm = cgb.train(cgb_train, param, eval_set=cgb_eval, verbose=20)

        train_preds = gbm.predict(train_X)
        if use_log1p_target:
            train_preds = np.expm1(train_preds)
            train_y = np.expm1(train_y)
        train_scores.append(rmsle(train_y, train_preds))

        valid_preds = gbm.predict(val_X)
        if use_log1p_target:
            valid_preds = np.expm1(valid_preds)
            val_y = np.expm1(val_y)
        valid_scores.append(rmsle(val_y, valid_preds))

        models.append(gbm)
    return train_scores, valid_scores, models
Example #10
    def __init__(self, data, use_gpu):
        Learner.__init__(self)

        params = {
            'devices': [0],
            'logging_level': 'Info',
            'use_best_model': False,
            'bootstrap_type': 'Bernoulli'
        }

        if use_gpu:
            params['task_type'] = 'GPU'

        if data.task == 'Regression':
            params['loss_function'] = 'RMSE'
        elif data.task == 'Classification':
            params['loss_function'] = 'Logloss'
        elif data.task == 'Multiclass':
            params['loss_function'] = 'MultiClass'

        if data.metric == 'Accuracy':
            params['custom_metric'] = 'Accuracy'

        self.train = cat.Pool(data.X_train, data.y_train)
        self.test = cat.Pool(data.X_test, data.y_test)

        self.default_params = params
Example #11
    def dataset(self,
                X,
                y,
                categorical_columns_indices=None,
                test_size=0.2,
                *args,
                **kwargs):

        self.categorical_columns_indices = categorical_columns_indices
        self.X = X
        self.columns = list(X)

        self.y, self.cat_replace = self.replace_multiclass(y)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=test_size,
            random_state=self.random_state)

        self.train_data = catboost.Pool(
            data=self.X_train.values,
            label=self.y_train.values,
            cat_features=self.categorical_columns_indices)
        self.eval_data = catboost.Pool(
            data=self.X_test.values,
            label=self.y_test.values,
            cat_features=self.categorical_columns_indices)
        self.all_train_data = catboost.Pool(
            data=self.X.values,
            label=self.y.values,
            cat_features=self.categorical_columns_indices)
Example #12
def stack_catboost():
    """Стекинг catboost."""
    x_train = add_stacking_feat(load_oof())
    _, y_train = processing.train_set()
    x_test = add_stacking_feat(load_sub())
    x_test.columns = x_train.columns

    x_train.drop(DROP, axis=1, inplace=True)
    x_test.drop(DROP, axis=1, inplace=True)

    pool_test = catboost.Pool(data=x_test,
                              label=None,
                              cat_features=None,
                              weight=None)
    y_oof = pd.Series(0, index=x_train.index, name="oof_y")
    y_pred = pd.Series(0, index=x_test.index, name="time_to_failure")
    trees = []
    scores = []
    feat_importance = 0

    for index_train, index_valid in K_FOLDS.split(x_train):
        pool_train = catboost.Pool(data=x_train.iloc[index_train],
                                   label=y_train.iloc[index_train],
                                   cat_features=None,
                                   weight=None)
        pool_valid = catboost.Pool(data=x_train.iloc[index_valid],
                                   label=y_train.iloc[index_valid],
                                   cat_features=None,
                                   weight=None)
        clf = catboost.CatBoostRegressor(**CLF_PARAMS)
        clf.fit(
            X=pool_train,
            eval_set=[pool_valid],
        )
        trees.append(clf.tree_count_)
        scores.append(clf.best_score_['validation_0']['MAE'])
        y_oof.iloc[index_valid] = clf.predict(pool_valid)
        y_pred += clf.predict(pool_test) / K_FOLDS.get_n_splits()
        feat_importance += pd.DataFrame(
            clf.get_feature_importance(prettified=True),
            columns=["name", "value"]).set_index("name") / K_FOLDS.get_n_splits()

    LOGGER.info(f"Количество деревьев: {sorted(trees)}")
    LOGGER.info(
        f"Среднее количество деревьев: {np.mean(trees):.0f} +/- {np.std(trees):.0f}"
    )
    LOGGER.info(f"MAE на кроссвалидации: " + str(np.round(sorted(scores), 5)))
    LOGGER.info(
        f"MAE среднее: {np.mean(scores):0.3f} +/- {np.std(scores):0.3f}")

    stamp = (
        f"{time.strftime('%Y-%m-%d_%H-%M')}_"
        f"{np.mean(scores):0.3f}_"
        f"{np.mean(scores) + np.std(scores) * 2 / len(scores) ** 0.5:0.3f}_stk"
    )
    y_oof.to_csv(conf.DATA_PROCESSED + f"oof_{stamp}.csv", header=True)
    y_pred.to_csv(conf.DATA_PROCESSED + f"sub_{stamp}.csv", header=True)
    print(feat_importance.sort_values("value", ascending=False))
Example #13
    def train(self, X_train, X_valid, y_train, y_valid):
        """ ccc """

        # Convert data to CatBoost Pool format.
        ds_train = ctb.Pool(X_train, y_train)
        ds_valid = ctb.Pool(X_valid, y_valid)

        # Set context dependent CatBoost parameters.
        self.params['dtrain'] = ds_train
        self.params['eval_set'] = ds_valid

        # Train using parameters sent by the user.
        return ctb.train(**self.params)
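For reference, here is the equivalent direct call on synthetic data; in catboost.train, dtrain is an alias for pool:

import numpy as np
import catboost as ctb

rng = np.random.default_rng(0)
X_train, X_valid = rng.normal(size=(80, 4)), rng.normal(size=(20, 4))
y_train, y_valid = rng.integers(0, 2, size=80), rng.integers(0, 2, size=20)

model = ctb.train(
    dtrain=ctb.Pool(X_train, y_train),
    eval_set=ctb.Pool(X_valid, y_valid),
    params={'loss_function': 'Logloss', 'iterations': 50, 'verbose': False},
)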
Example #14
def eval_model(X, y, model):
    # confusion matrix
    test_pool = cb.Pool(X, cat_features=np.where(X.dtypes == object)[0])
    predictions = model.predict_proba(test_pool)
    predictions = [1 if ele[1] > 0.5 else 0 for ele in predictions]
    print(sklearn.metrics.classification_report(y, predictions))
    # auc
    plt.style.use('ggplot')
    dataset = cb.Pool(X, y, cat_features=np.where(X.dtypes == object)[0])
    fpr, tpr, _ = cbu.get_roc_curve(model, dataset, plot=True)
    auc = sklearn.metrics.auc(fpr, tpr)
    print('auc: ', auc)
    return auc
Example #15
def regression_catboost(train, validate):
    cat_features = ["UserId"]
    # cat_features=[]
    p_train, p_validate = np.log(train["Day30"] / 4 +
                                 1), np.log(validate["Day30"] / 4 + 1)
    # p_train,p_validate=train["Day30"],validate["Day30"]
    train_data = catboost.Pool(train.iloc[:, 1:-31],
                               p_train,
                               cat_features=cat_features)
    validate_data = catboost.Pool(validate.iloc[:, 1:-31],
                                  p_validate,
                                  cat_features=cat_features)

    model = catboost.CatBoostRegressor(iterations=35000,
                                       learning_rate=0.003,
                                       depth=6,
                                       objective="MAPE",
                                       eval_metric="MAPE",
                                       custom_metric=["RMSE", "MAE", "MAPE"],
                                       l2_leaf_reg=3.0,
                                       min_data_in_leaf=1,
                                       boosting_type="Plain",
                                       use_best_model=True,
                                       thread_count=-1,
                                       task_type="GPU",
                                       devices="0",
                                       random_state=random_seed,
                                       verbose=300,
                                       early_stopping_rounds=1000,
                                       fold_permutation_block=1,
                                       bagging_temperature=0)
    model.fit(train_data, eval_set=validate_data, plot=False)

    preds_p_validate = model.predict(validate_data)
    preds_day30 = (np.exp(preds_p_validate) - 1) * 4

    spearman_corr, _ = spearmanr(validate["Day30"], preds_day30)
    print(f"Spearman correlation: {spearman_corr:.4f}")

    df_important = pd.DataFrame({
        "feature_name": model.feature_names_,
        "importance": model.feature_importances_
    })
    df_important = df_important.sort_values(by=["importance"], ascending=False)
    print(df_important)
    df_predict_day30 = pd.DataFrame({
        "FlickrId": validate["FlickrId"],
        "Day30": validate["Day30"],
        "preds_day30": preds_day30
    })
    return model, df_predict_day30
Example #16
def main(readcsv=pd_read_csv, method='defaultDense'):
    # Path to data
    train_file = "./data/batch/df_classification_train.csv"
    test_file = "./data/batch/df_classification_test.csv"

    # Data reading
    X_train = readcsv(train_file, range(3), t=np.float32)
    y_train = readcsv(train_file, range(3, 4), t=np.float32)
    X_test = readcsv(test_file, range(3), t=np.float32)
    y_test = readcsv(test_file, range(3, 4), t=np.float32)

    # Datasets creation
    cb_train = cb.Pool(X_train, label=np.array(y_train))
    cb_test = cb.Pool(X_test, label=np.array(y_test))

    # Set training parameters
    params = {
        'reg_lambda': 1,
        'max_depth': 8,
        'num_leaves': 2**8,
        'verbose': 0,
        'objective': 'MultiClass',
        'learning_rate': 0.3,
        'n_estimators': 100,
        'classes_count': 5,
    }

    # Training
    cb_model = cb.CatBoost(params)
    cb_model.fit(cb_train)

    # Catboost prediction
    cb_prediction = cb_model.predict(cb_test, prediction_type='Class').T[0]
    cb_errors_count = np.count_nonzero(cb_prediction - np.ravel(y_test))

    # Conversion to daal4py
    daal_model = d4p.get_gbt_model_from_catboost(cb_model)

    # daal4py prediction
    daal_predict_algo = d4p.gbt_classification_prediction(
        nClasses=params['classes_count'],
        resultsToEvaluate="computeClassLabels",
        fptype='float')
    daal_prediction = daal_predict_algo.compute(X_test, daal_model)
    daal_errors_count = np.count_nonzero(daal_prediction.prediction - y_test)
    assert np.absolute(cb_errors_count - daal_errors_count) == 0

    return (cb_prediction, cb_errors_count,
            np.ravel(daal_prediction.prediction), daal_errors_count,
            np.ravel(y_test))
Example #17
def train_catboost(rank_model,
                   net_model,
                   emb,
                   train_df,
                   val_df,
                   model_file=None,
                   use_cache=False):

    if use_cache:
        X_train = np.load('datasets/X_train_boost.npy')
        X_val = np.load('datasets/X_val_boost.npy')
        y_train = np.load('datasets/y_train_boost.npy')
        y_val = np.load('datasets/y_val_boost.npy')
        print('Datasets are loaded')
    else:
        X_train, y_train = make_dataset(train_df, net_model, emb)
        X_val, y_val = make_dataset(val_df, net_model, emb)

        np.save('datasets/X_train_boost', X_train)
        np.save('datasets/X_val_boost', X_val)
        np.save('datasets/y_train_boost', y_train)
        np.save('datasets/y_val_boost', y_val)

        print('Datasets are saved')

    groups_train = train_df['0'].values % int(1e9 + 7)
    groups_val = val_df['0'].values % int(1e9 + 7)

    train_pool = catboost.Pool(X_train,
                               y_train * 0.5,
                               group_id=groups_train.reshape(-1),
                               weight=train_df['7'].values.reshape(-1))

    val_pool = catboost.Pool(X_val,
                             y_val * 0.5,
                             group_id=groups_val.reshape(-1))

    rank_model.fit(train_pool,
                   eval_set=val_pool,
                   plot=False,
                   logging_level='Verbose')

    if model_file is not None:
        with open(model_file, 'wb') as f:
            pickle.dump([rank_model], f, -1)

    y_score = rank_model.predict(val_pool)
    res = ranking.ndcg(y_val, y_score, groups_val)
    return res, rank_model
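The group_id argument tells CatBoost which rows belong to the same query, which ranking losses require. A minimal, self-contained sketch in the same spirit (synthetic data; rows sharing a group_id must be contiguous):

import numpy as np
import catboost

rng = np.random.default_rng(0)
n = 200
X = rng.normal(size=(n, 5))
y = rng.integers(0, 3, size=n).astype(float)  # graded relevance labels
groups = np.sort(rng.integers(0, 20, size=n))  # sorted, so query ids are contiguous

pool = catboost.Pool(X, label=y, group_id=groups)
ranker = catboost.CatBoost({'loss_function': 'YetiRank',
                            'iterations': 20,
                            'verbose': False})
ranker.fit(pool)
scores = ranker.predict(pool)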
Example #18
    def run_gridsearch(self, cv, cv_score: str) -> None:
        """
        Performs a grid search over the tuning hyperparameters. Determines the
        best hyperparameters based on the average validation performance
        calculated over cross-validation folds.

        :param cv: A cross-validation generator that determines the
            cross-validation strategy.
        :param cv_score: Measure to evaluate predictions on the validation set.
        """
        cats = self.X_tr.columns[self.X_tr.dtypes == "category"]
        cat_features = [
            list(self.X_tr).index(cats[i]) for i in range(len(cats))
        ]
        params = self.fixed_params.copy()

        best_AUC = 0.5
        for tune in ParameterGrid(self.tuning_params):
            params.update(tune)

            AUC_val = []
            for train, val in cv.split(self.X_tr, self.y_tr):
                X_train, y_train = self.X_tr.iloc[train], self.y_tr.iloc[train]
                X_val, y_val = self.X_tr.iloc[val], self.y_tr.iloc[val]
                train_pool = cat.Pool(X_train,
                                      y_train,
                                      cat_features=cat_features)
                validate_pool = cat.Pool(X_val,
                                         y_val,
                                         cat_features=cat_features)

                model = cat.CatBoostClassifier(**params)
                model.fit(train_pool,
                          eval_set=validate_pool,
                          logging_level="Silent")

                validation_AUC = calc_perf_score(
                    data=X_val,
                    labels=np.array(y_val.astype("float")),
                    model=model,
                    model_name=self.name,
                    score_name=cv_score,
                )
                AUC_val.append(validation_AUC)

            AUC_val = np.mean(AUC_val)

            if AUC_val > best_AUC:
                best_AUC = AUC_val
                self.best_tuning_params = tune
Example #19
def train_model(xtrain, xval, cat_fts, params):
    y_trn = xtrain['target'].values
    y_val = xval['target'].values
    del xtrain['target'], xval['target']

    categorical_ind = [k for k, v in enumerate(xtrain.columns) if v in cat_fts]

    # train model
    clf = cat.CatBoostClassifier(**params)
    clf.fit(xtrain.values,
            y_trn,
            cat_features=categorical_ind,
            eval_set=(xval.values, y_val),
            early_stopping_rounds=100,
            verbose=100,
            plot=False)
    print('Done!')
    print('Grab feature importance for both train and val')
    # get feature importance
    trn_imp = clf.get_feature_importance(
        data=cat.Pool(data=xtrain, cat_features=categorical_ind),
        prettified=True)
    val_imp = clf.get_feature_importance(
        data=cat.Pool(data=xval, cat_features=categorical_ind),
        prettified=True)
    plot_imp(trn_imp, 'train')
    plot_imp(val_imp, 'val')
    print('Done feature imp')

    # make prediction on validation set
    val_pred = clf.predict_proba(xval.values)[:, 1]
    logloss_i = log_loss(y_val, val_pred)
    # compute roc auc
    fpr, tpr, thresholds = roc_curve(y_val, val_pred, pos_label=1)
    auc_i = auc(fpr, tpr)
    # compute map
    map_i = average_precision_score(y_val, val_pred)
    print('logloss={0:.4f} | map={1:.4f} | auc={2:.4f}'.format(
        logloss_i, map_i, auc_i))

    # mrr
    print('reciprocal rank for validation set')
    xval['pred'] = val_pred
    xval['target'] = y_val
    val_rr = xval.groupby(level=0).apply(reciprocal_rank)
    mrr = (1 / val_rr[val_rr != 0]).mean()
    print(f'Mean reciprocal rank on validation set: {mrr:.4f}')
    return clf, categorical_ind, mrr
Example #20
def catboosttrainer(X,
                    y,
                    features,
                    initparam,
                    modelname,
                    modelpath,
                    docpath,
                    cvfold=5):
    print("searching for optimal iteration count...")
    trainpool = cat.Pool(X[features], y)
    cvresult = cat.cv(params=initparam,
                      fold_count=cvfold,
                      pool=trainpool,
                      stratified=True)
    initparam['iterations'] = (len(cvresult)) - (initparam['od_wait'] + 1)
    del initparam['od_wait']
    del initparam['od_type']
    print("optimal iteration count is ", initparam['iterations'])
    print("fitting model...")
    clf = cat.CatBoostClassifier(**initparam)
    clf.fit(trainpool)
    imp = clf.get_feature_importance(trainpool, fstr_type='FeatureImportance')
    dfimp = pd.DataFrame(imp, columns=['CatBoostImportance'])
    dfimp.insert(0, column='Feature', value=features)
    dfimp = dfimp.sort_values(['CatBoostImportance', 'Feature'],
                              ascending=False)
    xlsxpath = os.path.join(docpath, modelname + ".xlsx")
    dfimp.to_excel(xlsxpath)
    print("pickling model...")
    picklepath = os.path.join(modelpath, modelname)
    with open(picklepath, 'wb') as fout:
        pickle.dump(clf, fout)
    return cvresult, clf, initparam, dfimp
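The iteration count above works because cat.cv returns one row per completed boosting round, and the Iter overfitting detector keeps training od_wait rounds past the best one. A minimal sketch of the same pattern on synthetic data:

import numpy as np
import catboost as cat

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = rng.integers(0, 2, size=200)

cv_result = cat.cv(pool=cat.Pool(X, y),
                   params={'loss_function': 'Logloss',
                           'iterations': 100,
                           'od_type': 'Iter',
                           'od_wait': 20,
                           'verbose': False},
                   fold_count=3,
                   stratified=True)
# Rows run up to od_wait past the best round, hence the subtraction.
best_iterations = len(cv_result) - (20 + 1)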
Example #21
def classification_model(raw_data_file, metric_col, categorical_col,
                         target_col, test_perc, hyperopt_iterations,
                         const_params, use_predefined_params, k_fold,
                         tuning_metric):
    # preprocess data
    print('preprocess data:')
    data_obj = Preproc(raw_data_file, metric_col, categorical_col, target_col,
                       test_perc)

    # hyperparameter tuning, then train with the best params
    print('hyperparams tuning and model fitting:')
    model, params = train_best_model(data_obj.X_train, data_obj.y_train,
                                     const_params, hyperopt_iterations, k_fold,
                                     tuning_metric, use_predefined_params)
    print('best params are {}'.format(params), file=sys.stdout)

    # evaluate model
    auc = eval_model(data_obj.X_test, data_obj.y_test, model)

    # save model
    model.save_model(save_model_dir,
                     format="json",
                     pool=cb.Pool(data_obj.X_train,
                                  data_obj.y_train,
                                  cat_features=np.where(
                                      data_obj.X_train.dtypes == object)[0]))
Example #22
def train_best_model(X, y, const_params, max_evals=10, use_default=False):
    # convert pandas.DataFrame to catboost.Pool to avoid converting it on each
    # iteration of hyper-parameters optimization
    dataset = cb.Pool(X, y)

    if use_default:
        # pretrained optimal parameters
        best = {
            'depth': 3,
            'fold_len_multiplier': 41.1,
            'iterations': 50,
            'learning_rate': 0.1
        }
    else:
        best = find_best_hyper_params(dataset,
                                      const_params,
                                      max_evals=max_evals)

    # merge subset of hyper-parameters provided by hyperopt with hyper-parameters
    # provided by the user
    hyper_params = best.copy()
    hyper_params.update(const_params)

    # drop `use_best_model` because we are going to use entire dataset for
    # training of the final model
    hyper_params.pop('use_best_model', None)

    model = cb.CatBoostClassifier(**hyper_params)
    model.fit(dataset, verbose=False)

    return model, hyper_params
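find_best_hyper_params is not shown in this example. A hypothetical sketch of it using hyperopt (assuming const_params carries loss_function='Logloss', so the cv result exposes a 'test-Logloss-mean' column; the search space is illustrative):

from hyperopt import fmin, hp, tpe
import catboost as cb

def find_best_hyper_params(dataset, const_params, max_evals=10):
    # Minimize the best cross-validated Logloss over the search space.
    def objective(tuned_params):
        params = {**const_params, **tuned_params}
        scores = cb.cv(pool=dataset, params=params, fold_count=3, verbose=False)
        return scores['test-Logloss-mean'].min()

    space = {
        'depth': hp.uniformint('depth', 2, 8),
        'learning_rate': hp.loguniform('learning_rate', -5, 0),
    }
    return fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=max_evals)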
Example #23
    def predict(self, model, test_set):
        """ xxx """

        ds_test = ctb.Pool(test_set)

        # Make predictions using the best training round
        return model.predict(data=ds_test, prediction_type='Probability')[:, 1]
Example #24
def test_catboost_numerical_validation():
    ds = vaex.ml.datasets.load_iris()
    features = ['sepal_width', 'petal_length', 'sepal_length', 'petal_width']

    # Vanilla catboost
    dtrain = cb.Pool(ds[features].values, label=ds.data.class_)
    cb_bst = cb.train(params=params_multiclass,
                      dtrain=dtrain,
                      num_boost_round=3)
    cb_pred = cb_bst.predict(dtrain, prediction_type='Probability')

    # catboost through vaex
    booster = vaex.ml.catboost.CatBoostModel(features=features,
                                             params=params_multiclass,
                                             num_boost_round=3)
    booster.fit(ds, ds.class_)
    vaex_pred = booster.predict(ds)

    # Compare the predictions of catboost vs vaex.ml
    np.testing.assert_equal(
        vaex_pred,
        cb_pred,
        verbose=True,
        err_msg='The predictions of vaex.ml.catboost do not match those of pure catboost')
Example #25
def simple_on_dataframe():
    learn_set_path = tempfile.mkstemp(prefix='catboost_learn_set_')[1]
    cd_path = tempfile.mkstemp(prefix='catboost_cd_')[1]

    try:
        utils.object_list_to_tsv([(0.1, 0.2, 0.11, 0.12),
                                  (0.97, 0.82, 0.33, 1.1),
                                  (0.13, 0.22, 0.23, 2.1),
                                  (0.14, 0.18, 0.1, 0.0),
                                  (0.9, 0.67, 0.17, -1.0),
                                  (0.66, 0.1, 0.31, 0.62)], learn_set_path)
        with open(cd_path, 'w') as cd:
            cd.write('3\tTarget')

        model = utils.run_dist_train([
            '--iterations', '20', '--loss-function', 'RMSE', '--learn-set',
            learn_set_path, '--cd', cd_path
        ],
                                     model_class=cb.CatBoostRegressor)
        train_pool = cb.Pool(learn_set_path, column_description=cd_path)

        result = {'prediction': model.predict(train_pool).tolist()}

        out_path = os.path.join(OUTPUT_DIR,
                                'regression_simple_on_dataframe.json')
        with open(out_path, 'w') as fp:
            json.dump(result, fp, allow_nan=True, indent=2)

    finally:
        os.remove(learn_set_path)
        os.remove(cd_path)
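The cd (column description) file written above is CatBoost's tab-separated per-column type map: one line per typed column in the form <zero-based index><TAB><type>[<TAB><name>], with unlisted columns treated as numeric. For instance, a file marking column 0 as categorical (with a hypothetical name) and column 3 as the target would read:

0	Categ	user_id
3	Target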
Example #26
def main():
    if len(sys.argv) != 4:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\t python src/train.py data features model\n")
        sys.exit(1)

    with open(os.path.join(sys.argv[2], "cat_features.bin"), "rb") as fd:
        features = pickle.load(fd)
    df = pandas.read_csv(os.path.join(sys.argv[1], "train.csv"))
    data = catboost.Pool(df.drop(columns=["price_usd"]),
                         label=df.price_usd,
                         cat_features=features)

    default_params = {"n_estimators": 4500, "learning_rate": 0.1}
    with open("params.yaml") as fp:
        params = yaml.safe_load(fp)["train"]
    cb_params = params["regression"]
    cb_params.update(default_params)

    model = catboost.CatBoostRegressor(custom_metric=["R2", "RMSE"],
                                       task_type="GPU",
                                       random_seed=params["seed"],
                                       **cb_params)
    model.fit(data, verbose=True)

    model.save_model(sys.argv[3])
Example #27
    def predict(self, data):
        test_X = data[self.features_name]
        cat_test = cat.Pool(data=test_X,
                            feature_names=self.features_name,
                            cat_features=self.categorical_feature)
        prediction = self.estimator.predict(cat_test)
        return prediction
Example #28
def main():
    # The data was mined by data_miner.py in Data prepocessing/Scripts
    data = pd.read_csv("data.csv", sep=";", encoding="utf8")
    data = data.drop(["coors"], axis=1)
    classes = {}
    # Some classes need to be mined from cian.ru
    for _class in ["Конструктив и состояние", "Положительное соседство", "Отрицательное соседство",
                   "Квартиры и планировки", "Инфраструктура", "Безопасность", "Транспорт", "Экология", "price_per_m"]:
        if _class in data.columns:
            classes[_class] = data[_class]
            data = data.drop([_class], axis=1)
    # Train a separate model for each class
    for _class in classes:
        x_train, x_test_val, y_train, y_test_val = train_test_split(
            data, classes[_class], test_size=0.2, random_state=7)
        x_test, x_val, y_test, y_val = train_test_split(
            x_test_val, y_test_val, test_size=0.7)

        # Create a fresh regressor per target, and use per-class file names so
        # successive iterations do not overwrite each other's outputs.
        model = cb.CatBoostRegressor()
        model.fit(x_train, y_train,
                  use_best_model=True,
                  eval_set=cb.Pool(x_val, y_val),
                  logging_level="Verbose",  # 'Silent', 'Verbose', 'Info', 'Debug'
                  early_stopping_rounds=1,
                  save_snapshot=True,
                  snapshot_file=f"backup_{_class}.cbsnapshot",
                  snapshot_interval=300,
                  )
        print(model.score(x_test, y_test))
        model.save_model(f"trained_model_{_class}", format="cbm")
Example #29
    def get_train_set(self,
                      as_xgb_dmatrix=False,
                      as_lgb_dataset=False,
                      as_cgb_pool=False):
        df = self._get_cleaned_single_set(dataset="train")
        train_cols = df.columns.tolist()
        train_cols.remove("target")

        if self.drop_lowimp_features:
            print('Dropping low importance features !')
            dropcols = set(df.columns.tolist()).intersection(
                set(LOW_IMPORTANCE_FEATURES))
            df = df.drop(columns=list(dropcols))

        if as_xgb_dmatrix:
            return xgb.DMatrix(data=df[train_cols], label=df[["target"]])
        elif as_lgb_dataset:
            return lgb.Dataset(df[train_cols], df[["target"]].values.ravel())
        elif as_cgb_pool:
            with Timer('Creating Pool for Train set CatBoost'):
                df, catboost_features = self._generate_catboost_df(df)
                idx_cat_features = list(range(len(catboost_features)))
                pool = cgb.Pool(
                    df.drop(columns=["target"]),
                    df["target"],
                    idx_cat_features)
            return pool
        else:
            return df[train_cols], df[["target"]]
Example #30
def partial_dependence_curve(tickers: Tuple[str, ...], date: pd.Timestamp):
    """Plots partial dependence curves for the numerical features.

    :param tickers:
        Tickers for which the ML model is built.
    :param date:
        Date as of which the ML model is built.
    """
    params = config.ML_PARAMS
    cases = examples.Examples(tickers, date, params["data"])
    clf, train_pool_params = train_clf(cases, params)
    n_plots = len(train_pool_params["data"].columns) - len(
        cases.categorical_features())
    axs = axs_iter(n_plots)
    results = []
    for n, name in enumerate(train_pool_params["data"]):
        if n in cases.categorical_features():
            continue
        ax = next(axs)
        pool_params = copy.deepcopy(train_pool_params)
        quantiles = pool_params["data"].iloc[:, n].quantile(QUANTILE).values
        y = []
        for quantile in quantiles:
            pool_params["data"].iloc[:, n] = quantile
            predict_pool = catboost.Pool(**pool_params)
            raw_prediction = clf.predict(predict_pool)
            prediction = (raw_prediction * pool_params["data"].iloc[:, 0] *
                          YEAR_IN_TRADING_DAYS)
            y.append(prediction.values.mean())
        ax.set_title(f"{name}")
        ax.tick_params(labelleft=True)
        ax.plot(quantiles, y)
        results.append((quantiles, y))
    plt.show()
    return results