def fit(self, train_df, valid_df=None, train_dir='.', niter=10000, seed=123):
    """Train the wrapped CatBoost model on ``train_df``.

    On the first call a fresh GPU RMSE model is created; later calls
    continue boosting from the previously fitted model (warm start via
    ``init_model``). ``valid_df``, when given, is used as the eval set.
    """
    if self.model is None:
        init_model = None
        self.model = catboost.CatBoost({
            'loss_function': 'RMSE',
            'task_type': 'GPU',
            'iterations': niter,
            'verbose': True,
            'train_dir': train_dir,
            'random_seed': seed,
        })
    else:
        # Continue training on top of the existing trees.
        init_model = self.model

    features, labels = get_feature_label(train_df)
    train_pool = catboost.Pool(data=features, label=labels)

    dev_pool = None
    if valid_df is not None:
        dev_features, dev_labels = get_feature_label(valid_df)
        dev_pool = catboost.Pool(data=dev_features, label=dev_labels)

    self.model.fit(train_pool, eval_set=dev_pool, init_model=init_model)
def cat_cv(train, test, params, fit_params, cat_features, feature_names, nfold, seed):
    """K-fold cross-validated CatBoost training with OOF bookkeeping.

    Bug fixed: the model returned by ``cat.train()`` was discarded and the
    predict calls were issued against the ``cat`` module itself (which has
    no ``predict``); the fitted model is now captured and used.

    Attaches OOF predictions to ``train.Pred`` and fold-averaged test
    predictions to ``test.Pred``; returns ``test.Pred``.
    """
    train.Pred = pd.DataFrame({
        'id': train['样本id'],
        'true': train['收率'],
        'pred': np.zeros(len(train))
    })
    test.Pred = pd.DataFrame({'id': test['样本id'], 'pred': np.zeros(len(test))})

    kfolder = KFold(n_splits=nfold, shuffle=True, random_state=seed)
    cat_tst = cat.Pool(data=test[feature_names],
                       cat_features=cat_features,
                       feature_names=feature_names)

    for fold_id, (trn_idx, val_idx) in enumerate(kfolder.split(train['收率'])):
        print(f'\nFold_{fold_id} Training ================================\n')
        cat_trn = cat.Pool(data=train.iloc[trn_idx][feature_names],
                           label=train.iloc[trn_idx]['收率'],
                           cat_features=cat_features,
                           feature_names=feature_names)
        cat_val = cat.Pool(data=train.iloc[val_idx][feature_names],
                           label=train.iloc[val_idx]['收率'],
                           cat_features=cat_features,
                           feature_names=feature_names)
        # Capture the fitted model (previously dropped on the floor).
        model = cat.train(params=params, pool=cat_trn, **fit_params, eval_set=cat_val)

        val_pred = model.predict(train.iloc[val_idx][feature_names])
        train.Pred.loc[val_idx, 'pred'] = val_pred
        print(f'Fold_{fold_id}', mse(train.iloc[val_idx]['收率'], val_pred))

        # Average the test predictions over folds.
        test.Pred['pred'] += model.predict(cat_tst) / nfold

    print('\n\nCV LOSS:', mse(train.Pred['true'], train.Pred['pred']))
    return test.Pred
def train_and_predict_catboost():
    """Fit a multiclass CatBoost classifier and report test-set quality.

    Returns (train predictions, test predictions) for downstream use.
    """
    (train_X, train_y), (test_X, test_y) = get_data(scaler='none',
                                                    one_hot=False,
                                                    convt_cat=False)
    train_y, test_y = train_y.flatten(), test_y.flatten()

    cat_features = [1, 2, 4, 5, 6]
    train_pool = catboost.Pool(data=train_X, label=train_y, cat_features=cat_features)
    test_pool = catboost.Pool(data=test_X, label=test_y, cat_features=cat_features)

    model = catboost.CatBoostClassifier(loss_function='MultiClass',
                                        depth=None,
                                        random_seed=42,
                                        cat_features=cat_features,
                                        silent=False)
    model.fit(train_pool)

    y_pred = model.predict(test_pool)
    y_true = test_y
    print(y_pred.shape)
    print(np.unique(y_pred))
    helpers.evaluate_clf(y_true, y_pred)

    pred_for_train = model.predict(train_X)
    pred_for_test = model.predict(test_X)
    return pred_for_train, pred_for_test
def fit(self, data, clf=None):
    """Fit the CatBoost estimator on a date-based train/validation split.

    Args:
        data: DataFrame with 'visitId' datetimes, feature columns and targets.
        clf: truthy -> classification target 'validRevenue';
             falsy  -> regression target 'totals_transactionRevenue'.

    Returns:
        self, with the fitted model stored in ``self.estimator``.

    Fixes a stray line-continuation backslash in the classification branch
    (a SyntaxError in the original) and collapses the duplicated branch
    bodies into one parameterised path.
    """
    setseed(self.seed)

    # NOTE(review): training uses visits on/after 2016-09-30 and validates
    # on earlier ones — confirm this split direction is intentional.
    visit_dates = data['visitId'].apply(lambda x: x.date())
    cutoff = datetime.date(2016, 9, 30)
    train_df = data[visit_dates >= cutoff]
    val_df = data[visit_dates < cutoff]

    target = 'validRevenue' if clf else 'totals_transactionRevenue'
    train_X = train_df[self.features_name].values
    train_y = train_df[target].values
    val_X = val_df[self.features_name].values
    val_y = val_df[target].values

    cat_train = cat.Pool(data=train_X, label=train_y,
                         feature_names=self.features_name,
                         cat_features=self.categorical_feature)
    cat_eval = cat.Pool(data=val_X, label=val_y,
                        feature_names=self.features_name,
                        cat_features=self.categorical_feature)

    model_param = self.params['params_clf'] if clf else self.params['params_reg']
    self.estimator = cat.train(params=model_param, pool=cat_train, eval_set=cat_eval)
    return self
def __init__(self, data, task, metric, use_gpu):
    """Set up default CatBoost parameters and wrap the data in Pools.

    Args:
        data: object exposing X_train/y_train/X_test/y_test.
        task: 'regression', 'binclass' or 'multiclass'.
        metric: extra metric name; only 'Accuracy' is recognised.
        use_gpu: request GPU training when True.
    """
    Learner.__init__(self)

    # Task name -> CatBoost loss; unknown tasks leave the loss unset.
    loss_by_task = {
        'regression': 'RMSE',
        'binclass': 'Logloss',
        'multiclass': 'MultiClass',
    }

    params = {
        'devices': [0],
        'logging_level': 'Verbose',
        'use_best_model': False,
        'bootstrap_type': 'Bernoulli',
        'random_seed': RANDOM_SEED,
    }
    if use_gpu:
        params['task_type'] = 'GPU'
    if task in loss_by_task:
        params['loss_function'] = loss_by_task[task]
    if metric == 'Accuracy':
        params['custom_metric'] = 'Accuracy'

    self.train = cat.Pool(data.X_train, data.y_train)
    self.test = cat.Pool(data.X_test, data.y_test)
    self.default_params = params
def train_model(df_train, df_valid, model_params, general_params):
    """Fit a CatBoost text model and persist it under the configured logdir."""
    def _text_pool(frame):
        # One text feature, located at column position 0.
        return catboost.Pool(frame["text"].values,
                             label=frame["label"].values,
                             text_features=[0])

    params = copy.deepcopy(model_params)
    params.update({"train_dir": general_params["logdir"]})

    model = catboost.train(
        pool=_text_pool(df_train),
        eval_set=_text_pool(df_valid),
        params=params,
        verbose=False,
        plot=False,
    )
    model.save_model(os.path.join(general_params["logdir"], "model.cbm"))
def _train(self, params=None, predict=False):
    """Run K-fold CV training of CatBoost regressors.

    Args:
        params: keyword arguments forwarded to CatBoostRegressor.
        predict: when True, also average fold predictions over self.test.

    Returns:
        (cv_rmse, oof_predictions, test_predictions)

    Raises:
        ValueError: if ``self.split_method`` is not a supported splitter
        (previously this fell through to an undefined ``iterator`` and
        crashed with a NameError).
    """
    oof_cat = np.zeros(len(self.train_X))
    prediction = np.zeros(len(self.test))

    if self.split_method == 'KFold':
        kfold = KFold(n_splits=self.n_splits, shuffle=self.shuffle,
                      random_state=self.random_state)
        iterator = enumerate(kfold.split(self.train_X))
    elif self.split_method == 'StratifiedKFold':
        kfold = StratifiedKFold(n_splits=self.n_splits, shuffle=self.shuffle,
                                random_state=self.random_state)
        iterator = enumerate(kfold.split(self.train_X, self.train_Y.values))
    else:
        raise ValueError('Unsupported split_method: {!r}'.format(self.split_method))

    for fold_, (train_index, val_index) in iterator:
        print('cat fold_{}'.format(fold_ + 1))
        model_cat = CatBoostRegressor(**params)
        if self.contain_cate:
            cate_feature_indices = [self.train_X.columns.get_loc(col)
                                    for col in self.cate_features]
            trn_data = cb.Pool(self.train_X.iloc[train_index][self.train_features],
                               self.train_Y[train_index],
                               cat_features=cate_feature_indices)
            val_data = cb.Pool(self.train_X.iloc[val_index][self.train_features],
                               self.train_Y[val_index],
                               cat_features=cate_feature_indices)
        else:
            trn_data = cb.Pool(self.train_X.iloc[train_index][self.train_features],
                               self.train_Y[train_index])
            val_data = cb.Pool(self.train_X.iloc[val_index][self.train_features],
                               self.train_Y[val_index])
        model_cat.fit(trn_data, verbose_eval=400, eval_set=val_data)
        oof_cat[val_index] = model_cat.predict(
            self.train_X.iloc[val_index][self.train_features])
        # Only predict on the test set when explicitly requested.
        if predict:
            prediction += model_cat.predict(self.test[self.train_features]) / kfold.n_splits

    print('CV score: {:<8.5f}'.format(mean_squared_error(oof_cat, self.train_Y) ** 0.5))
    return mean_squared_error(oof_cat, self.train_Y) ** 0.5, oof_cat, prediction
def Catboost_train(config, data, param_0, fold_n):
    """Build and fit a CatBoost model matched to the dataset's problem type.

    Returns (model, None) to match the trainer interface.
    """
    metric = param_0['metric']
    num_rounds = param_0['n_estimators']
    nFeatures = data.X_train.shape[1]
    X_train, y_train = data.X_train, data.y_train
    X_valid, y_valid = data.X_valid, data.y_valid
    X_test, y_test = data.X_test, data.y_test

    params = {
        'logging_level': 'Info',
        'random_seed': 42,
        'n_estimators': num_rounds,
        'custom_metric': 'Accuracy',
    }

    if data.problem() == "classification":
        params['loss_function'] = 'Logloss' if data.nClasses == 2 else 'MultiClass'
        model = cat.CatBoostClassifier(**params)
    else:
        # NOTE(review): the regressor ignores `params` and is constructed
        # from scratch — confirm whether it should share the common settings.
        params['loss_function'] = 'RMSE'
        model = cat.CatBoostRegressor(iterations=num_rounds, loss_function='RMSE')

    train_pool = cat.Pool(X_train, y_train)
    valid_pool = cat.Pool(X_valid, y_valid)
    model.fit(train_pool, eval_set=valid_pool)
    return model, None
def cgb_cv(df, features, categorical_features, n_folds, param):
    """Month-grouped K-fold CV for CatBoost.

    Calendar months are bucketed into ``n_folds`` groups so each month
    lands in exactly one fold. Returns per-fold train/valid RMSLE scores
    and the fitted models.
    """
    kf = GroupKFold(n_splits=n_folds)
    months = np.arange(1, 13)
    group_map = dict(zip(months, pd.cut(months, n_folds, labels=np.arange(n_folds))))
    group = df.timestamp.dt.month.map(group_map)

    models, train_scores, valid_scores = [], [], []
    for train_index, val_index in kf.split(df, df['building_id'], groups=group):
        train_X = df[features].iloc[train_index]
        train_y = df['meter_reading'].iloc[train_index]
        val_X = df[features].iloc[val_index]
        val_y = df['meter_reading'].iloc[val_index]

        gbm = cgb.train(cgb.Pool(train_X, train_y, cat_features=categorical_features),
                        param,
                        eval_set=cgb.Pool(val_X, val_y, cat_features=categorical_features),
                        verbose=20)

        # Score both splits, undoing the log1p transform when enabled.
        for X, y, bucket in ((train_X, train_y, train_scores),
                             (val_X, val_y, valid_scores)):
            preds = gbm.predict(X)
            if use_log1p_target:
                preds = np.expm1(preds)
                y = np.expm1(y)
            bucket.append(rmsle(y, preds))
        models.append(gbm)

    return train_scores, valid_scores, models
def __init__(self, data, use_gpu):
    """Set up default CatBoost parameters and train/test Pools for ``data``."""
    Learner.__init__(self)

    params = {
        'devices': [0],
        'logging_level': 'Info',
        'use_best_model': False,
        'bootstrap_type': 'Bernoulli',
    }
    if use_gpu:
        params['task_type'] = 'GPU'

    # Task name -> CatBoost loss; unknown tasks leave the loss unset.
    loss_by_task = {
        'Regression': 'RMSE',
        'Classification': 'Logloss',
        'Multiclass': 'MultiClass',
    }
    if data.task in loss_by_task:
        params['loss_function'] = loss_by_task[data.task]
    if data.metric == 'Accuracy':
        params['custom_metric'] = 'Accuracy'

    self.train = cat.Pool(data.X_train, data.y_train)
    self.test = cat.Pool(data.X_test, data.y_test)
    self.default_params = params
def dataset(self, X, y, categorical_columns_indices=None, test_size=0.2, *args, **kwargs):
    """Split (X, y) into train/test and build the CatBoost Pools.

    Stores the split, the label-encoding map, and three Pools:
    train, eval (held-out) and one covering the full dataset.
    """
    self.categorical_columns_indices = categorical_columns_indices
    self.X = X
    self.columns = list(X)
    self.y, self.cat_replace = self.replace_multiclass(y)

    split = train_test_split(self.X, self.y, test_size=test_size,
                             random_state=self.random_state)
    self.X_train, self.X_test, self.y_train, self.y_test = split

    def _pool(features, labels):
        return catboost.Pool(data=features.values,
                             label=labels.values,
                             cat_features=self.categorical_columns_indices)

    self.train_data = _pool(self.X_train, self.y_train)
    self.eval_data = _pool(self.X_test, self.y_test)
    self.all_train_data = _pool(self.X, self.y)
def stack_catboost():
    """CatBoost stacking: CV-train a regressor on out-of-fold features,
    write OOF and submission predictions to CSV, and print fold-averaged
    feature importances."""
    # Stacked feature matrices built from OOF / submission files.
    x_train = add_stacking_feat(load_oof())
    _, y_train = processing.train_set()
    x_test = add_stacking_feat(load_sub())
    # Align the test columns with the training layout before dropping.
    x_test.columns = x_train.columns
    x_train.drop(DROP, axis=1, inplace=True)
    x_test.drop(DROP, axis=1, inplace=True)
    pool_test = catboost.Pool(data=x_test,
                              label=None,
                              cat_features=None,
                              weight=None)
    y_oof = pd.Series(0, index=x_train.index, name="oof_y")
    y_pred = pd.Series(0, index=x_test.index, name="time_to_failure")
    trees = []
    scores = []
    feat_importance = 0
    for index_train, index_valid in K_FOLDS.split(x_train):
        pool_train = catboost.Pool(data=x_train.iloc[index_train],
                                   label=y_train.iloc[index_train],
                                   cat_features=None,
                                   weight=None)
        pool_valid = catboost.Pool(data=x_train.iloc[index_valid],
                                   label=y_train.iloc[index_valid],
                                   cat_features=None,
                                   weight=None)
        clf = catboost.CatBoostRegressor(**CLF_PARAMS)
        clf.fit(
            X=pool_train,
            eval_set=[pool_valid],
        )
        trees.append(clf.tree_count_)
        # NOTE(review): assumes the eval set is reported under 'validation_0';
        # some catboost versions use 'validation' for a single eval set — confirm.
        scores.append(clf.best_score_['validation_0']['MAE'])
        y_oof.iloc[index_valid] = clf.predict(pool_valid)
        # Average the test predictions over folds.
        y_pred += clf.predict(pool_test) / K_FOLDS.get_n_splits()
        # Accumulate fold-averaged importances keyed by feature name.
        feat_importance += pd.DataFrame(
            clf.get_feature_importance(prettified=True),
            columns=["name", "value"
                     ]).set_index("name") / K_FOLDS.get_n_splits()
    LOGGER.info(f"Количество деревьев: {sorted(trees)}")
    LOGGER.info(
        f"Среднее количество деревьев: {np.mean(trees):.0f} +/- {np.std(trees):.0f}"
    )
    LOGGER.info(f"MAE на кроссвалидации: " + str(np.round(sorted(scores), 5)))
    LOGGER.info(
        f"MAE среднее: {np.mean(scores):0.3f} +/- {np.std(scores):0.3f}")
    # Timestamped file stem: mean MAE plus an upper 2-sigma bound on the mean.
    stamp = (
        f"{time.strftime('%Y-%m-%d_%H-%M')}_"
        f"{np.mean(scores):0.3f}_"
        f"{np.mean(scores) + np.std(scores) * 2 / len(scores) ** 0.5:0.3f}_stk"
    )
    y_oof.to_csv(conf.DATA_PROCESSED + f"oof_{stamp}.csv", header=True)
    y_pred.to_csv(conf.DATA_PROCESSED + f"sub_{stamp}.csv", header=True)
    print(feat_importance.sort_values("value", ascending=False))
def train(self, X_train, X_valid, y_train, y_valid):
    """Train a CatBoost model on (X_train, y_train), evaluating on
    (X_valid, y_valid).

    Fix: the training/eval Pools are now merged into a local copy of the
    parameters instead of being written into ``self.params``, which
    previously mutated shared state and kept stale Pool objects alive
    across calls.

    Returns:
        The model produced by ``ctb.train``.
    """
    # Convert data to CatBoost Pool format.
    ds_train = ctb.Pool(X_train, y_train)
    ds_valid = ctb.Pool(X_valid, y_valid)

    # Combine user parameters with the context-dependent ones locally.
    params = dict(self.params)
    params['dtrain'] = ds_train
    params['eval_set'] = ds_valid

    # Train using parameters sent by the user.
    return ctb.train(**params)
def eval_model(X, y, model):
    """Print a classification report and ROC AUC for ``model`` on (X, y).

    Returns the AUC.
    """
    cat_idx = np.where(X.dtypes == object)[0]

    # Classification report at a 0.5 probability threshold.
    test_pool = cb.Pool(X, cat_features=cat_idx)
    proba = model.predict_proba(test_pool)
    labels = [1 if p[1] > 0.5 else 0 for p in proba]
    print(sklearn.metrics.classification_report(y, labels))

    # ROC curve and AUC.
    plt.style.use('ggplot')
    dataset = cb.Pool(X, y, cat_features=cat_idx)
    fpr, tpr, _ = cbu.get_roc_curve(model, dataset, plot=True)
    auc = sklearn.metrics.auc(fpr, tpr)
    print('auc: ', auc)
    return auc
def regression_catboost(train, validate):
    """Fit a GPU CatBoost MAPE regressor on log-compressed Day30 targets.

    Returns the fitted model and a per-sample prediction frame for the
    validation set.
    """
    cat_features = ["UserId"]

    # Targets are modelled in log space: log(Day30 / 4 + 1).
    p_train = np.log(train["Day30"] / 4 + 1)
    p_validate = np.log(validate["Day30"] / 4 + 1)

    train_pool = catboost.Pool(train.iloc[:, 1:-31], p_train, cat_features=cat_features)
    valid_pool = catboost.Pool(validate.iloc[:, 1:-31], p_validate, cat_features=cat_features)

    model = catboost.CatBoostRegressor(iterations=35000,
                                       learning_rate=0.003,
                                       depth=6,
                                       objective="MAPE",
                                       eval_metric="MAPE",
                                       custom_metric=["RMSE", "MAE", "MAPE"],
                                       l2_leaf_reg=3.0,
                                       min_data_in_leaf=1,
                                       boosting_type="Plain",
                                       use_best_model=True,
                                       thread_count=-1,
                                       task_type="GPU",
                                       devices="0",
                                       random_state=random_seed,
                                       verbose=300,
                                       early_stopping_rounds=1000,
                                       fold_permutation_block=1,
                                       bagging_temperature=0)
    model.fit(train_pool, eval_set=valid_pool, plot=False)

    # Invert the log transform to recover Day30-scale predictions.
    preds_day30 = (np.exp(model.predict(valid_pool)) - 1) * 4
    src, _ = spearmanr(validate["Day30"], preds_day30)

    df_important = pd.DataFrame({
        "feature_name": model.feature_names_,
        "importance": model.feature_importances_,
    }).sort_values(by=["importance"], ascending=False)
    print(df_important)

    df_predict_day30 = pd.DataFrame({
        "FlickrId": validate["FlickrId"],
        "Day30": validate["Day30"],
        "preds_day30": preds_day30,
    })
    return model, df_predict_day30
def main(readcsv=pd_read_csv, method='defaultDense'):
    """Train CatBoost on the sample classification data, convert the model
    to daal4py, and verify both predictors miss the same number of rows."""
    train_file = "./data/batch/df_classification_train.csv"
    test_file = "./data/batch/df_classification_test.csv"

    # First three columns are features, the fourth is the label.
    X_train = readcsv(train_file, range(3), t=np.float32)
    y_train = readcsv(train_file, range(3, 4), t=np.float32)
    X_test = readcsv(test_file, range(3), t=np.float32)
    y_test = readcsv(test_file, range(3, 4), t=np.float32)

    cb_train = cb.Pool(X_train, label=np.array(y_train))
    cb_test = cb.Pool(X_test, label=np.array(y_test))

    params = {
        'reg_lambda': 1,
        'max_depth': 8,
        'num_leaves': 2**8,
        'verbose': 0,
        'objective': 'MultiClass',
        'learning_rate': 0.3,
        'n_estimators': 100,
        'classes_count': 5,
    }

    cb_model = cb.CatBoost(params)
    cb_model.fit(cb_train)
    cb_prediction = cb_model.predict(cb_test, prediction_type='Class').T[0]
    cb_errors_count = np.count_nonzero(cb_prediction - np.ravel(y_test))

    # Convert to daal4py and predict with the converted model.
    daal_model = d4p.get_gbt_model_from_catboost(cb_model)
    daal_predict_algo = d4p.gbt_classification_prediction(
        nClasses=params['classes_count'],
        resultsToEvaluate="computeClassLabels",
        fptype='float')
    daal_prediction = daal_predict_algo.compute(X_test, daal_model)
    daal_errors_count = np.count_nonzero(daal_prediction.prediction - y_test)
    # Both predictors must disagree with the labels equally often.
    assert np.absolute(cb_errors_count - daal_errors_count) == 0

    return (cb_prediction, cb_errors_count,
            np.ravel(daal_prediction.prediction), daal_errors_count,
            np.ravel(y_test))
def train_catboost(rank_model, net_model, emb, train_df, val_df, model_file=None, use_cache=False):
    """Fit the ranking model on features produced by the net/embedding.

    Feature matrices are cached on disk as .npy files; pass use_cache=True
    to reuse them. Returns (ndcg_score, fitted_rank_model).
    """
    if use_cache:
        X_train = np.load('datasets/X_train_boost.npy')
        X_val = np.load('datasets/X_val_boost.npy')
        y_train = np.load('datasets/y_train_boost.npy')
        y_val = np.load('datasets/y_val_boost.npy')
        print('Datasets are loaded')
    else:
        X_train, y_train = make_dataset(train_df, net_model, emb)
        X_val, y_val = make_dataset(val_df, net_model, emb)
        np.save('datasets/X_train_boost', X_train)
        np.save('datasets/X_val_boost', X_val)
        np.save('datasets/y_train_boost', y_train)
        np.save('datasets/y_val_boost', y_val)
        print('Datasets are saved')

    # Group ids taken modulo a large prime — presumably to bound their
    # range for CatBoost's group_id; confirm against the producer of '0'.
    groups_train = train_df['0'].values % int(1e9 + 7)
    groups_val = val_df['0'].values % int(1e9 + 7)

    train_pool = catboost.Pool(X_train, y_train * 0.5,
                               group_id=groups_train.reshape(-1),
                               weight=train_df['7'].values.reshape(-1))
    val_pool = catboost.Pool(X_val, y_val * 0.5,
                             group_id=groups_val.reshape(-1))

    rank_model.fit(train_pool, eval_set=val_pool, plot=False, logging_level='Verbose')

    if model_file is not None:
        with open(model_file, 'wb') as f:
            pickle.dump([rank_model], f, -1)

    y_score = rank_model.predict(val_pool)
    return ranking.ndcg(y_val, y_score, groups_val), rank_model
def run_gridsearch(self, cv, cv_score: str) -> None:
    """Grid-search the tuning hyperparameters with cross-validation.

    The combination with the best mean validation score is stored in
    ``self.best_tuning_params``.

    :param cv: A cross-validation generator that determines the
        cross-validation strategy.
    :param cv_score: Measure to evaluate predictions on the validation set.
    """
    cat_cols = self.X_tr.columns[self.X_tr.dtypes == "category"]
    all_cols = list(self.X_tr)
    cat_features = [all_cols.index(col) for col in cat_cols]

    base_params = self.fixed_params.copy()
    best_score = 0.5
    for candidate in ParameterGrid(self.tuning_params):
        base_params.update(candidate)
        fold_scores = []
        for trn_idx, val_idx in cv.split(self.X_tr, self.y_tr):
            X_train, y_train = self.X_tr.iloc[trn_idx], self.y_tr.iloc[trn_idx]
            X_val, y_val = self.X_tr.iloc[val_idx], self.y_tr.iloc[val_idx]

            model = cat.CatBoostClassifier(**base_params)
            model.fit(cat.Pool(X_train, y_train, cat_features=cat_features),
                      eval_set=cat.Pool(X_val, y_val, cat_features=cat_features),
                      logging_level="Silent")

            fold_scores.append(calc_perf_score(
                data=X_val,
                labels=np.array(y_val.astype("float")),
                model=model,
                model_name=self.name,
                score_name=cv_score,
            ))

        mean_score = np.mean(fold_scores)
        if mean_score > best_score:
            best_score = mean_score
            self.best_tuning_params = candidate
def train_model(xtrain, xval, cat_fts, params):
    """Fit a CatBoost classifier, plot feature importances, and report
    validation logloss / MAP / AUC / mean reciprocal rank.

    Returns (model, categorical_indices, mrr). Note: pops 'target' from
    both frames and later adds 'pred'/'target' columns to xval.
    """
    y_trn = xtrain['target'].values
    y_val = xval['target'].values
    del xtrain['target'], xval['target']

    categorical_ind = [i for i, col in enumerate(xtrain.columns) if col in cat_fts]

    clf = cat.CatBoostClassifier(**params)
    clf.fit(xtrain.values, y_trn,
            cat_features=categorical_ind,
            eval_set=(xval.values, y_val),
            early_stopping_rounds=100,
            verbose=100,
            plot=False)
    print('Done!')

    print('Grab feature importance for both train and val')
    for frame, tag in ((xtrain, 'train'), (xval, 'val')):
        imp = clf.get_feature_importance(
            data=cat.Pool(data=frame, cat_features=categorical_ind),
            prettified=True)
        plot_imp(imp, tag)
    print('Done feature imp')

    # Validation metrics on the positive-class probability.
    val_pred = clf.predict_proba(xval.values)[:, 1]
    logloss_i = log_loss(y_val, val_pred)
    fpr, tpr, thresholds = roc_curve(y_val, val_pred, pos_label=1)
    auc_i = auc(fpr, tpr)
    map_i = average_precision_score(y_val, val_pred)
    print('logloss={0:.4f} | map={1:.4f} | auc={2:.4f}'.format(
        logloss_i, map_i, auc_i))

    # Mean reciprocal rank over groups defined by index level 0.
    print('reciproical rank for validation set')
    xval['pred'] = val_pred
    xval['target'] = y_val
    val_rr = xval.groupby(level=0).apply(reciprocal_rank)
    mrr = (1 / val_rr[val_rr != 0]).mean()
    print(f'Mean reciporical rank on validation set: {mrr:.4f}')
    return clf, categorical_ind, mrr
def catboosttrainer(X, y, features, initparam, modelname, modelpath, docpath, cvfold=5):
    """Pick an iteration count via CV, fit, export importances, pickle the model.

    Returns (cvresult, clf, initparam, dfimp). Note: mutates ``initparam``
    (drops the overfitting-detector keys and sets 'iterations').
    """
    print("searching for optimal iteration count...")
    trainpool = cat.Pool(X[features], y)
    cvresult = cat.cv(params=initparam, fold_count=cvfold,
                      pool=trainpool, stratified=True)

    # Back out the od_wait rounds the CV run continued past its best point
    # — presumably; confirm against the od_type/od_wait settings used.
    initparam['iterations'] = (len(cvresult)) - (initparam['od_wait'] + 1)
    del initparam['od_wait']
    del initparam['od_type']
    print("optimal iteration count is ", initparam['iterations'])

    print("fitting model...")
    clf = cat.CatBoostClassifier(**initparam)
    clf.fit(trainpool)

    # Feature-importance report, sorted and written to Excel.
    imp = clf.get_feature_importance(trainpool, fstr_type='FeatureImportance')
    dfimp = pd.DataFrame(imp, columns=['CatBoostImportance'])
    dfimp.insert(0, column='Feature', value=features)
    dfimp = dfimp.sort_values(['CatBoostImportance', 'Feature'], ascending=False)
    dfimp.to_excel(os.path.join(docpath, modelname + ".xlsx"))

    print("pickling model...")
    with open(os.path.join(modelpath, modelname), 'wb') as fout:
        pickle.dump(clf, fout)
    return cvresult, clf, initparam, dfimp
def classification_model(raw_data_file, metric_col, categorical_col, target_col,
                         test_perc, hyperopt_iterations, const_params,
                         use_predefined_params, k_fold, tuning_metric):
    """End-to-end pipeline: preprocess, tune + fit, evaluate, save the model."""
    # Preprocess the raw data.
    print('preprocess data:')
    data_obj = Preproc(raw_data_file, metric_col, categorical_col,
                       target_col, test_perc)

    # Hyperparameter tuning, then a fit with the best parameters.
    print('hyperparams tuning and model fitting:')
    model, params = train_best_model(data_obj.X_train, data_obj.y_train,
                                     const_params, hyperopt_iterations,
                                     k_fold, tuning_metric,
                                     use_predefined_params)
    print('best params are {}'.format(params), file=sys.stdout)

    # Evaluate on the held-out test split.
    auc = eval_model(data_obj.X_test, data_obj.y_test, model)

    # Export as JSON, passing the training pool to save_model.
    train_cat_idx = np.where(data_obj.X_train.dtypes == object)[0]
    model.save_model(save_model_dir,
                     format="json",
                     pool=cb.Pool(data_obj.X_train, data_obj.y_train,
                                  cat_features=train_cat_idx))
def train_best_model(X, y, const_params, max_evals=10, use_default=False):
    """Return (model, hyper_params) with the model fitted on all of (X, y).

    Either reuses a previously found set of optimal hyperparameters or
    searches for one with hyperopt.
    """
    # Build the Pool once so hyperopt iterations don't re-convert the frame.
    dataset = cb.Pool(X, y)

    if use_default:
        # Pretrained optimal parameters.
        best = {
            'depth': 3,
            'fold_len_multiplier': 41.1,
            'iterations': 50,
            'learning_rate': 0.1,
        }
    else:
        best = find_best_hyper_params(dataset, const_params, max_evals=max_evals)

    # User-provided constants take precedence over the searched subset.
    hyper_params = {**best, **const_params}
    # The final fit uses the entire dataset, so best-model tracking is moot.
    hyper_params.pop('use_best_model', None)

    model = cb.CatBoostClassifier(**hyper_params)
    model.fit(dataset, verbose=False)
    return model, hyper_params
def predict(self, model, test_set):
    """Return the positive-class probability for each row of ``test_set``."""
    pool = ctb.Pool(test_set)
    probabilities = model.predict(data=pool, prediction_type='Probability')
    return probabilities[:, 1]
def test_catboost_numerical_validation():
    """vaex.ml's CatBoost wrapper must reproduce vanilla catboost exactly."""
    ds = vaex.ml.datasets.load_iris()
    features = ['sepal_width', 'petal_length', 'sepal_length', 'petal_width']

    # Reference run with plain catboost.
    dtrain = cb.Pool(ds[features].values, label=ds.data.class_)
    reference = cb.train(params=params_multiclass, dtrain=dtrain, num_boost_round=3)
    cb_pred = reference.predict(dtrain, prediction_type='Probability')

    # Same training routed through the vaex.ml wrapper.
    booster = vaex.ml.catboost.CatBoostModel(features=features,
                                             params=params_multiclass,
                                             num_boost_round=3)
    booster.fit(ds, ds.class_)
    vaex_pred = booster.predict(ds)

    np.testing.assert_equal(
        vaex_pred, cb_pred, verbose=True,
        err_msg=
        'The predictions of vaex.ml.catboost do not match those of pure catboost'
    )
def simple_on_dataframe():
    """Distributed-training smoke test: fit RMSE on a tiny TSV learn set
    and dump the resulting predictions as JSON."""
    learn_set_path = tempfile.mkstemp(prefix='catboost_learn_set_')[1]
    cd_path = tempfile.mkstemp(prefix='catboost_cd_')[1]
    try:
        rows = [(0.1, 0.2, 0.11, 0.12),
                (0.97, 0.82, 0.33, 1.1),
                (0.13, 0.22, 0.23, 2.1),
                (0.14, 0.18, 0.1, 0.0),
                (0.9, 0.67, 0.17, -1.0),
                (0.66, 0.1, 0.31, 0.62)]
        utils.object_list_to_tsv(rows, learn_set_path)
        # Column 3 is the regression target.
        with open(cd_path, 'w') as cd:
            cd.write('3\tTarget')

        model = utils.run_dist_train(
            ['--iterations', '20',
             '--loss-function', 'RMSE',
             '--learn-set', learn_set_path,
             '--cd', cd_path],
            model_class=cb.CatBoostRegressor)

        train_pool = cb.Pool(learn_set_path, column_description=cd_path)
        result = {'prediction': model.predict(train_pool).tolist()}
        out_path = os.path.join(OUTPUT_DIR, 'regression_simple_on_dataframe.json')
        json.dump(result,
                  fp=open(out_path, 'w'),
                  allow_nan=True,
                  indent=2)
    finally:
        os.remove(learn_set_path)
        os.remove(cd_path)
def main():
    """Train the price-regression CatBoost model and save it.

    argv: [1] data dir with train.csv, [2] features dir with
    cat_features.bin, [3] output model path.
    """
    if len(sys.argv) != 4:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\t python src/train.py data features model\n")
        sys.exit(1)

    with open(os.path.join(sys.argv[2], "cat_features.bin"), "rb") as fd:
        features = pickle.load(fd)

    df = pandas.read_csv(os.path.join(sys.argv[1], "train.csv"))
    # Fix: `df.drop([...], 1)` relied on the positional `axis` argument,
    # which pandas deprecated and removed in 2.0 — use the keyword form.
    data = catboost.Pool(df.drop(columns=["price_usd"]),
                         label=df.price_usd,
                         cat_features=features)

    default_params = {"n_estimators": 4500, "learning_rate": 0.1}
    # Fix: close params.yaml deterministically instead of leaking the handle.
    with open("params.yaml") as params_file:
        params = yaml.safe_load(params_file)["train"]
    cb_params = params["regression"]
    cb_params.update(default_params)

    model = catboost.CatBoostRegressor(custom_metric=["R2", "RMSE"],
                                       task_type="GPU",
                                       random_seed=params["seed"],
                                       **cb_params)
    model.fit(data, verbose=True)
    model.save_model(sys.argv[3])
def predict(self, data):
    """Predict with the fitted estimator on ``data`` restricted to the
    model's feature columns."""
    features = data[self.features_name]
    pool = cat.Pool(data=features,
                    feature_names=self.features_name,
                    cat_features=self.categorical_feature)
    return self.estimator.predict(pool)
def main(): model = cb.CatBoostRegressor() # The data was mined by data_miner.py in Data prepocessing/Scripts data = pd.read_csv("data.csv", sep=";", encoding="utf8") data = data.drop(["coors"], axis=1) classes = {} # Some classes need to be mined from cian.ru for _class in ["Конструктив и состояние", "Положительное соседство", "Отрицательное соседство", "Квартиры и планировки", "Инфраструктура", "Безопасность", "Транспорт", "Экология", "price_per_m"]: if _class in data.columns: classes[_class] = data[_class] data = data.drop([_class], axis=1) # Train models for each class for _class in classes: x_train, x_test_val, y_train, y_test_val = train_test_split(data, classes[_class], test_size=0.2, random_state=7) x_test, x_val, y_test, y_val = train_test_split(x_test_val, y_test_val, test_size=0.7) model.fit(x_train, y_train, use_best_model=True, eval_set=cb.Pool(x_val, y_val), logging_level="Verbose", # 'Silent', 'Verbose', 'Info', 'Debug' early_stopping_rounds=1, save_snapshot=True, snapshot_file="backup.cbsnapshot", snapshot_interval=300, ) print(model.score(x_test, y_test)) model.save_model("trained_model", format="cbm")
def get_train_set(self, as_xgb_dmatrix=False, as_lgb_dataset=False, as_cgb_pool=False):
    """Return the cleaned training set in the requested container format.

    Defaults to (features_df, target_df); the flags select an XGBoost
    DMatrix, a LightGBM Dataset, or a CatBoost Pool instead.
    """
    df = self._get_cleaned_single_set(dataset="train")
    train_cols = df.columns.tolist()
    train_cols.remove("target")

    if self.drop_lowimp_features:
        print('Dropping low importance features !')
        dropcols = set(df.columns.tolist()) & set(LOW_IMPORTANCE_FEATURES)
        df = df.drop(columns=list(dropcols))

    if as_xgb_dmatrix:
        return xgb.DMatrix(data=df[train_cols], label=df[["target"]])
    if as_lgb_dataset:
        return lgb.Dataset(df[train_cols], df[["target"]].values.ravel())
    if as_cgb_pool:
        with Timer('Creating Pool for Train set CatBoost'):
            df, catboost_features = self._generate_catboost_df(df)
            # Categorical columns occupy the leading positions.
            idx_cat_features = list(range(len(catboost_features)))
            return cgb.Pool(df.drop(columns=["target"]),
                            df["target"],
                            idx_cat_features)
    return df[train_cols], df[["target"]]
def partial_dependence_curve(tickers: Tuple[str, ...], date: pd.Timestamp):
    """Plot partial-dependence curves for the numeric features.

    :param tickers: Tickers for which the ML model is built.
    :param date: Date on which the ML model is built.
    """
    params = config.ML_PARAMS
    cases = examples.Examples(tickers, date, params["data"])
    clf, train_pool_params = train_clf(cases, params)
    # One subplot per non-categorical feature column.
    n_plots = len(train_pool_params["data"].columns) - len(
        cases.categorical_features())
    axs = axs_iter(n_plots)
    results = []
    for n, name in enumerate(train_pool_params["data"]):
        if n in cases.categorical_features():
            continue
        ax = next(axs)
        # Work on a private deep copy so the training pool stays untouched.
        pool_params = copy.deepcopy(train_pool_params)
        quantiles = pool_params["data"].iloc[:, n].quantile(QUANTILE).values
        y = []
        for quantile in quantiles:
            # Clamp the whole column to one quantile value and re-predict.
            pool_params["data"].iloc[:, n] = quantile
            predict_pool = catboost.Pool(**pool_params)
            raw_prediction = clf.predict(predict_pool)
            # NOTE(review): scaling by column 0 and YEAR_IN_TRADING_DAYS
            # presumably annualises the raw prediction — confirm.
            prediction = (raw_prediction * pool_params["data"].iloc[:, 0] *
                          YEAR_IN_TRADING_DAYS)
            y.append(prediction.values.mean())
        ax.set_title(f"{name}")
        ax.tick_params(labelleft=True)
        ax.plot(quantiles, y)
        results.append((quantiles, y))
    plt.show()
    return results