def test_verbose_int(verbose):
    expected_line_count = {5: 3, False: 0, True: 10}
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    tmpfile = 'test_data_dumps'

    with LogStdout(open(tmpfile, 'w')):
        cv(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"}, verbose=verbose)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == expected_line_count[verbose])

    with LogStdout(open(tmpfile, 'w')):
        train(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"}, verbose=verbose)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == expected_line_count[verbose])

    return local_canonical_file(remove_time_from_json(JSON_LOG_PATH))
def test_cv_query():
    pool = Pool(QUERYWISE_TRAIN_FILE, column_description=QUERYWISE_CD_FILE)
    results = cv(pool, {"iterations": 5, "random_seed": 0, "loss_function": "QueryRMSE"})
    assert "train-QueryRMSE-mean" in results

    prev_value = results["train-QueryRMSE-mean"][0]
    for value in results["train-QueryRMSE-mean"][1:]:
        assert value < prev_value
        prev_value = value
def test_cv_pairs():
    pool = Pool(QUERYWISE_TRAIN_FILE, column_description=QUERYWISE_CD_FILE,
                pairs=QUERYWISE_TRAIN_PAIRS_FILE)
    results = cv(pool, {"iterations": 5, "random_seed": 8, "loss_function": "PairLogit"})
    assert "train-PairLogit-mean" in results

    prev_value = results["train-PairLogit-mean"][0]
    for value in results["train-PairLogit-mean"][1:]:
        assert value < prev_value
        prev_value = value
def test_cv():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    results = cv(pool, {"iterations": 5, "random_seed": 0, "loss_function": "Logloss"})
    assert "train-Logloss-mean" in results

    prev_value = results["train-Logloss-mean"][0]
    for value in results["train-Logloss-mean"][1:]:
        assert value < prev_value
        prev_value = value
def test_cv():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    results = cv({"iterations": 5, "random_seed": 0, "loss_function": "Logloss"}, pool)
    assert isinstance(results, dict)
    assert "Logloss_train_avg" in results

    prev_value = results["Logloss_train_avg"][0]
    for value in results["Logloss_train_avg"][1:]:
        assert value < prev_value
        prev_value = value
def test_cv_pairs():
    pool = Pool(QUERYWISE_TRAIN_FILE, column_description=QUERYWISE_CD_FILE,
                pairs=QUERYWISE_TRAIN_PAIRS_FILE)
    results = cv(pool, {
        "iterations": 5,
        "random_seed": 8,
        "loss_function": "PairLogit"
    })
    assert "train-PairLogit-mean" in results

    prev_value = results["train-PairLogit-mean"][0]
    for value in results["train-PairLogit-mean"][1:]:
        assert value < prev_value
        prev_value = value
def fit(self, X_train, y_train):
    bst = cv(
        Pool(X_train, y_train),
        self.params
    )
    # Take the iteration with the best mean test metric (assumes a metric
    # where higher is better, e.g. AUC) and pad it by 50%, since the final
    # model trains on more data than each CV fold.
    best_rounds = int(bst['test-{}-mean'.format(self.metric)].idxmax() * 1.5) + 1
    print('Best Iteration: {}'.format(best_rounds))

    self.params['iterations'] = best_rounds
    self.model = CatBoostClassifier(**self.params)
    self.model.fit(X_train, y_train)
def cv_model(params: dict, positions: tuple, date: pd.Timestamp, data_pool_func):
    """Cross-validates the model on RMSE normalized by the dataset's standard deviation.

    Checks that the maximum number of iterations has not been reached, and
    returns RMSE, R2 and the model parameters with the optimal number of
    iterations, in the format of a hyperopt objective function.

    Parameters
    ----------
    params
        Dictionary with model parameters: key 'data' - data parameters,
        key 'model' - model parameters
    positions
        Tuple of tickers to cross-validate on
    date
        Date to cross-validate for
    data_pool_func
        Function that returns a catboost.Pool with the data

    Returns
    -------
    dict
        Dictionary with the result in hyperopt format:
        key 'loss' - normalized RMSE on cross-validation (for hyperopt),
        key 'status' - success status (for hyperopt),
        key 'std' - RMSE on cross-validation,
        key 'r2' - 1 minus the squared normalized cross-validation RMSE,
        key 'data' - data parameters,
        key 'model' - model parameters with the optimal number of gradient
        boosting iterations on cross-validation and common settings added
    """
    data_params = params["data"]
    data = data_pool_func(positions, date, **data_params)
    pool_std = np.array(data.get_label()).std()
    model_params = make_model_params(params)
    scores = catboost.cv(pool=data, params=model_params, fold_count=FOLDS_COUNT)
    if len(scores) == MAX_ITERATIONS:
        raise ValueError(
            f"MAX_ITERATIONS = {MAX_ITERATIONS} must be increased")
    index = scores["test-RMSE-mean"].idxmin()
    model_params["iterations"] = index + 1
    return dict(
        loss=scores.loc[index, "test-RMSE-mean"] / pool_std,
        status=hyperopt.STATUS_OK,
        std=scores.loc[index, "test-RMSE-mean"],
        r2=1 - (scores.loc[index, "test-RMSE-mean"] / pool_std) ** 2,
        data=data_params,
        model=model_params,
    )
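# A minimal sketch of driving an objective like cv_model with hyperopt.fmin;
# fmin accepts objectives that return a dict with 'loss' and 'status'. The
# space contents, tickers and the availability of data_pool_func are
# illustrative assumptions, not values from the snippet above.
import functools

import hyperopt
from hyperopt import hp

space = {
    "data": {},  # hypothetical: data parameters explored by the search
    "model": {"learning_rate": hp.loguniform("learning_rate", -5, -1)},
}
objective = functools.partial(
    cv_model,
    positions=("TICKER_A", "TICKER_B"),  # hypothetical tickers
    date=pd.Timestamp("2020-01-01"),
    data_pool_func=data_pool_func,  # assumed to be supplied by the caller
)
best = hyperopt.fmin(objective, space=space, algo=hyperopt.tpe.suggest, max_evals=50)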
def objective(space):
    global best_score, trials_count
    # if os.path.isdir('./catboost_info'):
    #     shutil.rmtree('./catboost_info', ignore_errors=True)
    trials_count += 1
    if (trials_count % 5) == 0 and is_quit_pressed():
        raise co.TennisAbortError
    args_dct = dict(**space)
    params = {
        "eval_metric": metric_name,  # 'eval_metric': 'Logloss',
        "random_seed": random_state,
        "logging_level": "Silent",
    }
    params.update(args_dct)
    if how == "cv":
        cv_data = cv(pools.train, params, stratified=True)
        scr_val = np.max(cv_data[f"test-{metric_name}-mean"])
    elif how == "sklearn":
        mdl = CatBoostClassifier(**params)
        mdl.fit(pools.train)
        pred = mdl.predict_proba(pools.eval)[:, 1]
        scr_val = roc_auc_score(pools.eval.y, pred)
    elif how == "native":
        mdl = CatBoost(params)
        mdl.fit(
            pools.train,
            eval_set=None,  # pools.eval if pools.eval else None,
            silent=True,
        )
        pred = mdl.predict(pools.eval, prediction_type="Probability")[:, 1]
        scr_val = roc_auc_score(pools.eval.get_label(), pred)
    else:
        raise Exception("bad how arg {}".format(how))
    # pred = mdl.predict(data.X_test)
    # scr_val = precision_score(data.y_test, pred)
    if scr_val > best_score:
        if how == "cv":
            cco.out("achieved best {} at {}".format(scr_val, params))
        else:
            cco.out("achieved best {} at {} lrate: {} ntrees: {}".format(
                scr_val, mdl.get_params(), mdl.learning_rate_, mdl.tree_count_))
        best_score = scr_val
    return {"loss": 1.0 - scr_val, "status": STATUS_OK}
def evaluate_model(self):
    validation_scores = catboost.cv(
        catboost.Pool(self.X_train, self.y_train,
                      cat_features=self.categorical_columns_indices),
        self.model.get_params(),
        nfold=self.n_fold,
        stratified=self.is_stratified,
        seed=self.seed,
        early_stopping_rounds=self.early_stopping_rounds,
        shuffle=self.is_shuffle,
        # metrics='RMSE',
        plot=False)
    self.scores = validation_scores
    # Selects a metric column by position; this is fragile, since it depends
    # on the column order of the cv results frame.
    test_scores = validation_scores.iloc[:, 2]
    best_metric = test_scores.min()
    return best_metric
def hyperopt_objective(params):
    model = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=1000,
        eval_metric='F1',
        random_seed=42,
        verbose=False,
        loss_function='Logloss',
    )
    cv_data = cv(
        Pool(data, data_label, cat_features=categorical_features_indices),
        model.get_params())
    best_f1 = np.max(cv_data['test-F1-mean'])
    return 1 - best_f1
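# A minimal sketch of a hyperopt search over the two parameters this
# objective reads; the bounds and trial count are illustrative assumptions,
# not values from the original code.
from hyperopt import fmin, hp, tpe

params_space = {
    'l2_leaf_reg': hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hp.uniform('learning_rate', 1e-3, 5e-1),
}
best = fmin(hyperopt_objective, space=params_space, algo=tpe.suggest, max_evals=50)
print(best)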
def cv_catboost(data):
    data = pd.DataFrame(data)
    X_tr = data.drop(3, axis=1)
    y_tr = data[3]
    params = {
        "iterations": 200,
        "depth": 2,
        "loss_function": "RMSE",
        "verbose": True
    }
    cv_dataset = Pool(data=X_tr, label=y_tr)
    # plot expects a bool, not the string "True"
    scores = cv(cv_dataset, params, fold_count=5, plot=True)
    return scores
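# cv returns a pandas DataFrame of per-iteration, fold-averaged metrics, so
# the best iteration can be read straight off it. A minimal usage sketch
# assuming the RMSE loss configured above; raw_data is a hypothetical input.
scores = cv_catboost(raw_data)
best_iteration = scores['test-RMSE-mean'].idxmin()
best_rmse = scores.loc[best_iteration, 'test-RMSE-mean']
print(f'best RMSE {best_rmse:.4f} at iteration {best_iteration}')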
def eval_train(self):
    cv_params = self.model.get_params()
    cv_params.update({'loss_function': 'Logloss'})
    cv_data = cv(
        Pool(self.X, self.y),
        cv_params,
        # plot=True
    )
    print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
        np.max(cv_data['test-Accuracy-mean']),
        cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
        np.argmax(cv_data['test-Accuracy-mean'])))
    print('Precise validation accuracy score: {}'.format(
        np.max(cv_data['test-Accuracy-mean'])))
def _model_cross_validation(self):
    self.cv_data = cv(
        Pool(self.X, self.y, cat_features=self.categorical_features_indices),
        self.model.get_params(),
        plot=False)
    # Now we have values of our loss functions at each boosting step averaged by 10 folds,
    # which should provide us with a more accurate estimation of our model performance:
    print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
        np.max(self.cv_data['test-Accuracy-mean']),
        self.cv_data['test-Accuracy-std'][np.argmax(self.cv_data['test-Accuracy-mean'])],
        np.argmax(self.cv_data['test-Accuracy-mean'])))
    print('Precise validation accuracy score: {}'.format(
        np.max(self.cv_data['test-Accuracy-mean'])))
def objective(space_params):
    # cast integer params from float to int
    for param in integer_params:
        space_params[param] = int(space_params[param])

    # extract nested conditional parameters
    if space_params['bootstrap_type']['bootstrap_type'] == 'Bayesian':
        bagging_temp = space_params['bootstrap_type'].get('bagging_temperature')
        space_params['bagging_temperature'] = bagging_temp

    if space_params['grow_policy']['grow_policy'] == 'LossGuide':
        max_leaves = space_params['grow_policy'].get('max_leaves')
        space_params['max_leaves'] = int(max_leaves)

    space_params['bootstrap_type'] = space_params['bootstrap_type']['bootstrap_type']
    space_params['grow_policy'] = space_params['grow_policy']['grow_policy']

    # random_strength cannot be < 0
    space_params['random_strength'] = max(space_params['random_strength'], 0)
    # fold_len_multiplier cannot be < 1
    space_params['fold_len_multiplier'] = max(space_params['fold_len_multiplier'], 1)

    # for classification set stratified=True
    cv_results = cb.cv(train, space_params,
                       fold_count=N_FOLDS,
                       early_stopping_rounds=25,
                       stratified=False,
                       partition_random_seed=42)

    best_loss = cv_results['test-MAE-mean'].iloc[-1]  # 'test-RMSE-mean' for RMSE
    # for classification, comment out the line above and uncomment the line below:
    # best_loss = cv_results['test-Logloss-mean'].iloc[-1]
    # if necessary, replace 'test-Logloss-mean' with 'test-[your-preferred-metric]-mean'

    return {'loss': best_loss, 'status': STATUS_OK}
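# A minimal sketch of the nested hyperopt space this objective unpacks:
# hp.choice wraps the conditional parameters in dicts keyed by the same names
# the objective reads back out. Only the conditional keys are shown here, and
# all bounds are illustrative assumptions.
from hyperopt import hp

space = {
    'bootstrap_type': hp.choice('bootstrap_type', [
        {'bootstrap_type': 'Bayesian',
         'bagging_temperature': hp.uniform('bagging_temperature', 0, 10)},
        {'bootstrap_type': 'Bernoulli'},
    ]),
    'grow_policy': hp.choice('grow_policy', [
        {'grow_policy': 'SymmetricTree'},
        {'grow_policy': 'LossGuide',
         'max_leaves': hp.quniform('max_leaves', 2, 32, 1)},
    ]),
    'random_strength': hp.normal('random_strength', 1, 0.5),
    'fold_len_multiplier': hp.normal('fold_len_multiplier', 2, 0.5),
}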
def CAT(data, label):
    pool = cat.Pool(data, label, has_header=False)
    params = {
        "loss_function": 'MultiClassOneVsAll',
        "eval_metric": 'MultiClassOneVsAll',
        "max_depth": 7,
        "learning_rate": 0.2,
        "classes_count": num_class,
        "task_type": 'CPU',
        "thread_count": 6,
        "verbose_eval": False}
    before = datetime.datetime.now()
    results = cat.cv(pool=pool,
                     params=params,
                     num_boost_round=boost_rounds,
                     fold_count=cv_fold,
                     shuffle=True,
                     stratified=True,
                     verbose=False)
    after = datetime.datetime.now()
    print("CatBoost")
    print("best mean: " + str(1 - min(results['test-MultiClassOneVsAll-mean'])))
    print("index of the best: " + str(results['test-MultiClassOneVsAll-mean'][
        results['test-MultiClassOneVsAll-mean'] == min(results['test-MultiClassOneVsAll-mean'])].index[0]))
    print("worst mean: " + str(1 - max(results['test-MultiClassOneVsAll-mean'])))
    print("final mean: " + str(1 - results['test-MultiClassOneVsAll-mean'].iloc[-1]))
    print("time: " + str(after - before))
    print('\n')
def cross_validation(
    clf,
    pool: Pool,
    metric_name,
    fold_count=5,
    stratified=True,
    early_stopping_rounds=None,
    plot=False,
):
    """metric_name: 'AUC', 'Precision', 'Accuracy'...

    Sample output:
        Best test Precision score: 0.72+-0.25 on step 22
        Best test AUC score: 0.57+-0.01 on step 23
        Best validation Logloss score: 0.69+-0.00 on step 0
    """
    assert isinstance(pool, Pool)
    print(f"cross_validation start with metric {metric_name} "
          f"fold_count {fold_count} stratified {stratified} "
          f"early_stopping {early_stopping_rounds}")
    cv_params = clf.get_params()
    cv_params.update({"loss_function": "Logloss"})
    cv_data = cv(
        pool,
        cv_params,
        fold_count=fold_count,
        stratified=stratified,
        plot=plot,
        early_stopping_rounds=early_stopping_rounds,
    )
    test_metric_pref = "test-" + metric_name
    print("Best test {} score: {:.2f}+-{:.2f} on step {}".format(
        metric_name,
        np.max(cv_data[test_metric_pref + "-mean"]),
        cv_data[test_metric_pref + "-std"][np.argmax(cv_data[test_metric_pref + "-mean"])],
        np.argmax(cv_data[test_metric_pref + "-mean"]),
    ))
    # Lower Logloss is better, so the best step is the minimum, not the maximum.
    print("Best validation Logloss score: {:.2f}+-{:.2f} on step {}".format(
        np.min(cv_data["test-Logloss-mean"]),
        cv_data["test-Logloss-std"][np.argmin(cv_data["test-Logloss-mean"])],
        np.argmin(cv_data["test-Logloss-mean"]),
    ))
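# A hypothetical call of the helper above. Note that metric_name must appear
# in the estimator's params (e.g. via custom_metric), otherwise the
# 'test-<metric>-mean' column will not exist in the cv results.
from catboost import CatBoostClassifier

clf = CatBoostClassifier(iterations=200, custom_metric=['AUC', 'Precision'], random_seed=42)
cross_validation(clf, pool, 'AUC', fold_count=5, early_stopping_rounds=20)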
def make_catboost(conf: dict, df, log_path):
    X_new = univariate_feature_selection(df['X'], df['y'],
                                         conf['feature_selection']['k_best_features'])
    logger.info('{} is running. X shape = {}'.format('->'.join(log_path), X_new.shape))
    cv_dataset = Pool(data=X_new, label=df['y'])
    params = {"loss_function": "Logloss",
              "early_stopping_rounds": 30,
              "verbose": True,
              "custom_metric": ["Accuracy", "F1", "Recall", "Precision"],
              "eval_metric": 'F1'}
    scores = cv(cv_dataset, params, fold_count=5, verbose=False)
    logger.info('{} is done. output = {}'.format('->'.join(log_path),
                                                 scores['test-F1-mean'].max()))
    # clf.fit(X_new, y)
    return scores['test-F1-mean'].max()
def train_all_save_catboost(self, X, y, categorical_features_indices):
    """Train on the whole dataset and save the model for later predictions."""
    model = CatBoostClassifier(loss_function='MultiClass',
                               eval_metric='TotalF1',
                               random_seed=42,
                               leaf_estimation_method='Newton')
    cv_data = cv(Pool(X, y, cat_features=categorical_features_indices),
                 model.get_params())
    # Report the best fold-averaged eval metric, not np.max over the whole
    # results frame.
    print("precise validation accuracy score:{}".format(
        np.max(cv_data['test-TotalF1-mean'])))
    model.fit(X, y, cat_features=categorical_features_indices)

    # feature importance
    print(model.get_feature_importance(prettified=True))
    # train = Pool(X, y, cat_features=categorical_features_indices)
    # feature_importances = model.get_feature_importance(train)
    # feature_names = X.columns
    # for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    #     print('{}: {}'.format(name, score))

    model.save_model('catboost_model.dump')
    print("Catboost model has been saved!")
def train_test_dataset(X, y, cat_features=None, auto_class_weights=None,
                       loss_function='Logloss', iterations=2000, metrics=['AUC']):
    pool = Pool(data=X, label=y, cat_features=cat_features)
    params = {
        'task_type': 'CPU',
        'auto_class_weights': auto_class_weights,
        'custom_metric': metrics,
        'verbose': False,
        'loss_function': loss_function,
        'iterations': iterations
    }
    return cv(
        pool,
        params,
        fold_count=5,
        plot=True,
        logging_level='Info'
    )
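# A hypothetical invocation of the function above; X, y and cat_cols are
# assumed inputs. 'Balanced' is one of the documented auto_class_weights
# options, and the default custom_metric makes a 'test-AUC-mean' column
# available in the returned frame.
results = train_test_dataset(X, y, cat_features=cat_cols, auto_class_weights='Balanced')
print(results['test-AUC-mean'].max())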
def model_cv(self, model, folds):
    '''Run model cross-validation'''
    cv_params = model.get_params()
    print('{}\nStart model cross-validation process'.format(split_line))
    cv_data = cv(
        self.pool,
        cv_params,
        fold_count=folds,
        # iterations=800,
        verbose=200,
        early_stopping_rounds=20
    )
    # RMSE is minimized, so the best step is the argmin, not the argmax.
    print('Best validation RMSE score: {:.4f}±{:.4f} on step {}'.format(
        np.min(cv_data['test-RMSE-mean']),
        cv_data['test-RMSE-std'][np.argmin(cv_data['test-RMSE-mean'])],
        np.argmin(cv_data['test-RMSE-mean'])
    ))
def cv(self, params_model=None, nfold=5, num_boost_round=10000,
       early_stopping_rounds=100, **kwargs):
    # If no params_model is given, take self.params_best_fit
    if params_model is None:
        params_model = self.params_best_fit

    dtrain = self.get_train_set(as_cgb_pool=True)

    eval_hist = cgb.cv(params=params_model,
                       dtrain=dtrain,
                       nfold=nfold,
                       verbose_eval=True,
                       num_boost_round=num_boost_round,
                       early_stopping_rounds=early_stopping_rounds,
                       **kwargs)
    return eval_hist
def hyperopt_objective(params):
    X_train = self.X_train
    y_train = self.y_train
    categorical_features_indices = self.cat_index
    model = CatBoostRegressor(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        depth=params['tree_depth'],
        # iterations=500,
        eval_metric='RMSE',
        # use_best_model=True,
        random_seed=42,
        logging_level='Silent')
    cv_data = cv(params=model.get_params(),
                 pool=Pool(X_train, y_train, cat_features=categorical_features_indices))
    # print(cv_data)
    best_rmse = np.min(cv_data['test-RMSE-mean'])
    print('params is', params, 'rmse is ', best_rmse)
    return best_rmse  # as hyperopt minimises
def hyperopt_objective(params):
    model = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        # learning_rate=params['learning_rate'],
        depth=params['depth'],
        iterations=500,
        eval_metric='Accuracy',
        od_type='Iter',
        od_wait=40,
        random_seed=42,
        logging_level='Silent',
        allow_writing_files=False
    )
    cv_data = cv(
        train_pool,
        model.get_params()
    )
    best_accuracy = np.max(cv_data['test-Accuracy-mean'])
    print(params, best_accuracy)
    return 1 - best_accuracy  # as hyperopt minimises
def ctb_crossval(self, params, optim_type):
    '''CatBoost cross-validation model

    Parameters
    ----------
    params: hyperparameters in dict form from the different optimization methods
    optim_type: choose among Optuna, Hyperopt, RandomSearch

    Returns
    -------
    loss, params, n_estimators, run_time'''

    # initializing the timer
    start = timer()
    print('trial using : ', params)
    cv_results = cb.cv(self.train_set, params,
                       fold_count=N_FOLDS,
                       num_boost_round=NUM_BOOST_ROUNDS,
                       early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                       stratified=True,
                       partition_random_seed=SEED,
                       verbose_eval=True,
                       plot=False)
    # store the runtime
    run_time = timer() - start
    # Extract the best score
    best_score = np.max(cv_results['test-F1-mean'])
    # Loss must be minimized
    loss = 1 - best_score
    # Boosting rounds that returned the highest cv score
    n_estimators = int(np.argmax(cv_results['test-F1-mean']) + 1)
    if loss < self.loss:
        self.estimator = n_estimators
        self.loss = loss
        # print(params)
    return loss, params, n_estimators, run_time
def cross_val(self, nfold=3, shuffle=True, stratified=None, plot=True,
              partition_random_seed: int = 14):
    """
    :param nfold:
    :param shuffle:
    :param stratified:
    :param plot:
    :param partition_random_seed:
    :return: cv results: pandas.core.frame.DataFrame with cross-validation results;
        columns are: test-error-mean, test-error-std, train-error-mean, train-error-std
    """
    from catboost import Pool, cv
    import numpy as np

    features, labels, cat_cols = self._data_processor.cv_input_fn()
    cv_data = Pool(data=features, label=labels, cat_features=cat_cols)
    cv_result = cv(cv_data,
                   self._params,
                   nfold=nfold,
                   shuffle=shuffle,
                   stratified=stratified,
                   plot=plot,
                   partition_random_seed=partition_random_seed)
    print('Best validation {} score: {:.2f}±{:.2f} on step {}'.format(
        self._params['custom_metric'],
        np.max(cv_result[f'test-{self._params["custom_metric"]}-mean']),
        cv_result[f'test-{self._params["custom_metric"]}-std'][np.argmax(
            cv_result[f'test-{self._params["custom_metric"]}-mean'])],
        np.argmax(cv_result[f'test-{self._params["custom_metric"]}-mean'])))
    print('Precise validation {} score: {}'.format(
        self._params['custom_metric'],
        np.max(cv_result[f'test-{self._params["custom_metric"]}-mean'])))
    return cv_result
def catboosttrainer(X, y, features, initparam, modelname, modelpath, docpath, cvfold=5):
    print("searching for optimal iteration count...")
    trainpool = cat.Pool(X[features], y)
    cvresult = cat.cv(params=initparam, fold_count=cvfold, pool=trainpool, stratified=True)

    # The overfitting detector runs od_wait iterations past the best one,
    # so back the final iteration count off accordingly.
    initparam['iterations'] = len(cvresult) - (initparam['od_wait'] + 1)
    del initparam['od_wait']
    del initparam['od_type']
    print("optimal iteration count is ", initparam['iterations'])

    print("fitting model...")
    clf = cat.CatBoostClassifier(**initparam)
    clf.fit(trainpool)

    imp = clf.get_feature_importance(trainpool, fstr_type='FeatureImportance')
    dfimp = pd.DataFrame(imp, columns=['CatBoostImportance'])
    dfimp.insert(0, column='Feature', value=features)
    dfimp = dfimp.sort_values(['CatBoostImportance', 'Feature'], ascending=False)
    xlsxpath = os.path.join(docpath, modelname + ".xlsx")
    dfimp.to_excel(xlsxpath)

    print("pickling model...")
    picklepath = os.path.join(modelpath, modelname)
    with open(picklepath, 'wb') as fout:
        pickle.dump(clf, fout)
    return cvresult, clf, initparam, dfimp
def objetive(trial):
    params.update({
        "boosting_type": trial.suggest_categorical("boosting_type", ['Ordered', 'Plain']),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.1),
        "max_depth": trial.suggest_int("max_depth", 4, 12),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-4, 1e4),
        "border_count": trial.suggest_int('border_count', 1, 255),
        "random_strength": trial.suggest_loguniform("random_strength", 1e-4, 1e4),
        "bagging_temperature": trial.suggest_loguniform("bagging_temperature", 1e-4, 1e4),
    })
    cv_results = catboost.cv(params=params,
                             pool=d_train,
                             iterations=10000,
                             early_stopping_rounds=50,
                             folds=folds,
                             verbose_eval=None,
                             as_pandas=False)
    rmetric_name = list(cv_results.keys())[1]
    score = cv_results[rmetric_name][-1]  # np.min(cv_results[rmetric_name])
    print("Num_boost_round: " + str(len(cv_results[rmetric_name])))
    if save_study_as is not None:
        joblib.dump(study, save_study_as)
    return score
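# A minimal sketch of the optuna driver this objective plugs into; the trial
# count is an illustrative assumption (note the objective is spelled
# `objetive` above, so the reference keeps that name).
import optuna

study = optuna.create_study(direction='minimize')
study.optimize(objetive, n_trials=100)
print(study.best_params)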
def cgb_fit(config, X_train, y_train):
    """Train the model (with cross-validation) and return the best iteration count and best result.

    Args:
        config: catboost model config {params, max_round, cv_folds, early_stop_round, seed, save_model_path}
        X_train: array like, shape = n_sample * n_feature
        y_train: shape = n_sample * 1
    Returns:
        best_model: the trained model with the best score
        best_auc: float, AUC on the test set
        best_round: int, the best number of iterations
    """
    params = config.params
    max_round = config.max_round
    cv_folds = config.cv_folds
    seed = config.seed
    save_model_path = config.save_model_path
    if cv_folds is not None:
        dtrain = cgb.Pool(X_train, label=y_train)
        cv_result = cgb.cv(dtrain, params, num_boost_round=max_round,
                           nfold=cv_folds, seed=seed, logging_level='Verbose')
        # best model, best iteration count
        auc_test_avg = cv_result['AUC_test_avg']
        best_round = np.argmax(auc_test_avg)
        best_auc = np.max(auc_test_avg)  # best AUC value
        best_model = cgb.train(dtrain, params, num_boost_round=best_round)
    else:
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train, y_train, test_size=0.2, random_state=100)
        dtrain = cgb.Pool(X_train, label=y_train)
        dvalid = cgb.Pool(X_valid, label=y_valid)
        # cgb.train takes the pool first, then params (matching the call above)
        best_model = cgb.train(dtrain, params, num_boost_round=max_round,
                               eval_set=dvalid)
        best_round = best_model.best_iteration
        best_auc = best_model.best_score
        cv_result = None
    if save_model_path:
        check_path(save_model_path)
        pickle.dump(best_model, open(save_model_path, 'wb'))
    return best_model, best_auc, best_round, cv_result
def cbfunc(border_count, l2_leaf_reg, depth, learning_rate):
    params = {
        'eval_metric': 'MAE',  # using MAE here, could also be RMSE or MSE
        'early_stopping_rounds': esrounds,
        'num_boost_round': brounds,
        'use_best_model': True,
        'task_type': "GPU"
    }
    # bayes_opt proposes floats, so cast the integer-valued params to int
    params['border_count'] = int(round(border_count))
    params['l2_leaf_reg'] = l2_leaf_reg
    params['depth'] = int(round(depth))
    params['learning_rate'] = learning_rate

    # Cross validation
    cv_results = cb.cv(cb.Pool(xtrain, ytrain, cat_features=cat_features),
                       params=params,
                       fold_count=3,
                       inverted=False,
                       partition_random_seed=5,
                       shuffle=True,
                       logging_level='Silent')
    # bayes_opt MAXIMISES: In order to minimise MAE, I use 1/MAE as target value
    return 1 / cv_results['test-MAE-mean'].min()
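# A minimal sketch of the bayes_opt driver for cbfunc; the bounds and
# iteration counts are illustrative assumptions, not values from the
# original code.
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    f=cbfunc,
    pbounds={
        'border_count': (32, 255),
        'l2_leaf_reg': (1, 10),
        'depth': (4, 10),
        'learning_rate': (0.01, 0.3),
    },
    random_state=5,
)
optimizer.maximize(init_points=5, n_iter=25)  # maximizes 1/MAE
print(optimizer.max)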
def do_cv(learning_rate, depth, l2_leaf_reg):
    param = {
        'iterations': 3000,
        'od_type': 'Iter',
        'od_wait': 50,
        'learning_rate': learning_rate,
        'depth': depth,
        'l2_leaf_reg': l2_leaf_reg,
        'loss_function': 'MAE',
        'eval_metric': 'MAE',
    }
    eval_hist = catboost.cv(param, catboost.Pool(train_x, train_y, cat_inds), 5)
    res_list = [
        eval_hist['MAE_test_avg'][-1],
        eval_hist['MAE_test_stddev'][-1],
        len(eval_hist['MAE_test_avg']),
        param['learning_rate'],
        param['depth'],
        param['l2_leaf_reg'],
    ]
    line = '%.7f,%.7f,%.0f,%.6f,%.0f,%.0f' % tuple(res_list)
    write_to_file(line)
    return res_list
def cv(self, data, clf=None):
    setseed(self.seed)
    if clf:
        train_X, train_y = (data[self.features_name].values,
                            data['validRevenue'].values)
    else:
        train_X, train_y = (data[self.features_name].values,
                            data['totals_transactionRevenue'].values)
    cat_train = cat.Pool(data=train_X,
                         label=train_y,
                         feature_names=self.features_name,
                         cat_features=self.categorical_feature)
    cat_cv_hist = cat.cv(
        pool=cat_train,
        params=self.params,
        # num_boost_round=self.num_boost_round,
        nfold=self.nfold,
        seed=self.seed)
    return cat_cv_hist
def test_verbose_int():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    tmpfile = 'test_data_dumps'

    with LogStdout(open(tmpfile, 'w')):
        cv(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"}, verbose=5)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 2)

    with LogStdout(open(tmpfile, 'w')):
        cv(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"}, verbose=False)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 0)

    with LogStdout(open(tmpfile, 'w')):
        cv(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss"}, verbose=True)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 10)

    log_files = []
    for i in range(3):
        log_files.append(JSON_LOG_PATH[:-5] + str(i) + JSON_LOG_PATH[-5:])

    with LogStdout(open(tmpfile, 'w')):
        train(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss",
                     "json_log": log_files[0]}, verbose=5)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 2)

    with LogStdout(open(tmpfile, 'w')):
        train(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss",
                     "json_log": log_files[1]}, verbose=False)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 0)

    with LogStdout(open(tmpfile, 'w')):
        train(pool, {"iterations": 10, "random_seed": 0, "loss_function": "Logloss",
                     "json_log": log_files[2]}, verbose=True)
    with open(tmpfile, 'r') as output:
        assert(sum(1 for line in output) == 10)

    canonical_files = []
    for log_file in log_files:
        canonical_files.append(local_canonical_file(remove_time_from_json(log_file)))
    return canonical_files
y_pred = model.predict(test_pool)

# Model evaluation
print(f'R2: {r2_score(y_test, y_pred)}')

# Feature importance
print(f'feature_importance = {model.get_feature_importance(train_pool)}')

# Save the model
model.save_model('regression.cbm', format="cbm")

# Calculate the RMSE metric for the objects in the given dataset.
print(f'score: {model.score(X_train, y_train)}')

# cv
args = config.pop('calc_feature_importance', None)
print(config)
config1 = {
    'pool': train_pool,
    'params': config,
    'iterations': 1000,
    'fold_count': 10,
    'partition_random_seed': 120,
    'logging_level': 'Verbose',
    'stratified': False,
    'as_pandas': True
}
scores = cv(**config1)
print(f'CV result is: {scores}')
X_train = train[features].fillna('')
y_train = train['Survived']
X_test = test[features].fillna('')

model = catboost.CatBoostClassifier(one_hot_max_size=4,
                                    iterations=100,
                                    random_seed=0,
                                    verbose=False,
                                    eval_metric='Accuracy')
pool = catboost.Pool(X_train, y_train, cat_features=[0, 2])
print('To see the Catboost plots, fork this kernel and run it in the editing mode.')
cv_scores = catboost.cv(pool, model.get_params(), fold_count=10, plot=True)
print('CV score: {:.5f}'.format(cv_scores['test-Accuracy-mean'].values[-1]))

# You can check yourself the public LB score of this model (0.77990) by submitting the file `submission2.csv` from the section Output of this kernel.

# In[ ]:

model.fit(pool)
pred = model.predict(X_test).astype('int')
output = pd.concat(
    [test['PassengerId'], pd.DataFrame(pred, columns=['Survived'])],
    axis=1)
output.to_csv('submission2.csv', index=False)

# Next I consider the generated feature `Boy`. It takes the value 1 if the title in `Name` is "Master" and the value 0 otherwise.
def test_cv_with_not_binarized_target():
    train_file = data_file('adult_not_binarized', 'train_small')
    cd = data_file('adult_not_binarized', 'train.cd')
    pool = Pool(train_file, column_description=cd)
    cv(pool, {"iterations": 5, "random_seed": 0, "loss_function": "Logloss"})
    return local_canonical_file(remove_time_from_json(JSON_LOG_PATH))
def test_bad_params_in_cv():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    with pytest.warns(UserWarning):
        cv({"iterations": 5, "random_seed": 0, "loss_function": "Logloss",
            "use_best_model": True}, pool)
def test_cv_logging():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    cv(pool, {"iterations": 5, "random_seed": 0, "loss_function": "Logloss"})
    return local_canonical_file(remove_time_from_json(JSON_LOG_PATH))