class CatBoost(BaseModel):
    """Wrapper class of CatBoost; ``self.core`` holds the fitted
    ``CatBoostClassifier``.

    (The previous docstring said "LightGBM" by mistake — this class
    instantiates ``CatBoostClassifier``.)
    """

    @timer
    def __init__(self, config):
        # Only stores the config; the model itself is built lazily in train().
        self.config = config

    @timer
    def train(self, X_train, y_train, X_val=None, y_val=None,
              params=None, num_boost_round=100,
              early_stopping_rounds=None, fold=0):
        """Fit a CatBoostClassifier on (X_train, y_train).

        Args:
            X_train, y_train: training features / labels.
            X_val, y_val: evaluation set forwarded to ``fit(eval_set=...)``.
            params: dict of CatBoost constructor parameters (optional).
            num_boost_round: number of boosting iterations.
            early_stopping_rounds: forwarded to ``fit``.
            fold: kept for interface compatibility with sibling wrappers;
                  not used here.

        Returns:
            self, so calls can be chained.
        """
        # BUG FIX: params defaulted to None and was immediately splatted
        # with **params, which raised TypeError when omitted.
        params = params if params is not None else {}
        self.core = CatBoostClassifier(
            **params,
            num_boost_round=num_boost_round)
        self.core.fit(
            X=X_train, y=y_train,
            eval_set=(X_val, y_val),
            early_stopping_rounds=early_stopping_rounds)
        return self

    @timer
    def predict(self, X_test):
        """Return P(class == 1) for each row of X_test."""
        return self.core.predict_proba(X_test)[:, 1]

    @property
    def feature_importance(self):
        return self.core.get_feature_importance()

    @property
    def best_iteration(self):
        return self.core.get_best_iteration()

    @property
    def evals_result(self):
        return self.core.get_evals_result()
# --- Train/test metrics table and learning-curve export ---

# `pred` here still holds the test-set predictions from the code above.
kappa_test = cohen_kappa_score(y_test, pred)

# Re-predict on the training pool for train-set metrics.
pred = model.predict(data=train_pool, prediction_type='Class')
acc_train = accuracy_score(y_train, pred)
# BUG FIX: this previously called accuracy_score, so the "Kappa" cell of
# the train row silently contained accuracy instead of Cohen's kappa.
kappa_train = cohen_kappa_score(y_train, pred)

d = pd.DataFrame(data={
    'Accuracy': [acc_train, acc_test],
    'Kappa': [kappa_train, kappa_test]
})
df1 = d.rename(index={0: 'train', 1: 'test'})

# Per-iteration eval curves: columns are [learn, validation_0] x
# [Accuracy, MultiClass], concatenated side by side.
pp = model.get_evals_result()
dl = pd.DataFrame(data=pp['learn'])
dv = pd.DataFrame(data=pp['validation_0'])
result = pd.concat([dl, dv], axis=1, sort=False)
result.columns = ['Acc_learn', 'Multi_learn', 'Acc_val', 'Multi_val']

result.to_csv(path_or_buf=args.outfile, index=False)
# NOTE(review): 'ooutfile' looks like a typo, but it must match the name
# declared in the argparse setup elsewhere — confirm before renaming.
df1.to_csv(path_or_buf=args.ooutfile, index=False)
def train_1fold(self, fold, params, params_custom):
    """Train a CatBoost model for one CV fold.

    Fits on the fold's train split, then records onto ``self.*``
    accumulators: per-iteration eval curves (``self.evals_df``),
    out-of-fold / train / test predictions, fold metrics (``self.mets``),
    and feature importances (``self.importance``).

    Args:
        fold: 0-based fold index; also offsets the random seed.
        params: CatBoost constructor parameters (deep-copied, not mutated).
        params_custom: accepted for interface parity; not used in this body.
    """
    X_train, X_valid, y_train, y_valid, X_test, vdx, tdx = self.get_fold_data(fold)
    # CatBoost wants *positional* indices of categorical columns; every
    # non-numeric dtype column is treated as categorical.
    cat_feature_idx = []
    for i, c in enumerate(X_train):
        if not is_numeric_dtype(X_train[c]):
            cat_feature_idx.append(i)
    # Persist the training dtypes once (first fold only) for reproducibility.
    if fold == 0:
        X_train.dtypes.to_csv(self.models_path + "/dtypes.csv")
    logger.info(f"X_train.shape = {X_train.shape}")
    # Offset the seed per fold so folds don't share identical RNG streams.
    params2 = copy.deepcopy(params)
    if params2["random_seed"] is not None:
        params2["random_seed"] = params2["random_seed"] + fold
        logger.info(f"Set catboost train random_seed = {params2['random_seed']}")
    model = CatBoostClassifier(**params2)
    model.fit(
        X_train, y_train,
        cat_features=cat_feature_idx,
        eval_set=(X_valid, y_valid)
    )
    model.save_model(self.models_path + f'/model-catboost-f{fold:02d}.bin')
    util.dump_json(model.get_all_params(), self.models_path + "/params.json")
    # Per-iteration learning curves, columns tagged with the fold id.
    evals = model.get_evals_result()
    evals_df = pd.DataFrame({
        f"logloss_train_f{fold:02d}": evals["learn"]['Logloss'],
        f"accuracy_train_f{fold:02d}": evals["learn"]['Accuracy'],
        f"logloss_valid_f{fold:02d}": evals['validation']['Logloss'],
        f"accuracy_valid_f{fold:02d}": evals['validation']['Accuracy']
    })
    self.evals_df.append(evals_df)
    # Out-of-fold predictions: probability of the positive class, written
    # back at this fold's validation indices.
    preds_valid = model.predict_proba(X_valid)[:, 1]
    logger.info(f"len(vdx)={len(vdx)} len(preds_valid)={len(preds_valid)}")
    self.preds_valid_all.loc[vdx, "pred"] = preds_valid
    preds_train = model.predict_proba(X_train)[:, 1]
    self.preds_train_all.append(pd.DataFrame({fold: preds_train}, index=tdx))
    preds_test = model.predict_proba(X_test)[:, 1]
    self.preds_test_all.append(preds_test)
    # Accuracy is computed at a 0.5 threshold via np.round.
    acc_valid = accuracy_score(y_valid, np.round(preds_valid))
    acc_train = accuracy_score(y_train, np.round(preds_train))
    logloss_valid = log_loss(y_valid, preds_valid)
    logloss_train = log_loss(y_train, preds_train)
    ms = [fold, acc_train, acc_valid, logloss_train, logloss_valid, model.get_best_iteration()]
    self.mets.append(ms)
    show_mets(*ms)
    # Feature importances: one pd.Series per importance type, named by fold.
    for it in ["FeatureImportance"]:
        imp = pd.Series(model.get_feature_importance(type=it), index=X_train.columns)
        imp.name = fold
        imp.index.name = "feature"
        self.importance[it].append(imp)
use_best_model=True, verbose = True)  # NOTE(review): tail of a model.fit(...) call whose opening lies above this chunk
# Wall-clock training time; start_time was set before the fit() above.
elapsed_time_training = time.time() - start_time

# Predicting
print('Predicting...')
start_time = time.time()
y_pred = model.predict(dataset.X_test, prediction_type='Class')
elapsed_time_testing = time.time() - start_time

# Analytics
print('Analyzing...')
title = "CatBoost (weights smote)"
# Validation learning curves keyed by metric name; np.absolute is applied,
# presumably to flip negative loss values for plotting — TODO confirm.
eval_results = {
    'MultiClass': np.absolute(model.get_evals_result()['validation_0']['MultiClass']),
    'Accuracy': np.absolute(model.get_evals_result()['validation_0']['Accuracy']),
    #'F1': np.absolute(model.get_evals_result()['validation_0']['TotalF1']),
    #'gmean': model.get_evals_result()['validation_0']['GeometricMean']
}
# Hard-coded local output directory for all evaluation artifacts.
save_path = "C:/Users/thoma/source/repos/PythonMachineLearning/PythonMachineLearning/Library/Results"
evaluator = Evaluator(title, save_path)
# Persist run info, parameters, advanced metrics and metric curves.
evaluator.append_to_file(f'Best iteration: {model.get_best_iteration()}', "info.txt")
evaluator.append_to_file(f'Training time (seconds): {elapsed_time_training}', "info.txt")
evaluator.append_to_file(f'Testing time (seconds): {elapsed_time_testing}', "info.txt")
evaluator.save_dict_to_file(dataset_parameters, "dataset_parameters.csv")
evaluator.save_dict_to_file(model_parameters, "model_parameters.csv")
evaluator.save_advanced_metrics(dataset.y_test, y_pred, dataset.class_labels, dataset.class_descriptions)
evaluator.save_eval_scores_to_file(eval_results, "metric_results.csv")
evaluator.create_evaluation_metric_results(eval_results, xlabel='number of trees', ylabel='metric score')