def custom_metric():
    from catboost import CatBoostClassifier, Pool

    train_data = [[0, 3], [4, 1], [8, 1], [9, 1]]
    train_labels = [0, 0, 1, 1]
    eval_data = [[2, 1], [3, 1], [9, 0], [5, 3]]
    eval_labels = [0, 1, 1, 0]
    eval_dataset = Pool(eval_data, eval_labels)

    model = CatBoostClassifier(learning_rate=0.03,
                               custom_metric=['Logloss',
                                              'AUC:hints=skip_train~false'])
    model.fit(train_data, train_labels,
              eval_set=eval_dataset,
              verbose=False)
    print(model.get_best_score())
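# A note on the snippet above: get_best_score() returns a nested dict keyed first
# by dataset ('learn' / 'validation'), then by metric name. The 'hints=skip_train~false'
# suffix asks CatBoost to also evaluate AUC on the training set, which it skips by
# default. A sketch of reading the result (key structure as documented, values omitted):
#
#   scores = model.get_best_score()
#   val_auc = scores['validation']['AUC']
#   train_auc = scores['learn']['AUC']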
def catboost_hyperparams(**dict_):
    # prior_params, pds_dtypes, train_set and val_set come from the enclosing scope
    params = param_adjust_dtypes(prior_params, pds_dtypes, dict_)

    # Model definition
    model = CatBoostClassifier(**params)

    # Fitting
    model.fit(train_set, eval_set=val_set, use_best_model=True)

    # best validation AUC is a single float, so return it directly
    return model.get_best_score()["validation"]["AUC"]
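# The **dict_ signature above is the shape expected by the bayes_opt package, which
# is one plausible driver for this objective (an assumption; the snippet does not
# show it). A minimal sketch, with illustrative parameter bounds:
from bayes_opt import BayesianOptimization

pbounds = {'depth': (4, 10), 'learning_rate': (0.01, 0.3)}
optimizer = BayesianOptimization(f=catboost_hyperparams, pbounds=pbounds,
                                 random_state=42)
optimizer.maximize(init_points=2, n_iter=10)
print(optimizer.max)  # best AUC found and the parameters that produced it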
def hyperopt_ctb_scoreCV_manual(params):
    global X_hyper
    global y_hyper
    global global_best_model

    for key in catboost_space:
        print(f' {key} {params[key]}')

    skf = StratifiedKFold(n_splits=3)
    cross_val_result = {'estimator': [], 'test_score': []}
    for i, (train_ind, val_ind) in enumerate(skf.split(X_hyper, y_hyper)):
        # skf yields positional indexes, so use .iloc rather than .loc
        train_set = Pool(data=X_hyper.iloc[train_ind],
                         label=y_hyper[train_ind],
                         cat_features=list(X_hyper.columns))
        val_set = Pool(data=X_hyper.iloc[val_ind],
                       label=y_hyper[val_ind],
                       cat_features=list(X_hyper.columns))
        # fit a fresh estimator per fold so each stored model keeps its own fold's fit
        clf = CatBoostClassifier(**params)
        clf.fit(X=train_set, eval_set=val_set, use_best_model=True)
        cross_val_result['estimator'].append(clf)
        cross_val_result['test_score'].append(
            clf.get_best_score()['validation']['AUC'])

    current_score = np.mean(cross_val_result['test_score'])
    if current_score > global_best_model['AUC']:
        global_best_model['AUC'] = current_score
        global_best_model['model'] = cross_val_result['estimator']
        print(f'new best AUC = {current_score}')

    result = {
        'loss': -current_score,
        'status': STATUS_OK,
        # -- store other results like this
        'eval_time': time.time(),
        'other_stuff': {'type': None, 'value': [0, 1, 2]},
        # -- attachments are handled differently
        'attachments': {'attachments': 'attachments'}
    }
    return result
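# A minimal sketch of driving the objective above with hyperopt's fmin; the search
# space below is illustrative, standing in for the catboost_space the snippet assumes:
from hyperopt import fmin, tpe, hp, Trials

catboost_space = {
    'depth': hp.choice('depth', [4, 6, 8]),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
}
trials = Trials()
best = fmin(fn=hyperopt_ctb_scoreCV_manual, space=catboost_space,
            algo=tpe.suggest, max_evals=50, trials=trials)
print(best)  # hyperopt minimizes 'loss', i.e. maximizes the mean CV AUC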
def fit_catboost(self, X, y, X_val, y_val):
    logging.info('- Fit catboost model')
    model = CatBoostClassifier(iterations=10000, eval_metric='AUC')
    model.fit(X, y,
              eval_set=(X_val, y_val),
              early_stopping_rounds=50,
              verbose=100)
    best_score = model.get_best_score()['validation']['AUC']
    best_iteration = model.get_best_iteration()
    self.models.append({
        'model': model,
        'best_score': best_score,
        'best_iteration': best_iteration
    })
    logging.info('Best score = {:.2%}, in {} iterations'.format(
        best_score, best_iteration))
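# With early_stopping_rounds set, training may run past the best iteration before
# stopping; get_best_iteration() reports where the eval metric peaked. A sketch
# (assumed usage, not part of the snippet above) of trimming a fitted model to
# that point before saving:
#
#   model.shrink(ntree_end=model.get_best_iteration() + 1)
#   model.save_model('catboost_best.cbm')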
def fun_catboost(X, y, X_train, X_validation, y_train, y_validation, target):
    # Build a model on the training set and check performance on the validation set.
    # All columns are treated as categorical here.
    categorical_features_indices = list(range(len(X_train.columns)))

    from catboost import Pool, CatBoostClassifier
    from catboost.utils import get_confusion_matrix

    model = CatBoostClassifier(iterations=5,
                               depth=3,
                               learning_rate=0.1,
                               loss_function='MultiClass',
                               eval_metric='Accuracy')
    model.fit(X_train, y_train,
              eval_set=(X_validation, y_validation),
              plot=True)

    # df_test comes from the enclosing scope
    predictions = model.predict(df_test)
    # feature importances (computed here but not used further)
    model.get_feature_importance(type="FeatureImportance")

    cm = get_confusion_matrix(model, Pool(X_validation, y_validation))
    print(cm)
    print(model.get_best_score())

    submission = pd.DataFrame()
    submission['ID'] = df_test['ID']
    submission[target] = predictions
    return categorical_features_indices, model, submission, predictions
eval_pool = Pool(X_test, y_test)

# load model
model = CatBoostClassifier()
model.load_model('models/catboost_model_4.dump')

# Feature importance: know which features contributed the most
feature_importances = model.get_feature_importance(train_pool)
feature_names = pd.DataFrame(X_train).columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

print('\n\n\n')
print(model.get_best_score())
print(model.get_params())

# Validation prediction: predict() returns class labels;
# use predict_proba() if probabilities are needed
predictions = model.predict(eval_pool)
pd.DataFrame(predictions).to_csv('validation-scores/val-scores-3.csv')

# TEST VALUES
# preped_test_values = np.array(pd.read_csv('preped/preped_test_&_featured.csv'))
# eval_dataset = Pool(test_values)
# test_prediction = model.predict(preped_test_values)
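# A self-contained save/load roundtrip in CatBoost's native .cbm format, for
# reference next to the load_model call above (data and path are illustrative):
from catboost import CatBoostClassifier

clf = CatBoostClassifier(iterations=10, verbose=False)
clf.fit([[0, 1], [1, 0], [2, 1], [3, 0]], [0, 0, 1, 1])
clf.save_model('tmp_model.cbm')

restored = CatBoostClassifier()
restored.load_model('tmp_model.cbm')
assert (restored.predict([[2, 1]]) == clf.predict([[2, 1]])).all()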
def search_CatBoost_parameters(config: dict,
                               train_dataset: MusicDataset,
                               val_dataset: MusicDataset = None,
                               internal_cv=False):
    """
    Fit a CatBoostClassifier using train and validation sets.

    Returns:
        - a list of the names of the parameters
        - a list of tried parameter configurations
        - a list of corresponding results
    """
    # Get parameters
    if isinstance(config[_n_iterations_key], list):
        iterations = np.arange(config[_n_iterations_key][0],
                               config[_n_iterations_key][1],
                               config[_n_iterations_key][2])
    else:
        iterations = config[_n_iterations_key]
    if isinstance(config[_learning_rate_key], list):
        learning_rates = np.arange(config[_learning_rate_key][0],
                                   config[_learning_rate_key][1],
                                   config[_learning_rate_key][2])
    else:
        learning_rates = config[_learning_rate_key]
    loss_function = config.get("loss_function", "CrossEntropy")

    parameter_names = []
    parameter_sets = []
    results = []

    # Get data
    _, X_train, y_train = train_dataset.get_whole_dataset_as_pd()
    if val_dataset is not None:
        _, X_val, y_val = val_dataset.get_whole_dataset_as_pd()

    # GPU
    if torch.cuda.is_available():
        task_type = 'GPU'
        devices = str(torch.cuda.current_device())
    else:
        task_type = 'CPU'
        devices = None

    if not internal_cv:
        # No internal cross validation during training;
        # a val_dataset is required on this path
        for i_it, it in enumerate(iterations):
            for i_lr, lr in enumerate(learning_rates):
                model = CatBoostClassifier(iterations=it,
                                           learning_rate=lr,
                                           loss_function=loss_function,
                                           task_type=task_type,
                                           devices=devices,
                                           custom_metric=['Accuracy'])
                model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=10)
                params = model.get_params()
                parameter_names = list(params.keys())
                parameter_sets.append(list(params.values()))
                best_score = model.get_best_score()
                results.append(best_score['validation']['Accuracy'])
                best_iter = model.get_best_iteration()
                print("Best iteration: " + str(best_iter))
    else:
        # Use the catboost cross validation procedure
        params = {}
        params['loss_function'] = loss_function
        params['iterations'] = iterations
        params['custom_metric'] = 'Accuracy'
        params['task_type'] = task_type
        params['devices'] = devices
        best_value = 0.0
        best_iter = 0
        for i_lr, lr in enumerate(learning_rates):
            params['learning_rate'] = lr
            cv_data = cv(params=params,
                         pool=Pool(X_train, label=y_train),
                         fold_count=5,
                         shuffle=True,
                         partition_random_seed=0,
                         plot=True,
                         stratified=False,
                         verbose=50)
            res_value = np.max(cv_data['test-Accuracy-mean'])
            res_iter = np.argmax(cv_data['test-Accuracy-mean'])
            params['best_iteration'] = res_iter
            print(f"Best iteration for lr {lr}: {res_iter} "
                  f"with val accuracy {res_value}")
            results.append(res_value)
            parameter_sets.append(list(params.values()))
            parameter_names = list(params.keys())
            # Remove the entry again since params is reused as input for cv
            params.pop('best_iteration')

    return parameter_names, parameter_sets, results
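# For reference: catboost.cv returns a pandas DataFrame with one row per iteration
# and columns of the form 'test-<Metric>-mean' / 'test-<Metric>-std' (plus the
# train-side equivalents), which is why the code above can read
# np.max(cv_data['test-Accuracy-mean']) to find the best fold-averaged accuracy.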
train_pool = Pool(X_train, y_train.astype(int),
                  cat_features=categorical_features_step1)
validate_pool = Pool(X_val, y_val.astype(int),
                     cat_features=categorical_features_step1)

model1.fit(
    train_pool,
    eval_set=validate_pool,
    # logging_level='Verbose',  # you can uncomment this for text output
    plot=True)

model1.get_best_score()

# Plot non-normalized confusion matrix
np.set_printoptions(precision=2)
predict_val = model1.predict(X_val)
plot_confusion_matrix(y_val, predict_val,
                      classes=np.array([True, False]),
                      title='Confusion matrix',
                      normalize=False)

# # Beta calibration
splits = folds.split(train, train['label'])
score_csv = []

for i, (tr_idx, val_idx) in enumerate(splits):
    X_tr, X_vl = train[features].iloc[tr_idx], train[features].iloc[val_idx]
    y_tr, y_vl = train['label'].iloc[tr_idx], train['label'].iloc[val_idx]

    clf = CatBoostClassifier(**cat_params)
    clf.fit(X_tr, y_tr,
            cat_features=categorical_features,
            eval_set=(X_vl, y_vl),
            verbose=100,
            plot=True)

    y_pred_valid = clf.predict_proba(X_vl)[:, 1]
    cat_oof_train[val_idx] = y_pred_valid.reshape(-1, 1)
    cat_oof_test += clf.predict_proba(test[features])[:, 1].reshape(-1, 1) / folds.n_splits
    score_csv.append(clf.get_best_score()['validation']['F1'])

    del X_tr, X_vl, y_tr, y_vl

    # Feature importances
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.get_feature_importance()
    fold_importance_df["fold"] = i + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df],
                                      axis=0)
    gc.collect()

score = np.mean(score_csv)
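# Sketch of scoring the out-of-fold predictions collected above; it depends on the
# variables from that loop, and the 0.5 threshold is an assumption:
from sklearn.metrics import f1_score

oof_f1 = f1_score(train['label'], (cat_oof_train.ravel() > 0.5).astype(int))
print(f'OOF F1: {oof_f1:.4f}')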
model = CatBoostClassifier(**model_params)
# model = CatBoostClassifier(**model_params, verbose=False)
model.fit(train_data, eval_set=test_data, verbose=False, plot=False)
model.save_model(f"{project_dir}/model/model.cbm")

builtin_metrics = model.eval_metrics(train_data,
                                     metrics=['Logloss', 'AUC', 'F1', 'PRAUC'])

# write results
hold_out_score = model.get_best_score()
# write_eval_summary_file(cv_scores, hold_out_score)

predict_probas = model.predict_proba(test_data)
test_data_metrics = calculate_metrics(build_spec['standard_metrics'],
                                      build_spec['custom_metrics'],
                                      y_test,
                                      predict_probas[:, 1])
write_eval_summary_file(cv_scores, test_data_metrics)

predict_probas = pd.DataFrame(predict_probas, columns=['False', 'True'])
predict_probas.to_csv(f"{project_dir}/model/test_data_prediction.csv")

# the original call is truncated here; type='ShapValues' is the likely intent,
# given the variable name
shap_values = model.get_feature_importance(data=test_data,
                                           type='ShapValues')
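# For reference: with type='ShapValues', get_feature_importance returns, for binary
# classification, an array of shape (n_objects, n_features + 1), where the last
# column is the expected value (model bias) and the remaining columns are per-feature
# SHAP contributions for each row of the supplied data.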
def gridsearch_early_stopping(cv, X, y, folds, grid, cat_features=None, save=None):
    '''
    Perform grid search with early stopping across folds specified by index.

    Parameters
    ----------
    cv: cross validation
    X: DataFrame or numpy array
    y: DataFrame or numpy array
    folds: list of fold indexes
    grid: parameter grid
    cat_features: list of categorical feature indexes (default=None)
    save: string, excluding file extension (default=None)
        saves results_df for each fold to the folder '../../models'
    '''
    if np.unique(y).size <= 2:
        loss_function = 'Logloss'
    else:
        loss_function = 'MultiClass'

    # generate data folds
    train_X, train_y, test_X, test_y = generate_folds(cv, X, y)

    # iterate through the specified folds
    for fold in folds:
        # assign train and test pools
        test_pool = Pool(data=test_X[fold], label=test_y[fold],
                         cat_features=cat_features)
        train_pool = Pool(data=train_X[fold], label=train_y[fold],
                          cat_features=cat_features)

        # create the results dataframe
        columns = ['params' + str(fold),
                   loss_function + str(fold),
                   'Accuracy' + str(fold),
                   'iteration' + str(fold)]
        results_df = pd.DataFrame(columns=columns)
        best_score = np.inf

        # iterate through the parameter grid
        for params in ParameterGrid(grid):
            # create a catboost classifier with these parameters
            model = CatBoostClassifier(
                cat_features=cat_features,
                early_stopping_rounds=50,
                task_type='GPU',
                custom_loss=['Accuracy'],
                iterations=3000,
                # class_weights=weights,
                **params)

            # fit model
            model.fit(train_pool, eval_set=test_pool, verbose=400)
            print(model.get_best_score()['validation'])

            # append results (DataFrame.append is deprecated, so use pd.concat)
            row = pd.DataFrame([[params,
                                 model.get_best_score()['validation'][loss_function],
                                 model.get_best_score()['validation']['Accuracy'],
                                 model.get_best_iteration()]],
                               columns=columns)
            results_df = pd.concat([results_df, row], ignore_index=True)

            # keep the best score and parameters
            if model.get_best_score()['validation'][loss_function] < best_score:
                best_score = model.get_best_score()['validation'][loss_function]
                best_grid = params

        print("Best logloss: ", best_score)
        print("Grid:", best_grid)
        if save is not None:
            save_file(results_df, save + str(fold) + '.joblib',
                      dirName='../../models')
        display(results_df)
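# Illustrative call of the grid search above; X, y, generate_folds and save_file
# come from the surrounding project, and the grid values are assumptions:
from sklearn.model_selection import StratifiedKFold

grid = {'depth': [4, 6, 8], 'learning_rate': [0.03, 0.1]}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
gridsearch_early_stopping(cv, X, y, folds=[0, 1, 2, 3, 4], grid=grid,
                          cat_features=None, save='catboost_grid_fold')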