def CatBoost_gridsearch(x_train, y_train, categorical_indexes):
    # cat_params is expected to be a module-level parameter grid (dict of lists)
    cat = CatBoostClassifier(cat_features=categorical_indexes)
    tune = cat.grid_search(cat_params,
                           cv=5,
                           stratified=True,
                           shuffle=True,
                           search_by_train_test_split=True,
                           X=x_train,
                           y=y_train,
                           plot=True)
    return tune
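# CatBoost_gridsearch reads a module-level `cat_params` grid that is not shown above.
# A minimal sketch of how it might be defined and called; the grid values and the
# commented-out data names are illustrative assumptions, not taken from the original code.
from catboost import CatBoostClassifier

cat_params = {
    'iterations': [200, 500],
    'depth': [4, 6, 8],
    'learning_rate': [0.03, 0.1],
    'l2_leaf_reg': [3, 5],
}

# tune = CatBoost_gridsearch(x_train, y_train, categorical_indexes=[0, 3, 5, 6])
# tune['params']      -> best hyperparameter combination found by the search
# tune['cv_results']  -> per-iteration CV metrics for that combination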
def grid_search_catboost(pool):
    """Grid search helper function for the CatBoost classifier."""
    params = {
        "verbose": False,
        "eval_metric": "AUC",
        "loss_function": "Logloss",
    }
    grid = {
        "grow_policy": ["SymmetricTree", "Depthwise", "Lossguide"],
        "iterations": [1000, 2000],
        "depth": [3, 4, 5, 6],
        "min_data_in_leaf": [3, 5, 7, 10],
    }
    model = CatBoostClassifier(**params)
    search_results = model.grid_search(grid, X=pool, verbose=False)
    # Merge the tuned parameters with the fixed ones so the result can be
    # passed straight to CatBoostClassifier(**...)
    return {**search_results["params"], **params}
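# A minimal usage sketch for grid_search_catboost, assuming a pandas DataFrame with a
# binary label and one categorical column; the data and column names are illustrative.
import pandas as pd
from catboost import Pool, CatBoostClassifier

df = pd.DataFrame({
    'color': ['red', 'blue', 'red', 'green'] * 25,
    'size':  [1.0, 2.5, 3.2, 0.7] * 25,
    'label': [0, 1, 0, 1] * 25,
})
pool = Pool(df[['color', 'size']], label=df['label'], cat_features=['color'])

# The grid above spans 96 parameter combinations, so the search is slow even on small data:
# best_params = grid_search_catboost(pool)
# final_model = CatBoostClassifier(**best_params).fit(pool)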
# 'l2_leaf_reg': np.logspace(-20, -19, 3),
# 'leaf_estimation_iterations': [20],
# 'eval_metric': ['Accuracy'],
# 'use_best_model': ['True'],
# 'logging_level': ['Silent'],
# 'random_seed': [0]
# }

categorical_indexes = [0, 3, 5, 6]

# Baseline model with default hyperparameters
cat = CatBoostClassifier(cat_features=categorical_indexes).fit(x_train, y_train)
cross_val_score(cat, x_val, y_val).mean()

cat_grid = {'iterations': [150, 300, 500],
            'depth': [3, 5, 7],
            'random_seed': [0],
            'learning_rate': [0.005, 0.01, 0.1, 0.2],
            'l2_leaf_reg': [3, 5, 7, 9],
            'leaf_estimation_iterations': [10, 30, 50]}

cat = CatBoostClassifier(cat_features=categorical_indexes)
# Note: the search result (a dict with 'params' and 'cv_results') overwrites the grid dict
cat_grid = cat.grid_search(cat_grid,
                           cv=5,
                           stratified=True,
                           shuffle=True,
                           search_by_train_test_split=True,
                           X=x_train,
                           y=y_train,
                           plot=True)
# Check the test Logloss and its std to decide the number of iterations
cat_grid['cv_results'].keys()

cat_fit = CatBoostClassifier(cat_features=categorical_indexes,
                             leaf_estimation_iterations=50,
                             depth=5,
                             random_seed=0,
                             l2_leaf_reg=7,
                             iterations=300,
                             learning_rate=0.2)
cat_fit.fit(x_train, y_train)

cat_dict = {}
cat_dict['Catboost'] = {'time': str(datetime.datetime.now()),
                        'name': 'Catboosting',
                        'best_param': cat_grid['params'],
                        'cross_val_score_mean': cross_val_score(cat_fit, x_val, y_val).mean()}
# kfold testing
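# A sketch of the "check the test Logloss and std to pick iterations" step above,
# working from the grid_search result stored in cat_grid. The column names follow
# CatBoost's cv_results naming convention for the Logloss metric; treat this as an
# illustrative assumption rather than the original author's code.
import pandas as pd

cv_df = pd.DataFrame(cat_grid['cv_results'])
# Iteration with the lowest mean test Logloss, and its spread across folds
best_row = cv_df.loc[cv_df['test-Logloss-mean'].idxmin()]
print(best_row['iterations'],
      best_row['test-Logloss-mean'],
      best_row['test-Logloss-std'])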
def gradient_boosting_classifier(data, exluded_num_cols, excluded_cat_cols, target,
                                 sub_category=None, tune_parameters=False,
                                 display_results=True):
    """
    Create a CatBoost classifier for the given input and display the results.

    :param data: base dataframe
    :param exluded_num_cols: numerical columns to include in the analysis
    :param excluded_cat_cols: categorical columns to use in the analysis
    :param target: categorical column to classify on
    :param sub_category: optional target category; when set, the task is treated as binary
    :param tune_parameters: if True, run a cross-validated grid search instead of the fixed configuration
    :param display_results: if True, print the score and plot SHAP, ROC, or confusion-matrix diagnostics
    :return: the fitted model and its accuracy score
    """
    if not tune_parameters:
        # Binary classification case when a binary column is picked or a subcategory is set
        if len(data[target].dropna().unique()) == 2 or sub_category:
            # Train the model in standard configuration
            train_pool, test_pool, x_train, y_train, y_test = create_boost_training_data(
                data, exluded_num_cols, excluded_cat_cols, target, target_cat=sub_category)
            model = CatBoostClassifier(iterations=40, learning_rate=1, depth=8,
                                       loss_function="Logloss",
                                       custom_metric=["Logloss", "AUC"])
        else:
            # Train the model in standard configuration
            train_pool, test_pool, x_train, y_train, y_test = create_boost_training_data(
                data, exluded_num_cols, excluded_cat_cols, target)
            model = CatBoostClassifier(
                iterations=40, learning_rate=1, depth=8,
                loss_function="MultiClassOneVsAll",
                custom_metric=["MultiClassOneVsAll", "AUC"])
        model.fit(train_pool)
    else:
        # Perform cross validated hyper parameter tuning on the training set
        train_pool, test_pool, x_train, y_train, y_test = create_boost_training_data(
            data, exluded_num_cols, excluded_cat_cols, target, target_cat=sub_category)
        # Binary classification case when a binary column is picked or a subcategory is set
        if len(data[target].dropna().unique()) == 2 or sub_category:
            model = CatBoostClassifier(loss_function="Logloss",
                                       custom_metric=["Logloss", "AUC"])
        else:
            model = CatBoostClassifier(
                loss_function="MultiClassOneVsAll",
                custom_metric=["MultiClassOneVsAll", "AUC"])
        grid = {
            "iterations": [40, 60, 100],
            "learning_rate": [0.01, 0.1, 1],
            "depth": [4, 6, 10],
            "l2_leaf_reg": [3, 5, 7]
        }
        # grid_search refits the model on the best parameters by default
        grid_result = model.grid_search(grid, X=train_pool, plot=False, verbose=True)

    pred = model.predict(test_pool)
    if len(data[target].dropna().unique()) == 2:
        score = accuracy_score(y_test, pred == "True")
    else:
        # TODO implement multiclass scoring
        score = accuracy_score(y_test, np.squeeze(pred))

    if display_results:
        print(score)
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(x_train)
        # print(model.get_best_score())
        shap.summary_plot(shap_values, x_train)
        # print(model.eval_metrics(test_pool, metrics=["AUC"]))
        # If binary, plot the ROC curve
        if len(data[target].dropna().unique()) == 2 or sub_category:
            get_roc_curve(model, test_pool, plot=True)
        # otherwise plot the confusion matrix
        else:
            confusion_matrix = get_confusion_matrix(model, test_pool)
            plot_confusion_matrix(confusion_matrix)

    return model, score
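# gradient_boosting_classifier relies on a create_boost_training_data helper that is not
# shown here. Below is a minimal sketch of what such a helper might do: build train/test
# CatBoost Pools and, when target_cat is set, binarise the target into "True"/"False"
# string labels (which is what the `pred == "True"` check above expects). The original
# code is ambiguous about whether the *_cols arguments are kept or dropped; this sketch
# drops them, following the parameter names. Signature and behaviour are assumptions.
from catboost import Pool
from sklearn.model_selection import train_test_split

def create_boost_training_data(data, exluded_num_cols, excluded_cat_cols, target, target_cat=None):
    df = data.drop(columns=exluded_num_cols + excluded_cat_cols).dropna(subset=[target])
    y = df[target].astype(str)
    if target_cat is not None:
        y = (y == str(target_cat)).astype(str)   # yields "True"/"False" string labels
    x = df.drop(columns=[target])
    cat_cols = x.select_dtypes(include='object').columns.tolist()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    train_pool = Pool(x_train, label=y_train, cat_features=cat_cols)
    test_pool = Pool(x_test, label=y_test, cat_features=cat_cols)
    return train_pool, test_pool, x_train, y_train, y_test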
# catboost_lon_level1.grid_search(param_grid=catboost_lon_level1_params, X=X, y=y_lon)

# %% Latitude optimization
grid_search_results = catboost_lat_level1.grid_search(
    param_grid, X=X, y=y_lat, shuffle=False, verbose=3
)
catboost_lat_level1_params = grid_search_results["params"]
print(catboost_lat_level1_params)
# catboost_lat_level1_params = {'depth': 4, 'l2_leaf_reg': 1}
# catboost_lat_level1.grid_search(param_grid=catboost_lat_level1_params, X=X, y=y_lat)

# %% Buildings optimization
grid_search_results = catboost_building_level1.grid_search(
    param_grid, X=X, y=y_building, shuffle=False, verbose=3
)
catboost_building_level1_params = grid_search_results["params"]
print(catboost_building_level1_params)
# catboost_building_level1_params = {'depth': 10, 'l2_leaf_reg': 1}
# catboost_building_level1.grid_search(
#     param_grid=catboost_building_level1_params, X=X, y=y_building
# )

# %% Floors optimization
grid_search_results = catboost_floor_level1.grid_search(
    param_grid, X=X, y=y_floor, shuffle=False, verbose=3
)
catboost_floor_level1_params = grid_search_results["params"]
print(catboost_floor_level1_params)
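# The cells above assume a shared param_grid and one level-1 CatBoost model per target.
# A minimal sketch of that setup; the model types and grid values are assumptions
# (latitude/longitude treated as regression targets, building/floor as classes), chosen
# so that the grid covers the commented best-parameter dicts shown above.
from catboost import CatBoostRegressor, CatBoostClassifier

param_grid = {'depth': [4, 6, 8, 10], 'l2_leaf_reg': [1, 3, 5]}

catboost_lon_level1 = CatBoostRegressor(loss_function='RMSE', verbose=0)
catboost_lat_level1 = CatBoostRegressor(loss_function='RMSE', verbose=0)
catboost_building_level1 = CatBoostClassifier(loss_function='MultiClass', verbose=0)
catboost_floor_level1 = CatBoostClassifier(loss_function='MultiClass', verbose=0)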
from catboost import CatBoostClassifier
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.svm import SVC


def cvgrid_search(X, y, X_test, model_type):
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, stratify=y, shuffle=True, test_size=0.2, random_state=42)

    if model_type == 'cat':
        model = CatBoostClassifier(**PARAMS['cat_def'])
        grid_search_result = model.grid_search(PARAMS['cat_cv'], X=X_train, y=y_train,
                                               plot=False, cv=5, stratified=True, verbose=0)
        best_model = CatBoostClassifier(**PARAMS['cat_def'], **grid_search_result['params'])
        best_model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=0)
        get_preds = lambda model, x: model.predict(x, prediction_type='Probability')[:, 1]

    if model_type == 'rf':
        rf = RandomForestClassifier()
        rf_random = RandomizedSearchCV(estimator=rf, param_distributions=PARAMS['rf_cv'],
                                       n_iter=50, cv=5, verbose=0, random_state=42, n_jobs=-1)
        # Tune hyperparams
        rf_random.fit(X_train, y_train)
        best_model = rf_random.best_estimator_
        # Train set only
        best_model.fit(X_train, y_train)
        get_preds = lambda model, x: model.predict_proba(x)[:, 1]

    if model_type == 'lr':
        lr = linear_model.LogisticRegression()
        lr_random = RandomizedSearchCV(estimator=lr, param_distributions=PARAMS['lr_cv'],
                                       n_iter=50, cv=5, verbose=0, random_state=42, n_jobs=-1)
        # Tune hyperparams
        lr_random.fit(X_train, y_train)
        best_model = lr_random.best_estimator_
        # Train set only
        best_model.fit(X_train, y_train)
        get_preds = lambda model, x: model.predict_proba(x)[:, 1]

    if model_type == 'svm':
        svc = SVC()
        svc_random = RandomizedSearchCV(estimator=svc, param_distributions=PARAMS['svm_cv'],
                                        n_iter=50, cv=5, verbose=0, random_state=42, n_jobs=-1)
        # Tune hyperparams
        svc_random.fit(X_train, y_train)
        best_model = svc_random.best_estimator_
        # Train set only
        best_model.fit(X_train, y_train)
        get_preds = lambda model, x: model.decision_function(x)

    y_pred_train = get_preds(best_model, X_train)
    y_pred_val = get_preds(best_model, X_val)
    train_auc = roc_auc_score(y_train, y_pred_train)
    val_auc = roc_auc_score(y_val, y_pred_val)
    train_ap = average_precision_score(y_train, y_pred_train)
    val_ap = average_precision_score(y_val, y_pred_val)

    # Final fit on the full training data
    best_model.fit(X, y)

    return ((get_preds(best_model, X), get_preds(best_model, X_test)),
            (train_auc, val_auc),
            (train_ap, val_ap))
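# cvgrid_search reads its fixed parameters and search spaces from a module-level PARAMS
# dict that is not shown above. A minimal sketch of the entries the function expects; the
# keys follow the code above, while the values themselves are illustrative assumptions.
PARAMS = {
    'cat_def': {'loss_function': 'Logloss', 'eval_metric': 'AUC', 'random_seed': 42},
    'cat_cv': {'depth': [4, 6, 8], 'learning_rate': [0.03, 0.1], 'l2_leaf_reg': [1, 3, 5]},
    'rf_cv':  {'n_estimators': [100, 300, 500], 'max_depth': [None, 5, 10]},
    'lr_cv':  {'C': [0.01, 0.1, 1.0, 10.0]},
    'svm_cv': {'C': [0.1, 1.0, 10.0], 'gamma': ['scale', 'auto']},
}

# (train_probs, test_probs), (train_auc, val_auc), (train_ap, val_ap) = \
#     cvgrid_search(X, y, X_test, model_type='cat')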