def test_clone():
    estimator = CatBoostClassifier(
        custom_metric="Accuracy",
        loss_function="MultiClass",
        iterations=400)

    # This is important for sklearn.base.clone since
    # it uses get_params for cloning estimator.
    params = estimator.get_params()
    new_estimator = CatBoostClassifier(**params)
    new_params = new_estimator.get_params()

    for param in params:
        assert param in new_params
        assert new_params[param] == params[param]
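# A minimal sketch (assuming a scikit-learn-compatible CatBoost build) of why the
# get_params round trip above matters: sklearn.base.clone rebuilds an unfitted copy of
# an estimator purely from get_params(), so every constructor argument has to survive it.
from sklearn.base import clone
from catboost import CatBoostClassifier

original = CatBoostClassifier(custom_metric="Accuracy",
                              loss_function="MultiClass",
                              iterations=400)
cloned = clone(original)  # internally calls original.get_params() and re-instantiates
assert cloned.get_params() == original.get_params()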
def hyperopt_objective(params):
    model = CatBoostClassifier(
        n_estimators=params["n_estimators"],
        # use_best_model=True, od_type='Iter', od_wait=20,
        verbose=2,
        eval_metric='AUC',
        od_pval=0.000001,
        # leaf_estimation_method=params['leaf_estimation_method'],
        depth=params['depth'],
        border_count=params['border_count'],
        learning_rate=params["learning_rate"],
        l2_leaf_reg=params['l2_leaf_reg'],
        bagging_temperature=params['bagging_temperature'],
        rsm=params['rsm'])

    cv_data = cv(Pool(train_set, train_label),
                 model.get_params(),
                 nfold=4,
                 verbose_eval=True)
    # model.fit(train_pool_tp, eval_set=validate_pool_tp)
    # model.fit(X=train_x, y=train_y, eval_set=(val_x, val_y))
    # y_val_hat = model.predict(train_set.values)
    # mean_auc = roc_auc_score(train_label.values, y_val_hat)
    # metrics = model.eval_metrics(validate_pool_tf, ['AUC'])
    # mean_auc = sum(metrics['AUC']) / float(len(metrics['AUC']))
    # cv_data = cv(
    #     Pool(train_set_tf, train_label, cat_features=categorical_features_indices_tf),
    #     model.get_params()
    # )

    logloss = np.max(cv_data['test-Logloss-mean'])
    print(logloss)
    return logloss  # as hyperopt minimises
class CatboostPredictor(PredictionModel):
    def __init__(self, params):
        self.model = CatBoostClassifier(**params)

    def fitModel(self, X_train, y_train):
        self.model.fit(X_train, y_train, verbose=True,
                       cat_features=np.arange(381, 384))
        pool = Pool(X_train, y_train, cat_features=np.arange(381, 384))
        scores = cv(pool, self.model.get_params(), verbose=True)
        return scores
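# A minimal usage sketch of the wrapper above. The parameter dict and the data names
# are illustrative assumptions; the class itself assumes columns 381-383 are categorical.
predictor = CatboostPredictor({'iterations': 200, 'loss_function': 'Logloss', 'verbose': False})
cv_scores = predictor.fitModel(X_train, y_train)  # fits the model, then runs catboost.cv
print(cv_scores.head())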
def hyperopt_objective(params):
    model = CatBoostClassifier(l2_leaf_reg=int(params['l2_leaf_reg']),
                               learning_rate=params['learning_rate'],
                               iterations=500,
                               eval_metric='Accuracy',
                               random_seed=42,
                               logging_level='Silent')
    cv_data = cv(Pool(X, y, cat_features=categorical_features_indices),
                 model.get_params())
    best_accuracy = np.max(cv_data['test-Accuracy-mean'])
    return 1 - best_accuracy  # as hyperopt minimises
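# A minimal sketch of how an objective like the one above is typically driven with
# hyperopt.fmin; the search ranges here are illustrative assumptions, not taken from
# the snippet itself.
import hyperopt
from hyperopt import hp

params_space = {
    'l2_leaf_reg': hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hp.uniform('learning_rate', 1e-3, 5e-1),
}
trials = hyperopt.Trials()
best = hyperopt.fmin(hyperopt_objective,
                     space=params_space,
                     algo=hyperopt.tpe.suggest,
                     max_evals=50,
                     trials=trials)
print(best)  # best hyperparameters found (the loss 1 - accuracy is minimised)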
def test_serialization_final_fallback(ray_start_regular):
    pytest.importorskip("catboost")  # This test will only run when "catboost" is installed.
    from catboost import CatBoostClassifier

    model = CatBoostClassifier(iterations=2,
                               depth=2,
                               learning_rate=1,
                               loss_function="Logloss",
                               logging_level="Verbose")
    reconstructed_model = ray.get(ray.put(model))
    assert set(model.get_params().items()) == set(
        reconstructed_model.get_params().items())
def hyperopt_objective(params):
    model = CatBoostClassifier(l2_leaf_reg=int(params['l2_leaf_reg']),
                               max_depth=int(params['max_depth']),
                               iterations=150,
                               eval_metric='Accuracy',
                               random_seed=164530,
                               logging_level='Silent',
                               od_type='IncToDec',
                               od_wait=20)
    cv_data = cv(Pool(X, y, cat_features=categorical_features_indices),
                 model.get_params())
    best_accuracy = np.max(cv_data['test-Accuracy-mean'])
    return 1 - best_accuracy  # as hyperopt minimises
def objective(space):
    global best_score, trials_count
    # if os.path.isdir('./catboost_info'):
    #     shutil.rmtree('./catboost_info', ignore_errors=True)
    trials_count += 1
    if (trials_count % 5) == 0 and is_quit_pressed():
        raise co.TennisAbortError
    args_dct = dict(**space)
    params = {
        "eval_metric": metric_name,  # 'eval_metric': 'Logloss',
        "random_seed": random_state,
        "logging_level": "Silent",
    }
    params.update(args_dct)
    if how == "cv":
        cv_data = cv(pools.train, params, stratified=True)
        scr_val = np.max(cv_data[f"test-{metric_name}-mean"])
    elif how == "sklearn":
        mdl = CatBoostClassifier(**params)
        mdl.fit(pools.train)
        pred = mdl.predict_proba(pools.eval)[:, 1]
        scr_val = roc_auc_score(pools.eval.y, pred)
    elif how == "native":
        mdl = CatBoost(params)
        mdl.fit(
            pools.train,
            eval_set=None,  # pools.eval if pools.eval else None,
            silent=True,
        )  # eval_set=pools.eval
        pred = mdl.predict(pools.eval, prediction_type="Probability")[:, 1]
        scr_val = roc_auc_score(pools.eval.get_label(), pred)
    else:
        raise Exception("bad how arg {}".format(how))
    # pred = mdl.predict(data.X_test)
    # scr_val = precision_score(data.y_test, pred)
    if scr_val > best_score:
        if how == "cv":
            cco.out("achieved best {} at {}".format(scr_val, params))
        else:
            cco.out("achieved best {} at {} lrate: {} ntrees: {}".format(
                scr_val, mdl.get_params(), mdl.learning_rate_, mdl.tree_count_))
        best_score = scr_val
    return {"loss": 1.0 - scr_val, "status": STATUS_OK}
def hyperopt_objective(params):
    model = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=1000,
        eval_metric='F1',
        random_seed=42,
        verbose=False,
        loss_function='Logloss',
    )
    cv_data = cv(
        Pool(data, data_label, cat_features=categorical_features_indices),
        model.get_params())
    best_f1 = np.max(cv_data['test-F1-mean'])
    return 1 - best_f1
def modelBuilding(X, y, cat_features):
    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=0.8, random_state=1234)
    model = CatBoostClassifier(iterations=2000,
                               learning_rate=0.01,
                               task_type="GPU"
                               # loss_function='CrossEntropy'
                               )
    model.fit(X_train, y_train,
              cat_features=cat_features,
              eval_set=(X_validation, y_validation),
              verbose=True)
    print('Model is fitted: ' + str(model.is_fitted()))
    print('Model params:')
    print(model.get_params())
    return model
def train_all_save_catboost(self, X, y, categorical_features_indices):
    """Train on the whole dataset and save the model so it can be reused for new predictions."""
    model = CatBoostClassifier(loss_function='MultiClass',
                               eval_metric='TotalF1',
                               random_seed=42,
                               leaf_estimation_method='Newton')
    cv_data = cv(Pool(X, y, cat_features=categorical_features_indices),
                 model.get_params())
    print("precise validation TotalF1 score: {}".format(
        np.max(cv_data['test-TotalF1-mean'])))
    model.fit(X, y, cat_features=categorical_features_indices)

    # feature importance
    print(model.get_feature_importance(prettified=True))
    # train = Pool(X, y, cat_features=categorical_features_indices)
    # feature_importances = model.get_feature_importance(train)
    # feature_names = X.columns
    # for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    #     print('{}: {}'.format(name, score))

    model.save_model('catboost_model.dump')
    print("Catboost model has been saved!")
def hyperopt_objective(params):
    model = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        # learning_rate=params['learning_rate'],
        depth=params['depth'],
        iterations=500,
        eval_metric='Accuracy',
        od_type='Iter',
        od_wait=40,
        random_seed=42,
        logging_level='Silent',
        allow_writing_files=False
    )
    cv_data = cv(
        train_pool,
        model.get_params()
    )
    best_accuracy = np.max(cv_data['test-Accuracy-mean'])
    print(params, best_accuracy)
    return 1 - best_accuracy  # as hyperopt minimises
def train_cat(model=False):
    global log
    params = grid_search_cat(True)
    clf = CatBoostClassifier().set_params(**params)
    if model:
        return clf
    params = clf.get_params()
    log += 'cat'
    log += ', learning_rate: %.3f' % params['learning_rate']
    log += ', iterations: %d' % params['iterations']
    log += ', depth: %d' % params['depth']
    log += ', l2_leaf_reg: %d' % params['l2_leaf_reg']
    log += ', border_count: %d' % params['border_count']
    log += ', subsample: %.2f' % params['subsample']
    log += ', one_hot_max_size: %d' % params['one_hot_max_size']
    log += '\n\n'
    return train(clf)
validation_pool = Pool(data=X_validation,
                       label=y_validation,
                       cat_features=cat_features)

#######################
# BETTER/BEST MODEL
#######################
# Note: You can tinker with learning rates
model = CatBoostClassifier(
    iterations=5,
    learning_rate=0.1,
)
model.fit(train_pool, eval_set=validation_pool, verbose=False)

# Print model info
print('Model is fitted: {}'.format(model.is_fitted()))
print('Model params:\n{}'.format(model.get_params()))

# Choose the best iteration
# Note: There is a parameter: use_best_model ( = True or False)
model = CatBoostClassifier(iterations=100, )
model.fit(
    train_pool,
    eval_set=validation_pool,
    verbose=False,
)
print('Tree count: ' + str(model.tree_count_))
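# A minimal sketch of the use_best_model option mentioned in the note above: with an
# eval_set and use_best_model=True the classifier is shrunk to the iteration with the
# best validation score, so tree_count_ can be smaller than the requested iterations.
best_model = CatBoostClassifier(iterations=100, use_best_model=True)
best_model.fit(train_pool, eval_set=validation_pool, verbose=False)
print('Tree count (best model): ' + str(best_model.tree_count_))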
class modelCatBoost(object):
    def __init__(self, name="CBT", random_state=99, *args, **kwargs):
        self.name = name
        self.train_dir = "model_" + str(self.name) + "/"
        self.random_state = random_state
        self.manager_models = ParamsManager(param_file, key_read="Models")
        self.params = self.manager_models.get_params()["CatBoost"]
        self.params.update({
            'train_dir': self.train_dir,
            "random_state": self.random_state
        })
        self.model = CatBoostClassifier(**self.params)

    def dataset(self, X, y, categorical_columns_indices=None, test_size=0.2,
                *args, **kwargs):
        self.categorical_columns_indices = categorical_columns_indices
        self.X = X
        self.columns = list(X)
        self.y, self.cat_replace = self.replace_multiclass(y)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=self.random_state)
        self.train_data = catboost.Pool(
            data=self.X_train.values,
            label=self.y_train.values,
            cat_features=self.categorical_columns_indices)
        self.eval_data = catboost.Pool(
            data=self.X_test.values,
            label=self.y_test.values,
            cat_features=self.categorical_columns_indices)
        self.all_train_data = catboost.Pool(
            data=self.X.values,
            label=self.y.values,
            cat_features=self.categorical_columns_indices)

    def replace_multiclass(self, targets):
        _unic = targets.unique().tolist()
        _remp = np.arange(0, len(_unic)).tolist()
        return targets.replace(_unic, _remp), _unic

    def fit(self, X, y, use_best_model=True, plot=True, save_snapshot=False,
            verbose=0, *args, **kwargs):
        self.dataset(X, y)
        _params = self.model.get_params()
        if verbose:
            _verbose = 0
        else:
            _verbose = _params["verbose"]
        _fit_result = self.model.fit(self.train_data,
                                     verbose=_verbose,
                                     eval_set=self.eval_data,
                                     use_best_model=use_best_model,
                                     plot=plot,
                                     save_snapshot=save_snapshot,
                                     **kwargs)

        # accuracy on the training and held-out pools
        _preds = self.model.predict(self.eval_data)
        preds_test = np.where(_preds > 0.5, 1, 0)
        score_test = accuracy_score(self.y_test, preds_test)
        _preds = self.model.predict(self.train_data)
        preds_train = np.where(_preds > 0.5, 1, 0)
        score_train = accuracy_score(self.y_train, preds_train)
        if not verbose == 0:
            print("Accuracy on the training set ---> {:.2f}%".format(score_train * 100))
            print("Accuracy on the validation set ------> {:.2f}%".format(score_test * 100))
        return _fit_result

    def fit_cv(self, X, y, fold_count=4, shuffle=True, stratified=True,
               plot=True, verbose=100):
        self.dataset(X, y)
        _params = self.model.get_params()
        _params.update({'verbose': verbose})
        _scores = catboost.cv(pool=self.all_train_data,
                              params=_params,
                              fold_count=fold_count,
                              seed=self.random_state,
                              shuffle=shuffle,
                              verbose=verbose,
                              plot=plot)
        if not verbose == 0:
            print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
                np.max(_scores['test-Accuracy-mean']),
                _scores['test-Accuracy-std'][np.argmax(_scores['test-Accuracy-mean'])],
                np.argmax(_scores['test-Accuracy-mean'])))
        return _scores

    def copy(self, *args, **kwargs):
        returned_classifier = CatBoostClassifier()
        returned_classifier.catboost_classifier = self.model.copy()
        returned_classifier.columns = self.columns
        return returned_classifier

    def update_model(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self.model, k, v)

    def save_model(self, direct="./checkpoints", name="catboost_model"):
        if not os.path.isdir(direct):
            try:
                os.mkdir(direct)
                print("Directory created: " + direct)
            except OSError:
                raise NameError("Error creating the directory")
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        filename = direct + "/" + name + "_" + current_time + ".dump"
        self.model.save_model(filename)
        print("Model saved to: " + filename)

    def load_model(self, direct="./checkpoints", name="catboost_model"):
        if not os.path.isdir(direct):
            print("The specified directory does not exist")
        filename = direct + "/" + name + ".dump"
        self.model.load_model(filename)
        print("Model loaded from: " + filename)

    def predict(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict(_X_copy.values, *args, **kwargs)

    def predict_proba(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict_proba(_X_copy.values, *args, **kwargs)

    def add_cat_features(self, index_features):
        self.categorical_columns_indices = index_features
        print(self.categorical_columns_indices)
        self.train_data = catboost.Pool(
            data=self.X_train,
            label=self.y_train,
            cat_features=self.categorical_columns_indices)
        self.eval_data = catboost.Pool(
            data=self.X_test,
            label=self.y_test,
            cat_features=self.categorical_columns_indices)
        self.all_train_data = catboost.Pool(
            data=self.X,
            label=self.y,
            cat_features=self.categorical_columns_indices)

    def index_features(self, features):
        _index = []
        for i in features:
            _index.append(self.X.columns.get_loc(i))
        if _index == []:
            raise NameError("None of the given features match")
        return _index

    def get_important_features(self, display=True):
        _feature_importance_df = self.model.get_feature_importance(prettified=True)
        if display:
            plt.figure(figsize=(12, 6))
            sns.barplot(x="Importances", y="Feature Id",
                        data=_feature_importance_df)
            plt.title('CatBoost features importance:')
        return _feature_importance_df

    def Visualizer_Models(self, directs=None, visu_model=True):
        directorios = []
        if not directs:
            if visu_model:
                directorios.append(self.train_dir)
            else:
                raise NameError("No directory has been selected")
        else:
            if visu_model:
                directorios.append(self.train_dir)
            for i in directs:
                directorios.append(i)
        print(directorios)
        widget = MetricVisualizer(directorios)
        widget.start()

    def hyperopt_objective(self, params):
        _model = CatBoostClassifier(
            l2_leaf_reg=int(params['l2_leaf_reg']),
            learning_rate=params['learning_rate'],
            bagging_temperature=params["bagging_temperature"],
            iterations=500,
            eval_metric='AUC',
            random_seed=99,
            verbose=False,
            loss_function='Logloss')
        _cv_data = catboost.cv(self.all_train_data, _model.get_params())
        best_auc = np.max(_cv_data['test-AUC-mean'])
        return 1 - best_auc

    def FineTune_hyperopt(self, X, y, mute=False):
        self.dataset(X, y)
        params_space = {
            'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
            'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
            'bagging_temperature': hyperopt.hp.uniform("bagging_temperature", 0, 0.3)
        }
        trials = hyperopt.Trials()
        best = hyperopt.fmin(self.hyperopt_objective,
                             space=params_space,
                             algo=hyperopt.tpe.suggest,
                             max_evals=2,
                             trials=trials,
                             rstate=RandomState(self.random_state))
        if not mute:
            print("\nBest parameters:")
            print(best)
            print("\n")
        _parameters = self.params
        _parameters.update(best)
        _model = CatBoostClassifier(**_parameters)
        _cv_data = catboost.cv(self.all_train_data, _model.get_params())
        if not mute:
            print('\nPrecise validation accuracy score: {}'.format(
                np.max(_cv_data['test-Accuracy-mean'])))
        return best

    def FineTune_sklearn(self, X, y, mute=False, n_splits=10, n_iter=2):
        """
        https://www.kaggle.com/ksaaskil/pets-definitive-catboost-tuning
        """
        self.dataset(X, y)

        def build_search(modelo, param_distributions, cv=5, n_iter=10,
                         verbose=1, random_state=99):
            """
            Builder function for RandomizedSearch.
            """
            QWS = make_scorer(cohen_kappa_score, weights='quadratic')
            return RandomizedSearchCV(modelo,
                                      param_distributions=param_distributions,
                                      cv=cv,
                                      return_train_score=True,
                                      refit='cohen_kappa_quadratic',
                                      n_iter=n_iter,
                                      n_jobs=None,
                                      scoring={
                                          'accuracy': make_scorer(accuracy_score),
                                          'cohen_kappa_quadratic': QWS
                                      },
                                      verbose=verbose,
                                      random_state=random_state)

        def pretty_cv_results(cv_results,
                              sort_by='rank_test_cohen_kappa_quadratic',
                              sort_ascending=True,
                              n_rows=30):
            """
            Return pretty Pandas dataframe from the `cv_results_` attribute of a finished
            parameter search, ranking by test performance and only keeping the columns
            of interest.
            """
            df = pd.DataFrame(cv_results)
            cols_of_interest = [
                key for key in df.keys()
                if key.startswith('param_') or key.startswith("mean_train")
                or key.startswith("std_train") or key.startswith("mean_test")
                or key.startswith("std_test") or key.startswith('mean_fit_time')
                or key.startswith('rank')
            ]
            return df.loc[:, cols_of_interest].sort_values(
                by=sort_by, ascending=sort_ascending).head(n_rows)

        def run_search(X_train, y_train, search, mute=False):
            search.fit(X_train, y_train)
            print('Best score is:', search.best_score_)
            return pretty_cv_results(search.cv_results_)

        param_distributions = {
            'iterations': [100, 200],
            'learning_rate': scipy.stats.uniform(0.01, 0.3),
            'max_depth': scipy.stats.randint(3, 10),
            'one_hot_max_size': [30],
            'l2_leaf_reg': scipy.stats.reciprocal(a=1e-2, b=1e1),
        }
        if mute:
            _verbose = 0
        else:
            _verbose = 1
        self.params.update({'use_best_model': False})
        _model = CatBoostClassifier(**self.params)
        catboost_search = build_search(_model,
                                       param_distributions=param_distributions,
                                       n_iter=n_iter,
                                       verbose=_verbose,
                                       cv=RepeatedStratifiedKFold(
                                           n_splits=n_splits,
                                           n_repeats=1,
                                           random_state=self.random_state))
        catboost_cv_results = run_search(self.X, self.y,
                                         search=catboost_search, mute=mute)
        best_estimator = catboost_search.best_estimator_
        if not mute:
            print(best_estimator.get_params())
        return catboost_cv_results, best_estimator

    def __getattr__(self, attr):
        """
        Pass all other method calls to self.model.
        """
        return getattr(self.model, attr)
# CatBoost model definition
catboost_model = CatBoostClassifier(iterations=200,
                                    custom_loss=['Accuracy'],
                                    loss_function='Logloss')

# Fit CatBoost model
catboost_model.fit(train_pool)  # , plot=True)

# CatBoost accuracy
acc_catboost = round(catboost_model.score(x_train, y_train) * 100, 2)

# How long will this take?
start_time = time.time()

# Set params for cross-validation to the same as the initial model
cv_params = catboost_model.get_params()

# Run the cross-validation for 10 folds (same as the other models)
cv_data = cv(train_pool, cv_params, fold_count=10)  # , plot=True)

# How long did it take?
catboost_time = (time.time() - start_time)

# CatBoost CV results are saved into a dataframe (cv_data); let's extract the maximum accuracy score
acc_cv_catboost = round(np.max(cv_data['test-Accuracy-mean']) * 100, 2)

"""
MLP classification
"""
x_train = torch.tensor(x_train.values).float()
x_test = torch.tensor(x_test.values).float()
x_valid = torch.tensor(x_valid.values).float()
          eval_set=(X_validation, y_validation),
          # logging_level='Verbose',  # you can uncomment this for text output
          plot=True)

# As you can see, it is possible to watch our model learn through verbose output or with
# nice plots (personally I would definitely go with the second option - just check out
# those plots: you can, for example, zoom in on areas of interest!)
#
# With this we can see that the best accuracy value of **0.8341** (on the validation set)
# was achieved on the **503rd** boosting step.

# ### 2.2 Model Cross-Validation
#
# It is good to validate your model, but to cross-validate it is even better. And also
# with plots! So with no more words:

# In[13]:

cv_data = cv(Pool(X, y, cat_features=categorical_features_indices),
             model.get_params(),
             plot=True)

# Now we have values of our loss functions at each boosting step averaged by 10 folds,
# which should provide us with a more accurate estimation of our model performance:

# In[14]:

print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-Accuracy-mean']),
    cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
    np.argmax(cv_data['test-Accuracy-mean'])))

# In[15]:

print('Precise validation accuracy score: {}'.format(
    np.max(cv_data['test-Accuracy-mean'])))
from src.data.processed_data import aX_i_train, ay_i_train, bX_i_train, by_i_train, cX_i_train, cy_i_train
from src.data.make_dataset import get_categorical_indices
import numpy as np
from catboost import CatBoostClassifier, cv, Pool
from sklearn.model_selection import StratifiedKFold

a_indices, b_indices, c_indices = (get_categorical_indices(aX_i_train),
                                   get_categorical_indices(bX_i_train),
                                   get_categorical_indices(cX_i_train))

model_a = CatBoostClassifier(nan_mode='Min')
model_a.fit(aX_i_train, ay_i_train, cat_features=a_indices)
model_b = CatBoostClassifier(nan_mode='Min')
model_b.fit(bX_i_train, by_i_train, cat_features=b_indices)
model_c = CatBoostClassifier(nan_mode='Min')
model_c.fit(cX_i_train, cy_i_train, cat_features=c_indices)

cv_data_a = cv(params=model_a.get_params(),
               pool=Pool(aX_i_train, ay_i_train, cat_features=a_indices))
a_score = cv_data_a['Logloss_test_avg'][-1]
cv_data_b = cv(params=model_b.get_params(),
               pool=Pool(bX_i_train, by_i_train, cat_features=b_indices))
b_score = cv_data_b['Logloss_test_avg'][-1]
cv_data_c = cv(params=model_c.get_params(),
               pool=Pool(cX_i_train, cy_i_train, cat_features=c_indices))
c_score = cv_data_c['Logloss_test_avg'][-1]
# load model
model = CatBoostClassifier()
model.load_model('models/catboost_model_4.dump')

# Feature Importance: Know which feature contributed the most
feature_importances = model.get_feature_importance(train_pool)
feature_names = pd.DataFrame(X_train).columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

print('\n\n\n')
print(model.get_best_score())
print(model.get_params())

# Validation Prediction
probabilities = model.predict(eval_pool)
# print(probabilities)
pd.DataFrame(probabilities).to_csv('validation-scores/val-scores-3.csv')

# TEST VALUES
# preped_test_values = np.array(pd.read_csv('preped/preped_test_&_featured.csv'))
# eval_dataset = Pool(test_values)
# test_prediction = model.predict(preped_test_values)
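# A minimal sketch (iterations and loss_function are illustrative assumptions) of the
# save/load round trip the snippet above relies on: a model written with save_model can
# be restored into a fresh CatBoostClassifier, and its parameters inspected afterwards.
trained = CatBoostClassifier(iterations=100, loss_function='Logloss')
trained.fit(train_pool)
trained.save_model('models/catboost_model_4.dump')

restored = CatBoostClassifier()
restored.load_model('models/catboost_model_4.dump')
print(restored.get_params())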
    iterations=2000,
    learning_rate=0.01,
)

# fitting model on test-train split data
clf_cat.fit(xtrain, ytrain,
            cat_features=cat_feature,
            eval_set=(xvalid, yvalid),
            early_stopping_rounds=100,
            verbose=False)

print('CatBoost model is fitted: ' + str(clf_cat.is_fitted()))
print('CatBoost model parameters:')
print(clf_cat.get_params())

predictions = clf_cat.predict(xvalid)
print("accuracy_score", accuracy_score(yvalid, predictions))

predictions_probas = clf_cat.predict_proba(xvalid)
score = gini_normalized(yvalid, predictions_probas)
print(score)
print('Confusion matrix\n', confusion_matrix(yvalid, predictions))

# completely training the whole train dataset after analysing the gini index value
clf_cat.fit(X_train1, Y_train1, cat_features=cat_feature, verbose=False)
result = clf_cat.predict_proba(X_out)[:, 1]
result
class CatBoostModel(object):
    def __init__(self, df, doCreateFeatures=False, Ycol=None, Xcolumns=None,
                 otherColumns=None, requiredColumns=[]):
        # df = pd.DataFrame.from_records(events)
        self.df = df
        self.model = None
        self.Xcolumns = Xcolumns
        self.Ycol = Ycol
        self.otherColumns = otherColumns
        # self.df = df
        self.requiredColumns = requiredColumns
        self.df = self.fixFeatureTypes(df)
        self.df = self.dropColumnsWithNull(df, self.requiredColumns)
        if doCreateFeatures == True:
            self.df = self.createFeaturesForNewData(df, createFeatures)
        self.categorical_features_indices = None
        target = Ycol
        self.y = self.df[target]
        self.X = self.df[Xcolumns.keys()]
        # print(self.y.dtypes)
        self.categorical_features_indices = np.where(
            self.X.dtypes != np.float64)[0]
        self.X_train, self.X_validation, self.y_train, self.y_validation = train_test_split(
            self.X, self.y, train_size=0.75, random_state=42)

    def save(self, name):
        self.model.save_model(f'./data/{name}')

    def load(self, name):
        self.model = CatBoostClassifier().load_model(f'./data/{name}')

    def fixFeatureTypes(self, df):
        df = self.convertModelFeatures(df, self.Xcolumns)
        df = self.convertModelFeatures(df, self.Ycol)
        df = self.convertModelFeatures(df, self.otherColumns)
        return df

    def predict(self, items):
        results = []
        for item in items:
            # logging.warning(item)
            # print(item)
            df = pd.DataFrame.from_records([item])
            df = self.fixFeatureTypes(df)
            df = self.createFeaturesForNewData(df, createFeatures)
            self.df = self.df.append(df, ignore_index=True)
            df = df.drop(self.Ycol, axis=1)
            preddf = df[self.Xcolumns.keys()]
            predictions = self.model.predict(preddf)
            predictions_probs = self.model.predict_proba(preddf)
            preddf.loc[preddf.index[0], 'prediction'] = predictions[0]
            preddf.loc[preddf.index[0], 'proba'] = predictions_probs[0][1]
            result = preddf.to_dict('records')[0]
            for k, v in result.items():
                item[k] = v
            results.append(item)
        return results

    def train(self):
        self.buildModel()
        self.fitModel()
        self.logPrecision()

    def dropColumnsWithNull(self, df, columns):
        for col in columns:
            df = df[pd.notnull(df[col])]
        return df

    def buildModel(self):
        logging.warning('Building Model')
        self.model = CatBoostClassifier(custom_loss=['Accuracy'],
                                        random_seed=42,
                                        logging_level='Silent')

    def fitModel(self):
        logging.warning('Fitting Model')
        self.model.fit(
            self.X_train,
            self.y_train,
            cat_features=self.categorical_features_indices,
            eval_set=(self.X_validation, self.y_validation),
            logging_level='Verbose'  # you can uncomment this for text output
        )

    def logPrecision(self):
        cv_params = self.model.get_params()
        cv_params.update({'loss_function': 'Logloss'})
        cv_data = cv(
            Pool(self.X, self.y, cat_features=self.categorical_features_indices),
            cv_params)
        logging.warning(
            'Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
                np.max(cv_data['test-Accuracy-mean']),
                cv_data['test-Accuracy-std'][np.argmax(
                    cv_data['test-Accuracy-mean'])],
                np.argmax(cv_data['test-Accuracy-mean'])))
        logging.warning('Precise validation accuracy score: {}'.format(
            np.max(cv_data['test-Accuracy-mean'])))

    def convertModelFeatures(self, df, features):
        for k, v in features.items():
            try:
                if v == "object":
                    df[k].fillna('Nothing', inplace=True)
                    df[k] = df[k].replace('', 'Nothing')
                    df[k] = df[k].astype(object)
                elif v == "float":
                    df[k].fillna(0, inplace=True)
                    df[k] = df[k].apply(pd.to_numeric, errors='ignore', downcast='float')
                elif v == "datetime":
                    df[k] = df[k].apply(pd.to_datetime, errors='ignore')
                elif v == "bool":
                    df[k] = df[k].astype(float)
            except KeyError:
                pass
        return df

    def createFeaturesForNewData(self, df, f):
        # number of predictions
        nb = len(df)
        fulldf = self.df
        logging.warning(f"creating features for {nb} predictions")
        fulldf = fulldf.append(df, ignore_index=True)
        lastPreds = fulldf.tail(nb)  # we get the correct indexes this way
        for index, row in lastPreds.iterrows():
            fulldf = f(fulldf, index, row)
        result = fulldf.tail(nb)
        return result
class CatBoost(Model):
    def fit(self, X_train, y_train, X_val=None, y_val=None):
        # Fit the model
        self.model = CatBoostClassifier(verbose=False)
        self.model.fit(X_train, y_train)
        return

    def tune_best(self, X_train, y_train, X_val, y_val):
        params = {
            'iterations': 500,
            'learning_rate': 0.001,
            'eval_metric': 'Logloss',
            'random_seed': 42,
            'logging_level': 'Silent',
            'use_best_model': False
        }
        train_pool = Pool(X_train, y_train)
        validate_pool = Pool(X_val, y_val)

        self.model = CatBoostClassifier(**params)
        self.model.fit(train_pool, eval_set=validate_pool)

        best_model_params = params.copy()
        best_model_params.update({'use_best_model': True})
        self.model = CatBoostClassifier(**best_model_params)
        self.model.fit(train_pool, eval_set=validate_pool, logging_level='Verbose')
        return

    def tune(self, X_train, y_train, X_val, y_val):
        self.model = CatBoostClassifier(
            custom_loss=['Logloss'],
            random_seed=42,
            logging_level='Silent'
        )
        self.model.fit(
            X_train, y_train,
            eval_set=(X_val, y_val),
            logging_level='Verbose',
            plot=True
        )
        cv_params = self.model.get_params()
        cv_params.update({'loss_function': 'Logloss'})
        cv_data = cv(
            Pool(X_train, y_train),
            cv_params,
            plot=True
        )
        print('Best validation Logloss: {:.2f}±{:.2f} on step {}'.format(
            np.min(cv_data['test-Logloss-mean']),
            cv_data['test-Logloss-std'][np.argmin(cv_data['test-Logloss-mean'])],
            np.argmin(cv_data['test-Logloss-mean'])
        ))
        print('Precise validation Logloss: {}'.format(np.min(cv_data['test-Logloss-mean'])))

    def transform(self, X_test):
        predictions = self.model.predict_proba(X_test)
        return predictions

    def predict(self, X_test):
        predictions = self.model.predict_proba(X_test)[:, 1]
        return predictions

    def evaluate(self, X_test, y_test):
        y_pred = self.transform(X_test)
        score = log_loss(y_test, y_pred)
        print("LOG LOSS : ", score)
        return
[
    print('Variable: {:20} Importance: {}'.format(*pair))
    for pair in feature_importances_cbc
]

# HYPERPARAMETER OPTIMISATION - CATBOOST
# Grid Search With Cross Validation
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

classifier_cbc_gscv = CatBoostClassifier()
classifier_cbc_gscv.get_params().keys()

grid_param_cbc = {
    'depth': [4, 7, 10],
    'iterations': [100, 200, 300, 400, 500],
    'l2_leaf_reg': [1, 4, 9],
    'learning_rate': [0.03, 0.1, 0.15]
}

classifier_cbc_gscv_gd_sr = GridSearchCV(estimator=classifier_cbc_gscv,
                                         param_grid=grid_param_cbc,
                                         scoring='accuracy',
                                         cv=10)
classifier_cbc_gscv_gd_sr.fit(X_train, Y_train)
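# A minimal sketch of inspecting the finished grid search above, using standard
# scikit-learn GridSearchCV attributes; the best combination can then be passed
# straight back into a fresh CatBoostClassifier.
print(classifier_cbc_gscv_gd_sr.best_params_)  # dict with the winning value for each grid key
print(classifier_cbc_gscv_gd_sr.best_score_)   # mean cross-validated accuracy of best_params_
best_cbc = CatBoostClassifier(**classifier_cbc_gscv_gd_sr.best_params_)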
clf = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.1,
    # loss_function='CrossEntropy'
)
clf.fit(X_train, y_train,
        cat_features=categorical_columns,
        eval_set=(X_val, y_val),
        verbose=False)

print('CatBoost model is fitted: ' + str(clf.is_fitted()))
print('CatBoost model parameters:')
print(clf.get_params())

predictions = clf.predict(X_val)
print("accuracy_score", accuracy_score(y_val, predictions))

predictions_probas = clf.predict_proba(X_val)
print("roc-auc score for the class 1, from target 'HasDetections' ",
      roc_auc_score(y_val, predictions_probas[:, 1]))

val_cnf_matrix = confusion_matrix(y_val, predictions)
sns.heatmap(val_cnf_matrix, annot=True, fmt='.2f', cmap="BrBG").set_title("Validation")
plt.show()

# completely training the whole train dataset
def cat_boost():
    print("Start of Cat boost")
    X, y = data_pre_processing()
    X_train, X_test, y_train, y_test = data_processor()
    eval_set = [(X_test, y_test)]

    cb_model = CatBoostClassifier(iterations=1375,
                                  learning_rate=0.109499,
                                  depth=6,
                                  thread_count=10,
                                  eval_metric='AUC',
                                  bagging_temperature=0.9,
                                  od_type='Iter',
                                  metric_period=75,
                                  loss_function='Logloss',
                                  od_wait=100)
    cb_model.fit(X_train, y_train,
                 eval_set=eval_set,
                 cat_features=categorical_features_pos,
                 verbose=True)

    print("Model Evaluation Stage")
    print(cb_model.get_params())

    print("\nevaluate predictions")
    catpred = cb_model.predict(X_test)
    # evaluate predictions
    accuracy = accuracy_score(y_test, catpred)
    f_score = f1_score(y_test, catpred)
    print("Accuracy : %.2f%%" % (accuracy * 100.0))
    print("F1 Score : %.2f%%" % (f_score * 100.0))
    print('Confusion Matrix :')
    print(confusion_matrix(y_test, catpred))
    print('Report : ')
    print(classification_report(y_test, catpred))
    print(mean_squared_error(y_test, catpred))
    # catpred = cb_model.predict(X_test)
    # print("Cat BoostRegressor Score: ", catpred.score(X_test, y_test))

    # Make Prediction and Output for Scoring
    print('Final Result: Make Prediction and Output Score')
    # test_values = pd.read_csv('Data/df_test_enc_2.csv')
    df_test_values_trf = pd.read_csv('../Data/df_test_enc_2.csv')
    # df_test_values_trf = preprocessing.normalize(df_test_values_trf, axis=0)
    # df_trf = df_trf.astype(int)
    # df_trf = df_trf.round()
    # df_enc.dtypes
    # df_test_values_trf = pd.DataFrame(df_test_values_trf, columns=df_trf.columns)
    # df_test_values_trf = clean_dataset(df_test_values_trf)
    # col_names = df_test_values_trf.columns
    # features = df_test_values_trf[col_names]
    # imp = Imputer(strategy="most_frequent").fit(df_test_values_trf)
    # features = imp.transform(df_test_values_trf)
    # scaler = preprocessing.StandardScaler().fit(features)
    # features = scaler.transform(features)
    # df_test_values_trf[col_names] = features
    # cate = df_test_values_trf.columns
    # print(cate)
    # data_norm = preprocessing.normalize(df_test_values_trf, axis=1)
    # df_test_values_trf = np.concatenate([data_norm])
    # df = pd.DataFrame(df_test_values_trf, columns=cate)

    test_values = df_test_values_trf.drop(['Unnamed: 0'], axis=1)
    # test_values = test_values.astype(int)
    test_values = np.array(test_values)

    # Make predictions using the testing set
    cb_pred = cb_model.predict(test_values)
    L_prediccion = pd.DataFrame(data=cb_pred, columns=['accepted'])
    print(L_prediccion.shape)
    L_prediccion.index.names = ['row_id']
    L_prediccion['accepted'] = L_prediccion['accepted'].astype(np.int64)
    print(L_prediccion.shape)
    print(L_prediccion.head())
    L_prediccion.to_csv('../Data/submission_1.csv')
    print("End of Cat boost")
def cat_boost():
    print("Start of Cat boost")
    X, y = data_pre_processing()
    X_train, X_test, y_train, y_test = data_processor()
    eval_set = [(X_test, y_test)]

    cb_model = CatBoostClassifier(iterations=1375,
                                  learning_rate=0.1094999,
                                  depth=6,
                                  thread_count=10,
                                  eval_metric='AUC',
                                  # eval_metric='Accuracy',
                                  bagging_temperature=0.9,
                                  od_type='IncToDec',
                                  # l2_leaf_reg=6,
                                  metric_period=75,
                                  random_seed=42,
                                  # logging_level='Silent',
                                  random_strength=1.0,
                                  nan_mode="Min",
                                  scale_pos_weight=1.0,
                                  od_wait=100)
    cb_model.fit(X_train, y_train,
                 eval_set=eval_set,
                 cat_features=categorical_features_pos,
                 verbose=True)

    print("Model Evaluation Stage")
    print(cb_model.get_params())

    print("\nevaluate predictions")
    catpred = cb_model.predict(X_test)
    # evaluate predictions
    accuracy = accuracy_score(y_test, catpred)
    f_score = f1_score(y_test, catpred)
    print("Accuracy : %.2f%%" % (accuracy * 100.0))
    print("F1 Score : %.2f%%" % (f_score * 100.0))
    print('Confusion Matrix :')
    print(confusion_matrix(y_test, catpred))
    print('Report : ')
    print(classification_report(y_test, catpred))
    print(mean_squared_error(y_test, catpred))

    # keep probabilities for the positive outcome only
    probs = cb_model.predict_proba(X_test)[:, 1]
    # predict class values
    yhat = cb_model.predict(X_test)
    # calculate precision-recall curve
    precision, recall, thresholds = precision_recall_curve(y_test, probs)
    # calculate F1 score
    f1 = f1_score(y_test, yhat)
    # calculate precision-recall AUC
    auc_c = auc(recall, precision)
    # calculate average precision score
    ap = average_precision_score(y_test, probs)
    print('f1=%.3f auc=%.3f ap=%.3f' % (f1, auc_c, ap))

    plt.figure(figsize=(12, 6))
    # plot no skill
    plt.plot([0, 1], [0.5, 0.5], linestyle='--', label="No Skill")
    # plot the precision-recall curve for the model
    plt.plot(recall, precision, marker='.', label="precision-recall curve")
    # show the plot
    # Line Plot of Precision-Recall Curve
    plt.title("Line Plot of Precision-Recall Curve", {"fontsize": 16})
    plt.ylabel('Precision (y-axis)')
    plt.xlabel('Recall (x-axis)')
    plt.show()

    # Make Prediction and Output for Scoring
    print('Final Result: Make Prediction and Output Score')
    # test_values = pd.read_csv('Data/df_test_enc_2.csv')
    df_test_values_trf = pd.read_csv('../Data/df_test_enc_3.csv')
    # df_test_values_trf = preprocessing.normalize(df_test_values_trf, axis=0)
    # df_trf = df_trf.astype(int)
    # df_trf = df_trf.round()
    # df_enc.dtypes
    # df_test_values_trf = pd.DataFrame(df_test_values_trf, columns=df_trf.columns)
    # df_test_values_trf = clean_dataset(df_test_values_trf)
    # col_names = df_test_values_trf.columns
    # features = df_test_values_trf[col_names]
    # imp = Imputer(strategy="most_frequent").fit(df_test_values_trf)
    # features = imp.transform(df_test_values_trf)
    # scaler = preprocessing.StandardScaler().fit(features)
    # features = scaler.transform(features)
    # df_test_values_trf[col_names] = features
    # cate = df_test_values_trf.columns
    # print(cate)
    # data_norm = preprocessing.normalize(df_test_values_trf, axis=1)
    # df_test_values_trf = np.concatenate([data_norm])
    # df = pd.DataFrame(df_test_values_trf, columns=cate)

    # this function loops through columns in a data set and defines a predefined scaler to each
    # numeric_columns = ['loan_amount', 'msa_md', 'state_code', 'lender', 'county_code', 'applicant_income',
    #                    'population', 'minority_population_pct', 'applicant_ethnicity',
    #                    'ffiecmedian_family_income', 'tract_to_msa_md_income_pct',
    #                    'number_of_owner-occupied_units', 'number_of_1_to_4_family_units']
    # scaler = MinMaxScaler()
    # df_test_values_trf = scale_numeric(df_test_values_trf, numeric_columns, scaler)
    # df = round(df)
    # convert all DataFrame columns to the int64 dtype
    # df_test_values_trf = round(df_test_values_trf).astype(int)

    test_values = df_test_values_trf.drop(['Unnamed: 0'], axis=1)
    # test_values = test_values.astype(int)
    test_values = np.array(test_values)

    # Make predictions using the testing set
    cb_pred = cb_model.predict(test_values)
    L_prediccion = pd.DataFrame(data=cb_pred, columns=['accepted'])
    print(L_prediccion.shape)
    L_prediccion.index.names = ['row_id']
    L_prediccion['accepted'] = L_prediccion['accepted'].astype(np.int64)
    print(L_prediccion.shape)
    print(L_prediccion.head())
    L_prediccion.to_csv('../Data/submission_1.csv')
    print("End of Cat boost")
from sklearn.metrics import recall_score, precision_score

print(recall_score(y_test, y_pred, average='macro'))
print(precision_score(y_test, y_pred, average='micro'))
print(accuracy_score(y_test, y_pred))

# cross validation
cv_params = clf.get_params()
cv_params.update({'loss_function': 'Logloss'})
cv_data = cv(
    Pool(X, y, cat_features=cat_featuresind),
    cv_params,
    plot=True
)
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-Accuracy-mean']),
    cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
    np.argmax(cv_data['test-Accuracy-mean'])
))
def model_catboost(self, X, y, X_train, y_train, X_test, y_test,
                   categorical_features_indices, target, file):
    print("Processing CATBOOST....")

    # Added this: start
    train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
    validate_pool = Pool(X_test, y_test, cat_features=categorical_features_indices)
    # end

    # model = CatBoostClassifier(loss_function='MultiClass', use_best_model=True, random_seed=42)  # , class_weights=[1,2,3,4,5,6,7,8,9,10,11])
    model = CatBoostClassifier(loss_function='MultiClass',
                               eval_metric='TotalF1',
                               use_best_model=True,
                               random_seed=42,
                               leaf_estimation_method='Newton')
    model.fit(train_pool,
              eval_set=validate_pool,
              use_best_model=True,
              verbose=50,
              plot=False,
              early_stopping_rounds=100)

    # cross-validation
    cv_params = model.get_params()
    cv_data = cv(Pool(X, y, cat_features=categorical_features_indices),
                 cv_params,
                 fold_count=10,
                 plot=False)
    print('Precise validation TotalF1 score: {}'.format(
        np.max(cv_data['test-TotalF1-mean'])))
    # end

    print("FIRST prediction")
    print()
    print(model)

    # make predictions
    expected_y = y_test
    predicted_y = model.predict(X_test)

    # summarize the fit of the model
    print()
    print(metrics.classification_report(expected_y, predicted_y))
    print()
    print(metrics.confusion_matrix(expected_y, predicted_y))

    print("SECOND prediction")
    print(model.best_iteration_, model.best_score_)
    print(model.evals_result_['validation']['MultiClass'][-10:])

    # prediction
    pred = model.predict(X_test)
    print("PREDICT")
    print(pred)

    print("print dataframe predictions:")
    cm = pd.DataFrame()
    # cm['DAMAGE'] = y_test
    cm[target] = y_test
    cm['Predict'] = model.predict(X_test)
    print(cm)

    print("SCORES")
    print(model.score(X_test, y_test))
    cm.to_csv(file)  # , index=False)
    # cm.to_csv("catboost_prediction.csv")  # , index=False)

    # confusion matrix
    print("confusion matrix:")
    # conf_mat = get_confusion_matrix(model, Pool(X_train, y_train, cat_features=categorical_features_indices))
    conf_mat = get_confusion_matrix(
        model, Pool(X_test, y_test, cat_features=categorical_features_indices))
    print(conf_mat)

    # feature selection
    print(model.get_feature_importance(prettified=True))
    # feature_importances = model.get_feature_importance(train_pool)
    # feature_names = X_train.columns
    # for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    #     print('{}: {}'.format(name, score))
    # return model, cv_data
model = CatBoostClassifier(eval_metric='Accuracy', use_best_model=True, random_seed=42)

# In[11]:

# now just fit the model to the data
model.fit(xtrain, ytrain, cat_features=cate_features_index, eval_set=(xtest, ytest))

# In[12]:

# since the dataset is not very big, we use cross-validation (cv) to see how good
# the model is; here a 10-fold cv
cv_data = cv(model.get_params(), Pool(x, y, cat_features=cate_features_index), fold_count=10)

# In[13]:

# show the accuracy of the model
print('the best cv accuracy is :{}'.format(np.max(cv_data["b'Accuracy'_test_avg"])))

# In[14]:

# show the model's test accuracy; note that this is not the cv accuracy,
# so it is recommended to use the cv accuracy to evaluate your model!
print('the test accuracy is :{:.6f}'.format(accuracy_score(ytest, model.predict(xtest))))