import os
import time
import datetime

import joblib
import numpy as np
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns

import hyperopt
from numpy.random import RandomState

import catboost
from catboost import CatBoostClassifier, MetricVisualizer
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, cohen_kappa_score, make_scorer
from sklearn.model_selection import (RandomizedSearchCV,
                                     RepeatedStratifiedKFold, StratifiedKFold,
                                     cross_val_score, train_test_split)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Project-local helpers (assumed available in this repository): ParamsManager
# reads hyperparameter blocks from `param_file`; Training provides FineTune,
# KFold_CrossValidation and evaluate for the wrapper classes below.


class modelCatBoost(object):

    def __init__(self, name="CBT", random_state=99, *args, **kwargs):
        self.name = name
        self.train_dir = "model_" + str(self.name) + "/"
        self.random_state = random_state
        self.manager_models = ParamsManager(param_file, key_read="Models")
        self.params = self.manager_models.get_params()["CatBoost"]
        self.params.update({
            'train_dir': self.train_dir,
            'random_state': self.random_state
        })
        self.model = CatBoostClassifier(**self.params)

    def dataset(self, X, y, categorical_columns_indices=None, test_size=0.2,
                *args, **kwargs):
        self.categorical_columns_indices = categorical_columns_indices
        self.X = X
        self.columns = list(X)
        self.y, self.cat_replace = self.replace_multiclass(y)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size,
            random_state=self.random_state)
        self.train_data = catboost.Pool(
            data=self.X_train.values,
            label=self.y_train.values,
            cat_features=self.categorical_columns_indices)
        self.eval_data = catboost.Pool(
            data=self.X_test.values,
            label=self.y_test.values,
            cat_features=self.categorical_columns_indices)
        self.all_train_data = catboost.Pool(
            data=self.X.values,
            label=self.y.values,
            cat_features=self.categorical_columns_indices)

    def replace_multiclass(self, targets):
        # Remap arbitrary class labels to consecutive integers 0..n-1 and
        # return the mapping so predictions can be translated back.
        _unic = targets.unique().tolist()
        _remp = np.arange(0, len(_unic)).tolist()
        return targets.replace(_unic, _remp), _unic

    def fit(self, X, y, use_best_model=True, plot=True, save_snapshot=False,
            verbose=0, *args, **kwargs):
        self.dataset(X, y)
        _params = self.model.get_params()
        _verbose = 0 if verbose else _params["verbose"]
        return self.model.fit(self.train_data,
                              verbose=_verbose,
                              eval_set=self.eval_data,
                              use_best_model=use_best_model,
                              plot=plot,
                              save_snapshot=save_snapshot,
                              **kwargs)

    def fit_cv(self, X, y, fold_count=4, shuffle=True, stratified=True,
               plot=True, verbose=100):
        self.dataset(X, y)
        _params = self.model.get_params()
        _params.update({'verbose': verbose})
        _scores = catboost.cv(pool=self.all_train_data,
                              params=_params,
                              fold_count=fold_count,
                              seed=self.random_state,
                              shuffle=shuffle,
                              verbose=verbose,
                              plot=plot)
        if verbose != 0:
            print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
                np.max(_scores['test-Accuracy-mean']),
                _scores['test-Accuracy-std'][np.argmax(
                    _scores['test-Accuracy-mean'])],
                np.argmax(_scores['test-Accuracy-mean'])))
        return _scores

    def copy(self, *args, **kwargs):
        returned_classifier = CatBoostClassifier()
        returned_classifier.catboost_classifier = self.model.copy()
        returned_classifier.columns = self.columns
        return returned_classifier

    def update_model(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self.model, k, v)

    def save_model(self, direct="./checkpoints", name="catboost_model"):
        if not os.path.isdir(direct):
            try:
                os.mkdir(direct)
                print("Directory created: " + direct)
            except OSError:
                raise NameError("Error creating the directory")
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        filename = direct + "/" + name + "_" + current_time + ".dump"
        self.model.save_model(filename)
        print("Model saved at: " + filename)

    def load_model(self, direct="./checkpoints", name="catboost_model"):
        if not os.path.isdir(direct):
            print("The specified directory does not exist")
        filename = direct + "/" + name + ".dump"
        self.model.load_model(filename)
        print("Model loaded from: " + filename)

    def predict(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict(_X_copy.values, *args, **kwargs)

    def predict_proba(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict_proba(_X_copy.values, *args, **kwargs)

    def add_cat_features(self, index_features):
        # Rebuild the Pools, this time flagging the given column indices as
        # categorical features.
        self.categorical_columns_indices = index_features
        print(self.categorical_columns_indices)
        self.train_data = catboost.Pool(
            data=self.X_train,
            label=self.y_train,
            cat_features=self.categorical_columns_indices)
        self.eval_data = catboost.Pool(
            data=self.X_test,
            label=self.y_test,
            cat_features=self.categorical_columns_indices)
        self.all_train_data = catboost.Pool(
            data=self.X,
            label=self.y,
            cat_features=self.categorical_columns_indices)

    def index_features(self, features):
        _index = [self.X.columns.get_loc(i) for i in features]
        if not _index:
            raise NameError("None of the given features match")
        return _index

    def get_important_features(self, display=True):
        _feature_importance_df = self.model.get_feature_importance(
            prettified=True)
        if display:
            plt.figure(figsize=(12, 6))
            sns.barplot(x="Importances",
                        y="Feature Id",
                        data=_feature_importance_df)
            plt.title('CatBoost features importance:')
        return _feature_importance_df

    def Visualizer_Models(self, directs=None, visu_model=True):
        directorios = []
        if not directs:
            if visu_model:
                directorios.append(self.train_dir)
            else:
                raise NameError("No directory selected")
        else:
            if visu_model:
                directorios.append(self.train_dir)
            for i in directs:
                directorios.append(i)
        print(directorios)
        widget = MetricVisualizer(directorios)
        widget.start()

    def hyperopt_objective(self, params):
        _model = CatBoostClassifier(
            l2_leaf_reg=int(params['l2_leaf_reg']),
            learning_rate=params['learning_rate'],
            bagging_temperature=params["bagging_temperature"],
            iterations=500,
            eval_metric='AUC',
            random_seed=99,
            verbose=False,
            loss_function='Logloss')
        _cv_data = catboost.cv(self.all_train_data, _model.get_params())
        best_accuracy = np.max(_cv_data['test-AUC-mean'])
        # hyperopt minimizes, so return the complement of the best AUC.
        return 1 - best_accuracy

    def FineTune_hyperopt(self, X, y, mute=False):
        self.dataset(X, y)
        params_space = {
            'l2_leaf_reg':
            hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
            'learning_rate':
            hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
            'bagging_temperature':
            hyperopt.hp.uniform("bagging_temperature", 0, 0.3)
        }
        trials = hyperopt.Trials()
        best = hyperopt.fmin(self.hyperopt_objective,
                             space=params_space,
                             algo=hyperopt.tpe.suggest,
                             max_evals=2,
                             trials=trials,
                             rstate=RandomState(self.random_state))
        if not mute:
            print("\nBest parameters:")
            print(best)
            print("\n")
        _parameters = self.params
        _parameters.update(best)
        _model = CatBoostClassifier(**_parameters)
        _cv_data = catboost.cv(self.all_train_data, _model.get_params())
        if not mute:
            print('\nPrecise validation accuracy score: {}'.format(
                np.max(_cv_data['test-Accuracy-mean'])))
        return best

    def FineTune_sklearn(self, X, y, mute=False, n_splits=10, n_iter=2):
        """
        https://www.kaggle.com/ksaaskil/pets-definitive-catboost-tuning
        """
        self.dataset(X, y)

        def build_search(modelo,
                         param_distributions,
                         cv=5,
                         n_iter=10,
                         verbose=1,
                         random_state=99):
            """
            Builder function for RandomizedSearch.
            """
            QWS = make_scorer(cohen_kappa_score, weights='quadratic')
            return RandomizedSearchCV(modelo,
                                      param_distributions=param_distributions,
                                      cv=cv,
                                      return_train_score=True,
                                      refit='cohen_kappa_quadratic',
                                      n_iter=n_iter,
                                      n_jobs=None,
                                      scoring={
                                          'accuracy':
                                          make_scorer(accuracy_score),
                                          'cohen_kappa_quadratic': QWS
                                      },
                                      verbose=verbose,
                                      random_state=random_state)

        def pretty_cv_results(cv_results,
                              sort_by='rank_test_cohen_kappa_quadratic',
                              sort_ascending=True,
                              n_rows=30):
            """
            Return a pretty Pandas dataframe from the `cv_results_` attribute
            of a finished parameter search, ranking by test performance and
            only keeping the columns of interest.
            """
            df = pd.DataFrame(cv_results)
            cols_of_interest = [
                key for key in df.keys()
                if key.startswith('param_') or key.startswith("mean_train")
                or key.startswith("std_train") or key.startswith("mean_test")
                or key.startswith("std_test")
                or key.startswith('mean_fit_time') or key.startswith('rank')
            ]
            return df.loc[:, cols_of_interest].sort_values(
                by=sort_by, ascending=sort_ascending).head(n_rows)

        def run_search(X_train, y_train, search, mute=False):
            search.fit(X_train, y_train)
            print('Best score is:', search.best_score_)
            return pretty_cv_results(search.cv_results_)

        param_distributions = {
            'iterations': [100, 200],
            'learning_rate': scipy.stats.uniform(0.01, 0.3),
            'max_depth': scipy.stats.randint(3, 10),
            'one_hot_max_size': [30],
            'l2_leaf_reg': scipy.stats.reciprocal(a=1e-2, b=1e1),
        }
        _verbose = 0 if mute else 1
        self.params.update({'use_best_model': False})
        _model = CatBoostClassifier(**self.params)
        catboost_search = build_search(
            _model,
            param_distributions=param_distributions,
            n_iter=n_iter,
            verbose=_verbose,
            cv=RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=1,
                                       random_state=self.random_state))
        catboost_cv_results = run_search(self.X,
                                         self.y,
                                         search=catboost_search,
                                         mute=mute)
        best_estimator = catboost_search.best_estimator_
        if not mute:
            print(best_estimator.get_params())
        return catboost_cv_results, best_estimator

    def __getattr__(self, attr):
        """
        Pass all other method calls to self.model.
        """
        return getattr(self.model, attr)
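# Usage sketch for modelCatBoost (a minimal example, assuming the JSON file
# behind `param_file` provides a "Models"/"CatBoost" block valid for a
# multiclass problem, including a "verbose" entry; the iris data is only a
# stand-in dataset for illustration).
if __name__ == "__main__":
    from sklearn.datasets import load_iris

    _iris = load_iris(as_frame=True)
    _X, _y = _iris.data, _iris.target

    cbt = modelCatBoost(name="CBT_demo")
    cbt.fit(_X, _y, plot=False, verbose=1)          # hold-out fit with eval set
    cv_scores = cbt.fit_cv(_X, _y, fold_count=4,
                           plot=False, verbose=0)   # CV over the full dataset
    print(cbt.predict(_X)[:5])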
class modelXGBoost(Training, BaseEstimator, ClassifierMixin):
    """
    XGBoost is an optimized distributed gradient boosting library designed to
    be highly efficient, flexible and portable. It implements machine learning
    algorithms under the Gradient Boosting framework. XGBoost provides a
    parallel tree boosting (also known as GBDT, GBM) that solves many data
    science problems in a fast and accurate way. The same code runs on major
    distributed environments (Hadoop, SGE, MPI) and can solve problems beyond
    billions of examples.

    Parameters
    ----------
    "min_child_weight": Minimum sum of instance weight (hessian) needed in a
        child.
    "objective": Learning task.
    "eval_metric": Evaluation metrics for validation data.
    "max_depth": Maximum depth of a tree. Increasing this value will make the
        model more complex and more likely to overfit.
    "max_delta_step": Maximum delta step we allow each leaf output to be. If
        the value is set to 0, it means there is no constraint.
    "sampling_method": The method to use to sample the training instances.
    "subsample": Subsample ratio of the training instances. Setting it to 0.5
        means that XGBoost would randomly sample half of the training data
        prior to growing trees, and this will prevent overfitting.
    "eta": Step size shrinkage used in update to prevent overfitting.
    "gamma": Minimum loss reduction required to make a further partition on a
        leaf node of the tree.
    "lambda": L2 regularization term on weights. Increasing this value will
        make the model more conservative.
    "alpha": L1 regularization term on weights. Increasing this value will
        make the model more conservative.
    "tree_method": The tree construction algorithm used in XGBoost.
    "predictor": The type of predictor algorithm to use.
    "num_parallel_tree": Number of parallel trees constructed during each
        iteration.
    ...

    Documentation
    -------------
    https://xgboost.readthedocs.io/en/latest/
    https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
    """

    def __init__(self, name="XGB", random_state=99, train_dir="", params=None,
                 *args, **kwargs):
        self.name = name
        self.train_dir = train_dir + "/" + "model_" + str(self.name) + "/"
        self.random_state = random_state
        if params is None:
            self.get_params_json()
            self.params.update({
                'model_dir': self.train_dir,
                "seed": self.random_state
            })
        else:
            self.params = params
        self.model = XGBClassifier(**self.params)
        super().__init__(self.model, random_state=self.random_state)

    def get_params_json(self):
        self.manager_models = ParamsManager(param_file, key_read="Models")
        self.params = self.manager_models.get_params()["XGBoost"]
        self.manager_finetune = ParamsManager(param_file, key_read="FineTune")
        self.params_finetune = self.manager_finetune.get_params()["XGBoost"]

    def dataset(self, X, y, categorical_columns_indices=None, test_size=0.2,
                *args, **kwarg):
        self.categorical_columns_indices = categorical_columns_indices
        self.X = X
        self.columns = list(X)
        self.y, self.cat_replace = self.replace_multiclass(y)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size,
            random_state=self.random_state)
        self.dtrain = xgb.DMatrix(self.X_train, label=self.y_train)
        self.dvalid = xgb.DMatrix(self.X_test, label=self.y_test)
        self.all_train_data = xgb.DMatrix(self.X, label=self.y)

    def set_dataset_nosplit(self, X_train, X_test, y_train, y_test,
                            categorical_columns_indices=None, *args, **kwarg):
        self.categorical_columns_indices = categorical_columns_indices
        self.columns = list(X_train)
        _ytrain, _ = self.replace_multiclass(y_train)
        _ytest, _ = self.replace_multiclass(y_test)
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.X = pd.concat([X_train, X_test], axis=0)
        self.y = pd.concat([y_train, y_test], axis=0)
        self.dtrain = xgb.DMatrix(self.X_train, label=self.y_train)
        self.dvalid = xgb.DMatrix(self.X_test, label=self.y_test)
        self.all_train_data = xgb.DMatrix(self.X, label=self.y)

    def replace_multiclass(self, targets):
        _unic = targets.unique().tolist()
        _remp = np.arange(0, len(_unic)).tolist()
        return targets.replace(_unic, _remp), _unic

    def fit(self, X=None, y=None, X_train=None, X_test=None, y_train=None,
            y_test=None, mute=False, use_best_model=True, verbose=0,
            num_boost_round=100, nosplit=False, **kwargs):
        if not nosplit:
            self.dataset(X, y)
        else:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)
        self.params.update({'verbosity': verbose})
        self.model = xgb.train(self.params,
                               self.dtrain,
                               num_boost_round=num_boost_round,
                               **kwargs)
        _preds = self.model.predict(self.dvalid)
        preds_test = np.where(_preds > 0.5, 1, 0)
        score_test = accuracy_score(self.y_test, preds_test)
        _preds = self.model.predict(self.dtrain)
        preds_train = np.where(_preds > 0.5, 1, 0)
        score_train = accuracy_score(self.y_train, preds_train)
        if not mute:
            print("Accuracy on the training set ---> {:.2f}%".format(
                score_train * 100))
            print("Accuracy on the validation set -> {:.2f}%".format(
                score_test * 100))

    def fit_cv(self, X=None, y=None, X_train=None, X_test=None, y_train=None,
               y_test=None, num_boost_round=75, nfold=5, use_best_model=True,
               verbose=2, nosplit=False, early_stopping_rounds=75, **kwargs):
        """
        https://xgboost.readthedocs.io/en/latest/parameter.html
        """
        if not nosplit:
            self.dataset(X, y)
        else:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)
        self.params.update({'verbose_eval': verbose})
        self.xgb_cv = xgb.cv(self.params,
                             self.all_train_data,
                             num_boost_round,
                             nfold,
                             early_stopping_rounds=early_stopping_rounds,
                             stratified=True,
                             seed=self.random_state)
        loss = "test-" + self.params["metrics"][0]
        optimal_rounds = np.argmin(self.xgb_cv[str(loss) + '-mean'])
        best_cv_score = min(self.xgb_cv[str(loss) + '-mean'])
        if verbose != 0:
            print("\nOptimal Round: {}\nOptimal Score: {:.3f} + std:{:.3f}".format(
                optimal_rounds, best_cv_score,
                self.xgb_cv[str(loss) + '-std'][optimal_rounds]))
        results = {
            "Rounds": optimal_rounds,
            "Score": best_cv_score,
            "STDV": self.xgb_cv[str(loss) + '-std'][optimal_rounds],
            "LB": None,
            "Parameters": self.params
        }
        score = self.xgb_cv[str(loss) + '-mean'].mean()
        return score, results

    def func_acc(self, prob_pred, y_target):
        # Convert per-class probabilities to hard labels and score them.
        _y_pred = np.zeros(len(prob_pred))
        for i in range(0, len(prob_pred)):
            _y_pred[i] = int(np.argmax(prob_pred[i]))
        accuracy = accuracy_score(_y_pred, y_target)
        return accuracy

    def predict(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict(xgb.DMatrix(_X_copy), *args, **kwargs)

    def pred_binary(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        preds = self.model.predict(xgb.DMatrix(_X_copy), *args, **kwargs)
        return np.where(preds > 0.5, 1, 0)

    def pred_multiclass(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        return [
            np.argmax(line)
            for line in self.model.predict(xgb.DMatrix(_X_copy))
        ]

    def update_model(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self.model, k, v)

    def save_model(self, direct="./checkpoints", name="XGB_model",
                   file_model=".txt"):
        if not os.path.isdir(direct):
            try:
                os.mkdir(direct)
                print("Directory created: " + direct)
            except OSError:
                raise NameError("Error creating the directory")
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        if file_model == ".txt":
            filename = direct + "/" + name + "_" + current_time + ".txt"
            self.model.save_model(filename)
        elif file_model == ".pkl":
            filename = direct + "/" + name + "_" + current_time + ".pkl"
            joblib.dump(self.model, filename)
        else:
            raise NameError("Type {} not permitted".format(file_model))
        print("Model saved at: " + filename)

    def load_model(self, direct="./checkpoints/XGB_model.txt",
                   file_model=".txt"):
        if not os.path.isfile(direct):
            print("The specified file does not exist")
        if file_model == ".txt":
            self.model = XGBClassifier()
            self.model.load_model(direct)
        elif file_model == ".pkl":
            self.model = joblib.load(direct)
        else:
            raise NameError("Type {} not permitted".format(file_model))
        print("Model loaded from: " + direct)

    def index_features(self, features):
        _index = [self.X.columns.get_loc(i) for i in features]
        if not _index:
            raise NameError("None of the given features match")
        return _index

    def get_important_features(self, display=True, max_num_features=20):
        _model = XGBClassifier()
        _model.fit(self.X, self.y)
        _data = np.array([self.X.columns, _model.feature_importances_])
        _feature_importance_df = pd.DataFrame(
            _data.T, columns=["Feature Id", "Importances"])
        _feature_importance_df = _feature_importance_df.sort_values(
            by=['Importances'], ascending=False)
        if display:
            plt.figure(figsize=(12, 6))
            sns.barplot(x="Importances",
                        y="Feature Id",
                        data=_feature_importance_df)
            plt.title('XGBoost features importance:')
        return _feature_importance_df

    def FineTune_SearchCV(self, X=None, y=None, X_train=None, X_test=None,
                          y_train=None, y_test=None, params=None,
                          params_finetune=None, ROC=False, randomized=True,
                          cv=10, n_iter=10, replace_model=True, verbose=0,
                          nosplit=False, finetune_dir=""):
        self.get_params_json()
        self.finetune_dir = finetune_dir + "/" + "model_finetune_" + str(
            self.name) + "/"
        self.params.update({
            'train_dir': self.finetune_dir,
            "seed": self.random_state
        })
        if params is not None:
            self.params = params
        if params_finetune is not None:
            self.params_finetune = params_finetune
        if not nosplit:
            self.dataset(X, y)
        else:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)
        self.params.update({'verbosity': verbose})
        self.model = XGBClassifier(**self.params)
        self._best_Parameters, self.results_df = self.FineTune(
            self.model,
            self.X_train,
            self.y_train,
            self.params_finetune,
            randomized=True,
            cv=cv,
            n_iter=n_iter,
            verbose=1)
        self.params.update(**self._best_Parameters)
        self.fit(self.X_train, self.y_train, verbose=1)
        score = accuracy_score(self.y_test, self.pred_multiclass(self.X_test))
        print("Test-set score with the optimal parameters: {:.2f}%".format(
            score * 100))
        print("\n")
        print("Classification report on the test set: ")
        self.evaluate(self.model, xgb.DMatrix(self.X_test), self.y_test)
        print("\n")
        print("Cross-validation on the whole dataset: ")
        print("\n")
        self.KFold_CrossValidation(XGBClassifier(**self._best_Parameters),
                                   self.X_test,
                                   self.y_test,
                                   n_splits=cv,
                                   ROC=ROC,
                                   shuffle=True,
                                   mute=False,
                                   logdir_report="",
                                   display=True,
                                   save_image=True,
                                   verbose=0)
        return self._best_Parameters, self.results_df

    def SeedDiversification_cv(self, X=None, y=None, X_train=None,
                               X_test=None, y_train=None, y_test=None,
                               n_iter=10, n_max=2022, cv=10, nosplit=False,
                               finetuneseed_dir="", display=True,
                               save_image=True, verbose=0):
        allmodelstart = time.time()
        self.get_params_json()
        self.finetune_dir = finetuneseed_dir + "/" + "model_finetune_seed" + str(
            self.name) + "/"
        self.params.update({
            'train_dir': self.finetune_dir,
            'verbosity': verbose
        })
        if not nosplit:
            self.dataset(X, y)
        else:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)
        self.params.update({'verbosity': verbose})
        self.model = XGBClassifier(**self.params)
        # Draw n_iter random seeds in [0, n_max) for the diversification runs.
        _rd = np.random.uniform(0, n_max, n_iter).astype(np.int32).tolist()
        params_finetuneseed = {"seed": _rd}
        del _rd
        self._best_Parameters, self.results_df = self.FineTune(
            self.model,
            self.X,
            self.y,
            params_finetuneseed,
            randomized=False,
            cv=cv,
            n_iter=n_iter,
            verbose=1,
            mute=True)
        print("All Model Runtime: %0.2f Minutes" %
              ((time.time() - allmodelstart) / 60))
        print("Seed diversification - mean AUC: {:.2f}% - std AUC: {:.5f}".format(
            self.results_df['mean_test_AUC'].mean() * 100,
            self.results_df['std_test_AUC'].mean()))
        print("Seed diversification - mean Acc: {:.2f}% - std Acc: {:.5f}".format(
            self.results_df['mean_test_Accuracy'].mean() * 100,
            self.results_df['std_test_Accuracy'].mean()))
        return self._best_Parameters, self.results_df

    def __getattr__(self, attr):
        """
        Pass all other method calls to self.model.
        """
        return getattr(self.model, attr)
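# Usage sketch for modelXGBoost (a minimal example, assuming the JSON file
# behind `param_file` provides a "Models"/"XGBoost" block suited to binary
# classification and that the project-local Training base class is available;
# the breast-cancer data is only a stand-in dataset for illustration).
if __name__ == "__main__":
    from sklearn.datasets import load_breast_cancer

    _bc = load_breast_cancer(as_frame=True)
    xgb_model = modelXGBoost(name="XGB_demo")
    xgb_model.fit(_bc.data, _bc.target, verbose=0)   # prints train/valid accuracy
    print(xgb_model.pred_binary(_bc.data)[:10])      # thresholded predictions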
class PipelineClasificators(Training):

    def __init__(self, random_state=99):
        """
        Initialization of the models class.
        """
        self.random_state = random_state
        self.manager_models = ParamsManager(param_file, key_read="Models")
        self.manager_finetune = ParamsManager(param_file, key_read="FineTune")

    def add_model(self, model):
        self.model = model

    def KNearestNeighbors(self):
        self.KNN = KNeighborsClassifier()
        return self.KNN

    def NaiveBayes(self):
        """
        Naive Bayes assumes the data to be normally distributed, which can be
        encouraged by scaling with MaxAbsScaler.
        """
        self.NB = GaussianNB()
        return self.NB

    def RandomForestClassifier(self):
        """
        n_jobs: parallelism of the computation.
        oob_score: True, out-of-bag sampling.
        n_estimators: number of trees in the forest.
        max_features: maximum number of features considered when splitting a node.
        max_depth: maximum number of levels in each decision tree.
        min_samples_split: minimum number of data points placed in a node before it is split.
        min_samples_leaf: minimum number of data points allowed in a leaf node.
        bootstrap: method for sampling data points (with or without replacement).
        """
        _rf = self.manager_models.get_params()["RandomForestClassifier"]
        self.modelRF = RandomForestClassifier(
            n_estimators=_rf["n_estimators"],
            criterion=_rf["criterion"],
            max_depth=_rf["max_depth"],
            min_samples_split=_rf["min_samples_split"],
            min_samples_leaf=_rf["min_samples_leaf"],
            min_weight_fraction_leaf=_rf["min_weight_fraction_leaf"],
            max_features=_rf["max_features"],
            min_impurity_decrease=_rf["min_impurity_decrease"],
            bootstrap=_rf["bootstrap"],
            oob_score=_rf["oob_score"],
            n_jobs=_rf["n_jobs"],
            random_state=self.random_state,
            verbose=_rf["verbose"],
            warm_start=_rf["warm_start"],
            # ccp_alpha=_rf["ccp_alpha"]
        )
        return self.modelRF

    def AdaBoostClassifier(self, **params):
        from sklearn.ensemble import AdaBoostClassifier
        return AdaBoostClassifier(random_state=self.random_state, **params)

    def GradientBoostingClassifier(self, **params):
        from sklearn.ensemble import GradientBoostingClassifier
        return GradientBoostingClassifier(random_state=self.random_state,
                                          **params)

    def ExtraTreesClassifier(self, **params):
        from sklearn.ensemble import ExtraTreesClassifier
        return ExtraTreesClassifier(random_state=self.random_state, **params)

    def SupportVectorMachine(self, **params):
        self.SVM = SVC(**params)
        return self.SVM

    def XGBoost(self, name="XGB"):
        """
        Wrapper around modelXGBoost; see that class's docstring for the
        parameter descriptions.

        https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
        """
        self.modelXGBoost = modelXGBoost(name=name,
                                         random_state=self.random_state)
        return self.modelXGBoost

    def LightBoost(self, name="LBT"):
        self.LBoost = modelLightBoost(name=name,
                                      random_state=self.random_state)
        return self.LBoost

    def CatBoost(self, name="CBT"):
        self.CBoost = modelCatBoost(name=name,
                                    random_state=self.random_state)
        return self.CBoost

    def append_summary(self, model, X_train, X_test, y_train, y_test, name):
        train_start = time.perf_counter()
        score, _, _ = self.eval_Kfold_CV(model, X_train, X_test, y_train,
                                         y_test, n_splits=self.n_splits,
                                         shuffle=True, mute=True)
        train_end = time.perf_counter()
        prediction_start = time.perf_counter()
        _ = model.predict(X_test)
        prediction_end = time.perf_counter()
        self.names.append(name)
        self.utrain.append(train_end - train_start)
        self.utimes.append(prediction_end - prediction_start)
        return score

    def Pipeline_SelectModel(self, X, y, n_splits=5, select="XGBoost"):
        # Candidate models to evaluate
        self.scores = []
        self.names = []
        self.utrain = []
        self.utimes = []
        self.n_splits = n_splits
        self.features = X.columns.tolist()
        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=self.random_state)
        # KNearestNeighbors
        if self.manager_models.get_params()["select_models"]["KNN"]:
            _model = self.KNearestNeighbors()
            score = self.append_summary(_model, X_train, X_test, y_train,
                                        y_test, "KNearestNeighbors")
            self.scores.append(score)
            if select == "KNearestNeighbors":
                self.add_model(_model)
            print("Model: KNearestNeighbors --> Mean Accuracy: {:.3f}%\n".format(
                score * 100))
        # NaiveBayes
        if self.manager_models.get_params()["select_models"]["NaiveBayes"]:
            from sklearn.preprocessing import MaxAbsScaler
            _model = self.NaiveBayes()
            scaler_gnb = MaxAbsScaler()
            sdss_train = scaler_gnb.fit_transform(X_train)
            # Reuse the scaler fitted on the training set for the test set.
            sdss_test = scaler_gnb.transform(X_test)
            pd_sdss_train = pd.DataFrame(columns=X_train.columns.tolist(),
                                         data=sdss_train)
            pd_sdss_test = pd.DataFrame(columns=X_test.columns.tolist(),
                                        data=sdss_test)
            score = self.append_summary(_model, pd_sdss_train, pd_sdss_test,
                                        y_train, y_test, "NaiveBayes")
            self.scores.append(score)
            if select == "NaiveBayes":
                self.add_model(_model)
            print("Model: NaiveBayes --> Mean Accuracy: {:.3f}%\n".format(
                score * 100))
        # SupportVectorMachine
        if self.manager_models.get_params()["select_models"]["SVM"]:
            _model = self.SupportVectorMachine()
            score = self.append_summary(_model, X_train, X_test, y_train,
                                        y_test, "SupportVectorMachine")
            self.scores.append(score)
            if select == "SupportVectorMachine":
                self.add_model(_model)
            print("Model: SupportVectorMachine --> Mean Accuracy: {:.3f}%\n".format(
                score * 100))
        # RandomForestClassifier
        if self.manager_models.get_params()["select_models"]["RandomForestClassifier"]:
            _model = self.RandomForestClassifier()
            score = self.append_summary(_model, X_train, X_test, y_train,
                                        y_test, "RandomForestClassifier")
            self.scores.append(score)
            if select == "RandomForestClassifier":
                self.add_model(_model)
            print("Model: RandomForestClassifier --> Mean Accuracy: {:.3f}%\n".format(
                score * 100))
        # XGBoost
        if self.manager_models.get_params()["select_models"]["XGBoost"]:
            _model = self.XGBoost(name="XGBoost")
            _model.fit(X, y, verbose=0, mute=True)
            train_start = time.perf_counter()
            score, _ = _model.fit_cv(X, y, nfold=n_splits, verbose=0)
            self.scores.append(score)
            train_end = time.perf_counter()
            prediction_start = time.perf_counter()
            _ = _model.predict(X_test)
            prediction_end = time.perf_counter()
            if select == "XGBoost":
                self.add_model(_model)
            print("Model: XGBoost --> Mean Accuracy: {:.3f}%\n".format(
                score * 100))
            self.names.append("XGBoost")
            self.utrain.append(train_end - train_start)
            self.utimes.append(prediction_end - prediction_start)
        # LightGBM
        if self.manager_models.get_params()["select_models"]["LightGBM"]:
            _model = self.LightBoost(name="LBT")
            _model.fit(X, y, verbose=0, mute=True)
            train_start = time.perf_counter()
            score, _ = _model.fit_cv(X, y, nfold=n_splits, verbose=0)
            self.scores.append(score)
            train_end = time.perf_counter()
            prediction_start = time.perf_counter()
            _ = _model.predict(X_test)
            prediction_end = time.perf_counter()
            if select == "LightGBM":
                self.add_model(_model.model)
            print("Model: LightGBM --> Mean Accuracy: {:.3f}%\n".format(
                score * 100))
            self.names.append("LightGBM")
            self.utrain.append(train_end - train_start)
            self.utimes.append(prediction_end - prediction_start)
        # CatBoost
        if self.manager_models.get_params()["select_models"]["CatBoost"]:
            _model = self.CatBoost(name="CBT")
            train_start = time.perf_counter()
            score = _model.fit_cv(X, y, fold_count=self.n_splits,
                                  shuffle=True, stratified=True, plot=False,
                                  verbose=0)
            self.scores.append(np.mean(score["test-Accuracy-mean"]))
            train_end = time.perf_counter()
            _model.fit(X, y, plot=False, verbose=0)
            prediction_start = time.perf_counter()
            _model.model.predict(_model.eval_data)
            prediction_end = time.perf_counter()
            if select == "CatBoost":
                self.add_model(_model.model)
            print("Model: CatBoost --> Mean Accuracy: {:.3f}%\n".format(
                np.mean(score["test-Accuracy-mean"]) * 100))
            self.names.append("CatBoost")
            self.utrain.append(train_end - train_start)
            self.utimes.append(prediction_end - prediction_start)
        resultados = pd.DataFrame({
            "Model": self.names,
            "Mean Accuracy": self.scores,
            "Training Time": self.utrain,
            "Prediction Time": self.utimes
        })
        return resultados

    def Pipeline_SelectEmsembleModel(self, X, y, n_splits=10, mute=False,
                                     scoring="accuracy", display=True,
                                     save_image=False, path="/", AB=True):
        X_train, y_train = X, y
        ensembles = []
        if AB:
            ensembles.append(('AB', self.AdaBoostClassifier()))
        ensembles.append(('GBM', self.GradientBoostingClassifier()))
        ensembles.append(('ET', self.ExtraTreesClassifier()))
        ensembles.append(
            ('RF', RandomForestClassifier(random_state=self.random_state)))
        ensembles.append(
            ('XGB', XGBClassifier(random_state=self.random_state)))
        ensembles.append(
            ('LGBM', LGBMClassifier(random_state=self.random_state)))
        results = []
        names = []
        for name, model in ensembles:
            kfold = StratifiedKFold(n_splits=n_splits, shuffle=True,
                                    random_state=self.random_state)
            cv_results = cross_val_score(model, X_train, y_train, cv=kfold,
                                         scoring=scoring)
            results.append(cv_results)
            names.append(name)
            msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
            if not mute:
                print(msg)
        scores = pd.DataFrame(np.asarray(results).T, columns=names)
        if display:
            figure, axs = plt.subplots(1, 2, figsize=(16, 5))
            ax = axs.flatten()
            # Compare algorithms
            ax[0].set_title('Ensemble Algorithm Comparison')
            ax[0].boxplot(results)
            ax[0].set_xticklabels(names)
            if AB:
                axis = ["AB", "GBM", "ET", "RF", "XGB", "LGM"]
            else:
                axis = ["GBM", "ET", "RF", "XGB", "LGM"]
            scores_mean = np.mean(scores, axis=0)
            scores_std = np.std(scores, axis=0)
            ax[1].grid()
            ax[1].fill_between(axis, scores_mean - scores_std,
                               scores_mean + scores_std, alpha=0.1, color="r")
            ax[1].plot(axis, scores_mean, 'o-', color="r", label="CV score")
            ax[1].legend(loc="best")
            ax[1].set_title('Cross-validation score')
            figure.tight_layout()
            plt.show()
        if save_image:
            plt.savefig(path)
        return scores

    def Pipeline_FeatureSelect(self, X, y, n_splits=10, mute=False,
                               scoring="accuracy", n_features=20,
                               display=True, save_image=False, path="/"):
        X_train, y_train = X, y
        models = []
        models.append(('GBM', self.GradientBoostingClassifier()))
        models.append(('ET', self.ExtraTreesClassifier()))
        models.append(
            ('RF', RandomForestClassifier(random_state=self.random_state)))
        models.append(('XGB', XGBClassifier(random_state=self.random_state)))
        models.append(('LGBM', LGBMClassifier(random_state=self.random_state)))
        results = []
        names = []
        df = pd.DataFrame()
        for name, model in models:
            if not mute:
                print("model: {}".format(name))
                print(".... Fitting")
            model.fit(X_train, y_train)
            if not mute:
                print(".... Permutation importance")
            result = permutation_importance(model, X_train, y_train,
                                            n_repeats=10, random_state=99)
            tree_importance_sorted_idx = np.argsort(model.feature_importances_)
            _ = np.arange(0, len(model.feature_importances_)) + 0.5
            name_features = "features_" + str(name)
            imp_features = "importance" + str(name)
            df[name_features] = X.columns[tree_importance_sorted_idx]
            df[imp_features] = model.feature_importances_[
                tree_importance_sorted_idx]
            # Keep the n_features most important columns for this model.
            features = df[name_features].values.tolist()[-n_features:]
            _X_train = X_train[features]
            _y_train = y_train
            if not mute:
                print(".... Select Features:")
                print(features)
                print(".... Cross Validation")
            kfold = StratifiedKFold(n_splits=n_splits, shuffle=True,
                                    random_state=99)
            cv_results = cross_val_score(model, _X_train, _y_train, cv=kfold,
                                         scoring=scoring)
            results.append(cv_results)
            names.append(name)
            msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
            if not mute:
                print(".... Append Results:")
                print(msg)
                print("\n")
        scores = pd.DataFrame(np.asarray(results).T, columns=names)
        if display:
            figure, axs = plt.subplots(1, 2, figsize=(16, 5))
            ax = axs.flatten()
            # Compare algorithms
            ax[0].set_title('Algorithm Comparison')
            ax[0].boxplot(results)
            ax[0].set_xticklabels(names)
            axis = ["GBM", "ET", "RF", "XGB", "LGM"]
            scores_mean = np.mean(scores, axis=0)
            scores_std = np.std(scores, axis=0)
            ax[1].grid()
            ax[1].fill_between(axis, scores_mean - scores_std,
                               scores_mean + scores_std, alpha=0.1, color="r")
            ax[1].plot(axis, scores_mean, 'o-', color="r", label="CV score")
            ax[1].legend(loc="best")
            ax[1].set_title('Cross-validation score')
            figure.tight_layout()
            plt.show()
        if save_image:
            plt.savefig(path)
        return scores, df

    def Pipeline_StackingClassifier(self, X, y, n_splits=5):
        # List of base models for the stack
        self.models = []
        # KNearestNeighbors
        if self.manager_models.get_params()["stacking_models"]["KNN"]:
            _model = self.KNearestNeighbors()
            self.models.append(("KNearestNeighbors", _model))
        # NaiveBayes
        if self.manager_models.get_params()["stacking_models"]["NaiveBayes"]:
            _model = self.NaiveBayes()
            self.models.append(("NaiveBayes", _model))
        # SupportVectorMachine
        if self.manager_models.get_params()["stacking_models"]["SVM"]:
            _model = self.SupportVectorMachine()
            self.models.append(("SupportVectorMachine", _model))
        # RandomForestClassifier
        if self.manager_models.get_params()["stacking_models"]["RandomForestClassifier"]:
            _model = self.RandomForestClassifier()
            self.models.append(("RandomForestClassifier", _model))
        # XGBoost
        if self.manager_models.get_params()["stacking_models"]["XGBoost"]:
            _model = self.XGBoost(name="XGBoost")
            self.models.append(("XGBoost", _model))
        # LightGBM
        if self.manager_models.get_params()["stacking_models"]["LightGBM"]:
            _model = self.LightBoost(name="LBT")
            self.models.append(("LightGBM", _model))
        # CatBoost
        if self.manager_models.get_params()["stacking_models"]["CatBoost"]:
            _model = self.CatBoost(name="CBT")
            self.models.append(("CatBoost", _model))

    def _cv_results(self, X_train, Y_train, model, kfold, name, verbose=1):
        cv_results = cross_val_score(model, X_train, Y_train, cv=kfold,
                                     scoring='accuracy')
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        if verbose > 0:
            print(msg)
        return cv_results

    def Ablacion_relativa(self, pipeline, X, y, n_splits=10, mute=False,
                          std=True, scoring="accuracy", display=True,
                          save_image=False, path="/"):
        kfold = StratifiedKFold(n_splits=n_splits, shuffle=True,
                                random_state=99)
        models = []
        models.append(('AB', self.AdaBoostClassifier()))
        models.append(('GBM', self.GradientBoostingClassifier()))
        models.append(
            ('RF', RandomForestClassifier(random_state=self.random_state)))
        models.append(('ET', self.ExtraTreesClassifier()))
        models.append(('LGM', LGBMClassifier(random_state=self.random_state)))
        models.append(('XGB', XGBClassifier(random_state=self.random_state)))
        models.append(('SVM', self.SupportVectorMachine()))
        models.append(('KNN', self.KNearestNeighbors()))
        scores_mean = []
        scores_std = []
        names_models = []
        for name_model, model in models:
            names = []
            results = []
            name = "Initial"
            if not mute:
                print("\n", name_model)
            resu = self._cv_results(X, y, model, kfold, name)
            results.append(resu)
            names.append(name)
            # Score the model again after each transformation of the pipeline.
            for name, transf in pipeline:
                X_train = transf.fit_transform(X, y)
                Y_train = y
                resu = self._cv_results(X_train, Y_train, model, kfold, name)
                results.append(resu)
                names.append(name)
            scores = pd.DataFrame(np.asarray(results).T, columns=names)
            scores_mean.append(np.mean(scores, axis=0))
            scores_std.append(np.std(scores, axis=0))
            names_models.append(name_model)
        if display:
            fig, ax = plt.subplots(figsize=(14, 6))
            for i in range(len(scores_mean)):
                valor = scores_mean[i] - scores_mean[i].iloc[0]
                if std:
                    ax.fill_between(names, valor - scores_std[i],
                                    valor + scores_std[i], alpha=0.1)
                ax.plot(names, valor, 'o-', label=names_models[i], alpha=0.9)
            ax.plot(names, np.zeros(len(names)), 'ro--', label="zero",
                    alpha=0.9)
            ax.grid()
            ax.legend(loc="best")
            ax.set_title('Improvements relative to the initial model')
            fig.tight_layout()
            fig.show()
        if save_image:
            plt.savefig(path)
        return scores_mean, scores_std

    def features_importances(self, clf, X, y, display=True, save_image=False,
                             path="/"):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=99, stratify=y)
        clf.fit(X_train, y_train)
        print("Accuracy on test data: {:.2f}".format(
            clf.score(X_test, y_test)))
        result = permutation_importance(clf, X_train, y_train, n_repeats=10,
                                        random_state=99)
        tree_importance_sorted_idx = np.argsort(clf.feature_importances_)
        _ = np.arange(0, len(clf.feature_importances_)) + 0.5
        df = pd.DataFrame()
        df["feature"] = X.columns[tree_importance_sorted_idx]
        df["importance"] = clf.feature_importances_[tree_importance_sorted_idx]
        if display:
            _, _ = plt.subplots(figsize=(10, 30))
            sns.barplot(x="importance",
                        y="feature",
                        data=df.sort_values(by="importance", ascending=False))
            plt.title('Features (avg over folds)')
            plt.show()
        if save_image:
            plt.savefig(path)
        return df

    def eval_Kfold_CV(self, model, X, X_test, y, y_test, n_splits=3,
                      shuffle=True, mute=True):
        resultados, score_general_test = self.KFold_CrossValidation(
            model, X, y, n_splits=n_splits, shuffle=shuffle, mute=mute)
        _predictions = model.predict(X_test)
        score = accuracy_score(y_true=y_test, y_pred=_predictions)
        return score, resultados, score_general_test

    def func_acc(self, prob_pred, y_target):
        _y_pred = np.zeros(len(prob_pred))
        for i in range(0, len(prob_pred)):
            _y_pred[i] = int(np.argmax(prob_pred[i]))
        accuracy = accuracy_score(_y_pred, y_target)
        return accuracy

    def pred_binary(self, prob_pred, y_target, th=0.5):
        return accuracy_score(y_target, np.where(prob_pred > th, 1, 0))

    def replace_multiclass(self, targets):
        _unic = targets.unique().tolist()
        _remp = np.arange(0, len(_unic)).tolist()
        return targets.replace(_unic, _remp), _unic

    def Pipeline_GridSearch(self):
        pass
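# Usage sketch for PipelineClasificators (a minimal example, assuming the
# "select_models" section of the params JSON enables at least one model and
# that the boosting wrappers can read their parameter blocks; the iris data
# is only a stand-in dataset for illustration).
if __name__ == "__main__":
    from sklearn.datasets import load_iris

    _iris = load_iris(as_frame=True)
    pipe = PipelineClasificators(random_state=99)
    summary = pipe.Pipeline_SelectModel(_iris.data, _iris.target,
                                        n_splits=5, select="XGBoost")
    print(summary)   # one row per evaluated model: accuracy plus timings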
class modelLightBoost(Training, BaseEstimator, ClassifierMixin):
    """
    Multiclass example:
    https://www.kaggle.com/nicapotato/multi-class-lgbm-cv-and-seed-diversification
    """

    def __init__(self, name="LGB", random_state=99, train_dir="", params=None,
                 *args, **kwargs):
        self.name = name
        self.train_dir = train_dir + "/" + "model_" + str(self.name) + "/"
        self.random_state = random_state
        if params is None:
            self.get_params_json()
            self.params.update({
                'model_dir': self.train_dir,
                "seed": self.random_state
            })
        else:
            self.params = params
        self.model = LGBMClassifier(**self.params)
        super().__init__(self.model, random_state=self.random_state)

    def get_params_json(self):
        self.manager_models = ParamsManager(param_file, key_read="Models")
        self.params = self.manager_models.get_params()["LightBoost"]
        self.manager_finetune = ParamsManager(param_file, key_read="FineTune")
        self.params_finetune = self.manager_finetune.get_params()["LightBoost"]

    def dataset(self, X, y, categorical_columns_indices=None, test_size=0.2,
                *args, **kwarg):
        self.categorical_columns_indices = categorical_columns_indices
        self.X = X
        self.columns = list(X)
        self.y, self.cat_replace = self.replace_multiclass(y)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size,
            random_state=self.random_state)
        self.dtrain = lgb.Dataset(self.X_train.values,
                                  label=self.y_train.values,
                                  feature_name=self.X_train.columns.tolist())
        self.dvalid = lgb.Dataset(self.X_test.values,
                                  label=self.y_test.values,
                                  feature_name=self.X_test.columns.tolist())
        self.all_train_data = lgb.Dataset(self.X.values,
                                          label=self.y.values,
                                          feature_name=self.X.columns.tolist())

    def set_dataset_nosplit(self, X_train, X_test, y_train, y_test,
                            categorical_columns_indices=None, *args, **kwarg):
        self.categorical_columns_indices = categorical_columns_indices
        self.columns = list(X_train)
        _ytrain, _ = self.replace_multiclass(y_train)
        _ytest, _ = self.replace_multiclass(y_test)
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.X = pd.concat([X_train, X_test], axis=0)
        self.y = pd.concat([y_train, y_test], axis=0)
        self.dtrain = lgb.Dataset(self.X_train.values,
                                  label=self.y_train.values,
                                  feature_name=self.X_train.columns.tolist())
        self.dvalid = lgb.Dataset(self.X_test.values,
                                  label=self.y_test.values,
                                  feature_name=self.X_test.columns.tolist())
        self.all_train_data = lgb.Dataset(self.X.values,
                                          label=self.y.values,
                                          feature_name=self.X.columns.tolist())

    def replace_multiclass(self, targets):
        _unic = targets.unique().tolist()
        _remp = np.arange(0, len(_unic)).tolist()
        return targets.replace(_unic, _remp), _unic

    def fit(self, X=None, y=None, X_train=None, X_test=None, y_train=None,
            y_test=None, mute=False, use_best_model=True, verbose=0,
            num_boost_round=100, nosplit=False, **kwargs):
        if not nosplit:
            self.dataset(X, y)
        else:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)
        self.params.update({'verbose': verbose})
        self.model = lgb.train(self.params,
                               self.dtrain,
                               num_boost_round=num_boost_round,
                               verbose_eval=verbose,
                               **kwargs)
        preds_test = [
            np.argmax(line) for line in self.model.predict(
                self.X_test, num_iteration=self.model.best_iteration)
        ]
        score_test = accuracy_score(self.y_test, preds_test)
        preds_train = [
            np.argmax(line) for line in self.model.predict(
                self.X_train, num_iteration=self.model.best_iteration)
        ]
        score_train = accuracy_score(self.y_train, preds_train)
        if not mute:
            print("Accuracy on the training set ---> {:.2f}%".format(
                score_train * 100))
            print("Accuracy on the validation set -> {:.2f}%".format(
                score_test * 100))

    def fit_cv(self, X=None, y=None, X_train=None, X_test=None, y_train=None,
               y_test=None, nfold=5, use_best_model=True, verbose=200,
               nosplit=False, early_stopping_rounds=150, num_boost_round=2000,
               **kwargs):
        if not nosplit:
            self.dataset(X, y)
        else:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)
        self.params.update({'verbose': verbose})
        self.lgb_cv = lgb.cv(params=self.params,
                             train_set=self.all_train_data,
                             num_boost_round=num_boost_round,
                             stratified=True,
                             nfold=nfold,
                             seed=self.random_state,
                             early_stopping_rounds=early_stopping_rounds,
                             **kwargs)
        loss = self.params["metric"]
        optimal_rounds = np.argmin(self.lgb_cv[str(loss) + '-mean'])
        best_cv_score = min(self.lgb_cv[str(loss) + '-mean'])
        if verbose != 0:
            print("\nOptimal Round: {}\nOptimal Score: {:.3f} + stdv:{:.3f}".format(
                optimal_rounds, best_cv_score,
                self.lgb_cv[str(loss) + '-stdv'][optimal_rounds]))
        results = {
            "Rounds": optimal_rounds,
            "Score": best_cv_score,
            "STDV": self.lgb_cv[str(loss) + '-stdv'][optimal_rounds],
            "LB": None,
            "Parameters": self.params
        }
        score = np.mean(self.lgb_cv[str(loss) + '-mean'])
        return score, results

    def func_acc(self, prob_pred, y_target):
        # Convert per-class probabilities to hard labels and score them.
        _y_pred = np.zeros(len(prob_pred))
        for i in range(0, len(prob_pred)):
            _y_pred[i] = int(np.argmax(prob_pred[i]))
        accuracy = accuracy_score(_y_pred, y_target)
        return accuracy

    def pred_binary(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        preds = self.model.predict(_X_copy, *args, **kwargs)
        return np.where(preds > 0.5, 1, 0)

    def pred_multiclass(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        return [
            np.argmax(line) for line in self.model.predict(
                _X_copy, num_iteration=self.model.best_iteration)
        ]

    def update_model(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self.model, k, v)

    def save_model(self, direct="./checkpoints", name="LGM_model",
                   file_model=".txt"):
        if not os.path.isdir(direct):
            try:
                os.mkdir(direct)
                print("Directory created: " + direct)
            except OSError:
                raise NameError("Error creating the directory")
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        if file_model == ".txt":
            filename = direct + "/" + name + "_" + current_time + ".txt"
            self.model.save_model(filename)
        elif file_model == ".pkl":
            filename = direct + "/" + name + "_" + current_time + ".pkl"
            joblib.dump(self.model, filename)
        else:
            raise NameError("Type {} not permitted".format(file_model))
        print("Model saved at: " + filename)

    def load_model(self, direct="./checkpoints/LGM_model.txt",
                   file_model=".txt"):
        if not os.path.isfile(direct):
            print("The specified file does not exist")
        if file_model == ".txt":
            self.model = lgb.Booster(model_file=direct)
        elif file_model == ".pkl":
            self.model = joblib.load(direct)
        else:
            raise NameError("Type {} not permitted".format(file_model))
        print("Model loaded from: " + direct)

    def predict(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict(_X_copy, *args, **kwargs)

    def predict_proba(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict_proba(_X_copy, *args, **kwargs)

    def index_features(self, features):
        _index = [self.X.columns.get_loc(i) for i in features]
        if not _index:
            raise NameError("None of the given features match")
        return _index

    def get_important_features(self, display=True, max_num_features=20):
        if display:
            lgb.plot_importance(self.model,
                                max_num_features=max_num_features,
                                figsize=(6, 6),
                                title='Feature importance (LightGBM)')
            plt.show()

    def FineTune_SearchCV(self, X=None, y=None, X_train=None, X_test=None,
                          y_train=None, y_test=None, params=None,
                          params_finetune=None, ROC=False, randomized=True,
                          cv=10, display_ROC=True, verbose=0, n_iter=10,
                          replace_model=True, nosplit=False, finetune_dir=""):
        self.get_params_json()
        self.finetune_dir = finetune_dir + "/" + "model_finetune_" + str(
            self.name) + "/"
        self.params.update({
            'train_dir': self.finetune_dir,
            "seed": self.random_state
        })
        if params is not None:
            self.params = params
        if params_finetune is not None:
            self.params_finetune = params_finetune
        if not nosplit:
            self.dataset(X, y)
        else:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)
        self.params.update({'verbosity': verbose})
        self.model = LGBMClassifier(**self.params)
        self._best_Parameters, self.results_df = self.FineTune(
            self.model,
            self.X_train,
            self.y_train,
            self.params_finetune,
            cv=cv,
            randomized=True,
            n_iter=n_iter,
            verbose=1)
        self.params.update(**self._best_Parameters)
        self.fit(self.X_train, self.y_train)
        print("\n")
        score = accuracy_score(self.y_test, self.pred_multiclass(self.X_test))
        print("\n")
        print("Test-set score with the optimal parameters: {:.2f}%".format(
            score * 100))
        print("\n")
        print("Classification report on the test set: ")
        self.evaluate(self.model, self.X_test, self.y_test)
        print("\n")
        print("Cross-validation on the whole dataset: ")
        print("\n")
        self.KFold_CrossValidation(LGBMClassifier(**self._best_Parameters),
                                   self.X,
                                   self.y,
                                   n_splits=cv,
                                   ROC=ROC,
                                   shuffle=True,
                                   mute=False,
                                   logdir_report="",
                                   display=True,
                                   save_image=True,
                                   verbose=0)
        return self._best_Parameters, self.results_df

    def SeedDiversification_cv(self, X=None, y=None, X_train=None,
                               X_test=None, y_train=None, y_test=None,
                               n_iter=10, n_max=2022, cv=10, nosplit=False,
                               finetuneseed_dir="", display=True,
                               save_image=True, verbose=0):
        allmodelstart = time.time()
        self.get_params_json()
        self.finetune_dir = finetuneseed_dir + "/" + "model_finetune_seed" + str(
            self.name) + "/"
        self.params.update({
            'train_dir': self.finetune_dir,
            'verbosity': verbose
        })
        if not nosplit:
            self.dataset(X, y)
        else:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)
        self.params.update({'verbosity': verbose})
        self.model = LGBMClassifier(**self.params)
        # Draw n_iter random seeds in [0, n_max) for the diversification runs.
        _rd = np.random.uniform(0, n_max, n_iter).astype(np.int32).tolist()
        params_finetuneseed = {"seed": _rd}
        del _rd
        self._best_Parameters, self.results_df = self.FineTune(
            self.model,
            self.X,
            self.y,
            params_finetuneseed,
            randomized=False,
            cv=cv,
            n_iter=n_iter,
            verbose=1,
            mute=True)
        print("All Model Runtime: %0.2f Minutes" %
              ((time.time() - allmodelstart) / 60))
        print("Seed diversification - mean AUC: {:.2f}% - std AUC: {:.5f}".format(
            self.results_df['mean_test_AUC'].mean() * 100,
            self.results_df['std_test_AUC'].mean()))
        print("Seed diversification - mean Acc: {:.2f}% - std Acc: {:.5f}".format(
            self.results_df['mean_test_Accuracy'].mean() * 100,
            self.results_df['std_test_Accuracy'].mean()))
        return self._best_Parameters, self.results_df

    def SeedDiversification_fs(self, X, y, params, n_iter=10, mute=False,
                               logdir_report="", display=True,
                               save_image=True):
        allmodelstart = time.time()
        # Run the model with different seeds and aggregate feature importances.
        all_feature_importance_df = pd.DataFrame()
        _y, _ = self.replace_multiclass(y)
        all_seeds = np.random.uniform(1, 1000,
                                      n_iter).astype(np.int32).tolist()
        for seeds_x in all_seeds:
            modelstart = time.time()
            print("Seed: ", seeds_x)
            params["seed"] = seeds_x
            model = lgb.train(params,
                              lgb.Dataset(X.values, label=_y.values),
                              verbose_eval=100)
            # Feature importance for this seed
            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = X.columns.tolist()
            fold_importance_df["importance"] = model.feature_importance()
            all_feature_importance_df = pd.concat(
                [all_feature_importance_df, fold_importance_df], axis=0)
            if not mute:
                print("Model Runtime: %0.2f seconds" %
                      ((time.time() - modelstart)))
                print("#" * 50)
            del model
        cols = all_feature_importance_df[[
            "feature", "importance"
        ]].groupby("feature").mean().sort_values(by="importance",
                                                 ascending=False)[:50].index
        best_features = all_feature_importance_df.loc[
            all_feature_importance_df.feature.isin(cols)]
        plt.figure(figsize=(8, 10))
        sns.barplot(x="importance",
                    y="feature",
                    data=best_features.sort_values(by="importance",
                                                   ascending=False))
        plt.title('LightGBM Features (avg over folds)')
        plt.tight_layout()
        if display:
            plt.show()
        if save_image:
            filename = logdir_report + 'lgb_importances.png'
            plt.savefig(filename)
        print("All Model Runtime: %0.2f Minutes" %
              ((time.time() - allmodelstart) / 60))

    def __getattr__(self, attr):
        """
        Pass all other method calls to self.model.
        """
        return getattr(self.model, attr)