from category_encoders.cat_boost import CatBoostEncoder
from sklearn.base import BaseEstimator, TransformerMixin


class catboost_enc(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible wrapper around CatBoostEncoder."""

    def __init__(self, columns):
        self.columns = columns

    def fit(self, df, y=None):
        # handle_unknown='value' encodes unseen categories with the target mean.
        self.encoder = CatBoostEncoder(handle_unknown='value', cols=self.columns)
        self.encoder = self.encoder.fit(df, y)
        return self

    def transform(self, df, y=None):
        df_ = df.copy()
        return self.encoder.transform(df_)

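# A minimal usage sketch (not from the source): assumes a pandas DataFrame with
# one categorical column and a binary target series aligned with it.
import pandas as pd

df_demo = pd.DataFrame({"city": ["a", "b", "a", "c"], "amount": [1.0, 2.0, 3.0, 4.0]})
y_demo = pd.Series([0, 1, 0, 1])

enc = catboost_enc(columns=["city"])
encoded = enc.fit(df_demo, y_demo).transform(df_demo)  # "city" becomes numeric; "amount" is untouched
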
def _run(self):
    from category_encoders.cat_boost import CatBoostEncoder

    data = self.input[0]
    num_cols = self.input[1]
    cat_cols = self.input[2]

    # Fit the encoder only on labelled rows (isFraud == -1 marks unlabelled data).
    train = data[data['isFraud'] != -1]
    X = train.drop('isFraud', axis=1)
    y = train['isFraud'].astype(np.uint8)
    del train

    encoder = CatBoostEncoder(verbose=1, cols=cat_cols)
    encoder.fit(X, y)

    # Apply the fitted encoder to the full dataset, then re-attach the label.
    cat_data: pd.DataFrame = data.drop('isFraud', axis=1)
    cat_data = encoder.transform(cat_data)
    cat_data = cat_data.join(data['isFraud'])
    self.output = cat_data

with open("y_train.pkl", "rb") as f:
    y_train = pickle.load(f)
with open("y_test.pkl", "rb") as f:
    y_test = pickle.load(f)
with open("label_encoder.pkl", "rb") as f:
    encoder = pickle.load(f)

cols_cat = ["ZONA_METROPOLITANA", "CODIGO_POSTAL", "ruido", "CALIDAD_AIRE"]
cols_float = [col for col in X_train.columns if col not in cols_cat]
X_train[cols_float] = X_train[cols_float].astype("float")
X_test[cols_float] = X_test[cols_float].astype("float")

# Target-encode the categorical columns; fit on the training split only to avoid leakage.
cat_encoder = CatBoostEncoder(cols=cols_cat)
X_train = cat_encoder.fit_transform(X_train, y_train)
X_test = cat_encoder.transform(X_test)

if "Oeste" in X_train.columns:
    X_train = X_train.drop("Oeste", axis=1)
    X_test = X_test.drop("Oeste", axis=1)

labs_names = [c for c in encoder.classes_]

if not args.stacking:
    model = models_dic[args.model]["model"]
    params = models_dic[args.model]["parameters"]
else:
    model = stacking_models[args.model]["model"]
    params = stacking_models[args.model]["parameters"]

counter = dict(Counter(y_train))
if not args.stacking:
    # Undersample class 5 to 11% of its original count.
    samp_strategy = {5: int(0.11 * counter[5])}
    model.set_params(**{"model__sampling_strategy": samp_strategy})

def cat_encode(X, X_test, cols, y):
    """Fit a CatBoostEncoder on the training split and apply it to both splits."""
    ce = CatBoostEncoder(cols=cols)
    X = ce.fit_transform(X, y)
    X_test = ce.transform(X_test)
    return (X, X_test)

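# A minimal usage sketch (not from the source): cat_encode fits the encoder on
# the training split only, so the holdout never leaks into the target statistics.
import pandas as pd
from category_encoders.cat_boost import CatBoostEncoder

X_demo = pd.DataFrame({"grade": ["A", "B", "A", "C"]})
X_holdout = pd.DataFrame({"grade": ["B", "D"]})  # "D" is unseen at fit time
y_demo = pd.Series([1, 0, 1, 0])

X_enc, X_holdout_enc = cat_encode(X_demo, X_holdout, cols=["grade"], y=y_demo)
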
def main():
    mlflow.start_run(run_name=NAME)

    if "X_train.pkl" not in os.listdir():
        print("processing the data")
        X, y, encoder = preprocess_data("TOTAL_TRAIN.csv", process_cat=False)
        print(X.shape)
        with open(f"label_encoder_{NAME}.pkl", "wb") as f:
            pickle.dump(encoder, f)
        print(f"##################### The shape of X is {X.shape} #######################")
        y = y.astype("int")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.15, random_state=15, stratify=y)
        with open("X_train.pkl", "wb") as f:
            pickle.dump(X_train, f)
        with open("X_test.pkl", "wb") as f:
            pickle.dump(X_test, f)
        with open("y_train.pkl", "wb") as f:
            pickle.dump(y_train, f)
        with open("y_test.pkl", "wb") as f:
            pickle.dump(y_test, f)
        print(X_train.shape)
    else:
        with open("X_train.pkl", "rb") as f:
            X_train = pickle.load(f)
        with open("X_test.pkl", "rb") as f:
            X_test = pickle.load(f)
        with open("y_train.pkl", "rb") as f:
            y_train = pickle.load(f)
        with open("y_test.pkl", "rb") as f:
            y_test = pickle.load(f)
        with open("label_encoder_XGB1704.pkl", "rb") as f:
            encoder = pickle.load(f)

    print("######### fitting cat encoder ############")
    cols_cat = ["ruido", "CODIGO_POSTAL", "ZONA_METROPOLITANA", "CALIDAD_AIRE"]
    cols_float = [col for col in X_train.columns if col not in cols_cat]
    X_train[cols_float] = X_train[cols_float].astype("float")
    X_test[cols_float] = X_test[cols_float].astype("float")
    labs_names = [c for c in encoder.classes_]

    model = LGBMClassifier(
        class_weight="balanced",
        objective="multiclass",  # LightGBM's softmax multiclass objective
        n_jobs=-1,
        random_state=100,
        silent=True,
    )

    if MODE != "INDIVIDUAL":
        # Search space for the Bayesian hyperparameter optimisation.
        params = {
            "reg_alpha": (1e-3, 5.0, "log-uniform"),
            "reg_lambda": (1e-2, 50.0, "log-uniform"),
            "n_estimators": (600, 4500),
            "learning_rate": (5e-3, 1.0, "log-uniform"),
            "num_leaves": (20, 80),
            "boosting_type": ["gbdt", "goss"],
            "colsample_bytree": (0.1, 1.0, "uniform"),
            "subsample": (0.1, 1.0, "uniform"),
            "min_child_samples": (1, 25),
            "min_child_weight": (1e-6, 0.1, "log-uniform"),
        }
        print(params)

        cb = CatBoostEncoder(cols=cols_cat)
        X_train = cb.fit_transform(X_train, y_train)
        X_test = cb.transform(X_test)

        fit_params = {
            "eval_set": [(X_test, y_test)],
            "eval_metric": lgb_f1_score,
            "early_stopping_rounds": 300,
        }
        # Alternative encoder-plus-model pipeline; defined but not used by the search below.
        pipeline = Pipeline(steps=[("clas_encoder", CatBoostEncoder(cols=cols_cat)),
                                   ("model", model)])
        best_model = BayesSearchCV(
            model,
            params,
            n_iter=N_ITER,
            n_points=1,
            cv=cv,
            scoring=f2_scorer,
            random_state=100,
            optimizer_kwargs={"n_initial_points": 10},
            fit_params=fit_params,
        )

        def on_step(optim_result):
            # Persist intermediate CV results and stop early on a very high score.
            score = best_model.best_score_
            results = best_model.cv_results_
            try:
                results_df = pd.DataFrame(results)
                results_df.to_csv(f"results_{NAME}.csv", header=True, index=False)
                print(f"############ {results_df.shape[0]} trials so far #################")
                print(f"current cv results: {results_df}")
            except Exception:
                print("Unable to convert cv results to pandas dataframe")
            mlflow.log_metric("best_score", score)
            with open(f"./best_{NAME}_params.pkl", "wb") as f:
                pickle.dump(best_model.best_params_, f)
            print("best score: %s" % score)
            if score >= 0.98:
                print("Interrupting!")
                return True

    print("fitting the model")
    if MODE != "INDIVIDUAL":
        print(X_train.dtypes)
        best_model.fit(X_train, y_train, callback=[on_step])
        with open(f"./best_{NAME}_model.pkl", "wb") as f:
            pickle.dump(best_model, f)
        preds = best_model.predict(X_test)
    else:
        if NAME not in os.listdir():
            os.mkdir(NAME)
        cat_encoder = CatBoostEncoder(cols=cols_cat)
        X_train = cat_encoder.fit_transform(X_train, y_train)
        X_test = cat_encoder.transform(X_test)
        best_model = BalancedBaggingClassifier(
            base_estimator=HistGradientBoostingClassifier(
                max_iter=3000,
                random_state=42,
                learning_rate=0.1,
                max_leaf_nodes=54,
                min_samples_leaf=2,
                scoring=f2_scorer,
                validation_fraction=0.1,
                n_iter_no_change=50,
            ),
            n_estimators=5,
            random_state=42,
            n_jobs=-1,
            max_features=0.7,
            # Undersample class 5 to 11% of its original count in each bag.
            sampling_strategy={5: int(dict(Counter(y_train))[5] * 0.11)},
        )
        best_model.fit(X_train, y_train)
        preds = best_model.predict(X_test)

    print(
        f'F1 SCORE IS {f1_score(y_test, preds, average="macro")}, '
        f'precision is {precision_score(y_test, preds, average="macro")}, '
        f'recall is {recall_score(y_test, preds, average="macro")}, '
        f'accuracy is {accuracy_score(y_test, preds)}'
    )
    print(f"F2 SCORE IS {fbeta_score(y_test, preds, average='macro', beta=2)}")
    print(f"F05 SCORE IS {fbeta_score(y_test, preds, average='macro', beta=0.5)}")

    cm = confusion_matrix(y_test, preds)
    grafico_conf_matrix = print_confusion_matrix(cm, class_names=labs_names)
    grafico_conf_matrix.savefig(f"{NAME}/norm_NO_PIPELINE")
    with open(f"best_model_{NAME}.pkl", "wb") as f:
        pickle.dump(best_model, f)

    print("logging metrics")
    mlflow.log_metrics(
        metrics={
            "f1": f1_score(y_test, preds, average="macro"),
            "precision": precision_score(y_test, preds, average="macro"),
            "recall": recall_score(y_test, preds, average="macro"),
            "accuracy": accuracy_score(y_test, preds),
            "f05": fbeta_score(y_test, preds, beta=0.5, average="macro"),
            "f2": fbeta_score(y_test, preds, beta=2, average="macro"),
        })
    if MODE != "INDIVIDUAL":
        best_params = best_model.best_params_
        for param in best_params.keys():
            mlflow.log_param(param, best_params[param])

    cm = confusion_matrix(y_test, preds)
    grafico_conf_matrix = print_confusion_matrix(cm, class_names=labs_names)
    grafico_conf_matrix.savefig(NAME)
    grafico_norm = print_confusion_matrix(cm, class_names=labs_names, normalize=False)
    grafico_norm.savefig(f"{NAME}_no_norm")
    mlflow.end_run()

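# The script above references helpers it never defines (lgb_f1_score, f2_scorer,
# print_confusion_matrix). A minimal sketch of plausible scorer definitions,
# assuming scikit-learn/LightGBM conventions; the originals are not in the source.
import numpy as np
from sklearn.metrics import f1_score, fbeta_score, make_scorer

def lgb_f1_score(y_true, y_pred):
    # LightGBM passes multiclass predictions as a flat probability array;
    # reshape to (n_classes, n_samples) and take the argmax per sample.
    y_pred = y_pred.reshape(len(np.unique(y_true)), -1).argmax(axis=0)
    return "f1", f1_score(y_true, y_pred, average="macro"), True

# F2 scorer (recall-weighted F-beta), usable both by the Bayesian search and
# by HistGradientBoostingClassifier's early-stopping `scoring` parameter.
f2_scorer = make_scorer(fbeta_score, beta=2, average="macro")
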
# Replace missing and placeholder category values with a single "unknown" token.
for i in cat_item_indexes:
    temp = X[:, i]
    nan_indexes = pd.isnull(X[:, i])
    temp[nan_indexes] = "unknown"
    temp[temp == "0"] = "unknown"
    temp[temp == "Unknown"] = "unknown"
    X[:, i] = temp

    temp = X_test[:, i]
    nan_indexes = pd.isnull(X_test[:, i])
    temp[nan_indexes] = "unknown"
    temp[temp == "0"] = "unknown"
    temp[temp == "Unknown"] = "unknown"
    X_test[:, i] = temp

# Encode categorical data
print("Encoding data..")
encoder_t = CatBoostEncoder(cols=cat_item_indexes)
X = encoder_t.fit_transform(X, y)
X_test = encoder_t.transform(X_test)
X_test = X_test.astype(float)
X_test = X_test.iloc[:, :].values
X = X.astype(float)
X = X.iloc[:, :].values

# Scale data
print("Scaling..")
sc = RobustScaler()
X = sc.fit_transform(X)
X_test = sc.transform(X_test)

# Fit model - n_estimators, max_depth & min_samples_split at their current values
# will take a long time to run. Reducing them worsens the RMSE by a small margin,
# but testing will be a lot faster.
print("\nBeginning Gradient Boosting Regression.")
gbrReg = GradientBoostingRegressor(n_estimators=3000,
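
# The GradientBoostingRegressor call above is truncated in the source. A hedged
# completion sketch; every hyperparameter besides n_estimators is an assumption,
# not the original value.
gbrReg = GradientBoostingRegressor(
    n_estimators=3000,
    max_depth=6,            # assumed; the original value is not shown
    min_samples_split=10,   # assumed
    learning_rate=0.05,     # assumed
)
gbrReg.fit(X, y)            # assumes y is the training target defined earlier
preds = gbrReg.predict(X_test)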