def log_evaluated_results(
    eval_results,
    mlflow_tracking,
    fraction,
    n_splits,
    n_repeats,
):
    """Logs the evaluation artifacts of the chosen model to MLflow.

    Args:
        eval_results (list): [model_name, plots, scores, df_names, time_spent],
            where `plots` are plots generated via Pandas and `df_names` are
            short names of the datasets used
        mlflow_tracking (bool): whether to log to the MLflow tracking server
        fraction (float): fraction of the data used for modeling
        n_splits (int): number of CV splits
        n_repeats (int): number of CV repeats
    """
    if mlflow_tracking:
        print("Log artifacts for the evaluated model...")
        model_name = eval_results[0]
        plots = eval_results[1]
        scores = eval_results[2]
        df_names = eval_results[3]
        time_spent = eval_results[4]

        exp_id = mlflow_set_exp_id("Model:Choose")
        run_name = f"{model_name} : Best"
        with mlflow.start_run(experiment_id=exp_id, run_name=run_name):
            for i in range(2):
                fig = plots[i].get_figure()
                path = f"./plots/{model_name}_on_{df_names[i]}.png"
                mlflow.log_figure(fig, path)
            mlflow.log_params(
                {
                    "time_spent": time_spent,
                    "fraction": fraction,
                    "cv_n_splits": n_splits,
                    "cv_n_repeats": n_repeats,
                    "random_state": rnd_state,
                }
            )
            mlflow.log_metrics(
                {
                    "score_on_train": scores[0],
                    "score_on_val": scores[1],
                }
            )
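# NOTE: `mlflow_set_exp_id` is used throughout this module but defined elsewhere
# in the project. A minimal sketch of the assumed behaviour (get-or-create an
# MLflow experiment by name and return its ID); the real helper may differ:
def mlflow_set_exp_id(exp_name):
    """Hypothetical sketch: return the ID of the named experiment,
    creating the experiment first if it does not exist yet."""
    experiment = mlflow.get_experiment_by_name(exp_name)
    if experiment is None:
        return mlflow.create_experiment(exp_name)
    return experiment.experiment_id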
def trivial_fit(
    X_fit,
    y_fit,
    X_train,
    y_train,
    X_val,
    y_val,
    model,
    log_residuals,
):
    print("\n-------------- Trivial model training w/o any parameter search started...")
    model_name = type(model).__name__

    # Setup MLflow tracking server
    exp_id = mlflow_set_exp_id("Model:Fit")
    run_name = f"{model_name}-None"

    ## Enable autologging
    mlflow.sklearn.autolog(log_model_signatures=False)
    print(f"Autologging {model_name} started...")

    ##* Fit model with MLflow logging
    with mlflow.start_run(experiment_id=exp_id, run_name=run_name):
        tic = time.time()
        model.fit(X_fit, y_fit)
        mins, secs = divmod(time.time() - tic, 60)

        ## Disable autologging
        mlflow.sklearn.autolog(disable=True)

        # Log custom metrics and data
        print(f"Training took: {int(mins)}min {int(secs)}sec")
        print("Log custom metrics...")
        log_custom_metrics(model, X_train, y_train, X_val, y_val)
        if log_residuals:
            log_model_residuals(model, X_train, y_train, X_val, y_val)

    print(f"{model_name} model:")
    print_custom_metrics(model, X_train, y_train, X_val, y_val)

    winsound.Beep(frequency=2000, duration=300)

    return model
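# NOTE: `log_custom_metrics` and `print_custom_metrics` are project helpers
# defined elsewhere. A minimal sketch of what `log_custom_metrics` is assumed
# to do, reusing the `mare` metric from `train_model` below; the real helper
# may log more than these two scores:
def log_custom_metrics(model, X_train, y_train, X_val, y_val):
    """Hypothetical sketch: log the custom score on the Train and
    Validation sets to the active MLflow run."""
    mlflow.log_metrics(
        {
            "score_on_train": mare(y_train, model.predict(X_train)),
            "score_on_val": mare(y_val, model.predict(X_val)),
        }
    )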
def train_model(
    model,
    X_dev,
    y_dev,
    X_train,
    y_train,
    X_val,
    y_val,
    mlflow_tracking=True,
    log_residuals=True,
    save_mlmodel_separately=True,
):
    print("\nTrain the final model on the Development set...")
    tic = time.time()
    model_name = type(model).__name__

    if mlflow_tracking:
        # Setup MLflow tracking server
        exp_id = mlflow_set_exp_id("Model:Train")
        run_name = f"{model_name}"

        ## Enable autologging
        mlflow.sklearn.autolog()

        ##* Fit model with MLflow logging
        with mlflow.start_run(experiment_id=exp_id, run_name=run_name) as run:
            run_id = run.info.run_id
            print(f"Active run_id: {run_id} ...\n")
            model = model.fit(X_dev, y_dev)
            toc = time.time()

            ## Disable autologging
            mlflow.sklearn.autolog(disable=True)

            ##* Log custom metrics
            mare_on_dev = mare(y_dev, model.predict(X_dev))
            mare_on_train = mare(y_train, model.predict(X_train))
            mare_on_val = mare(y_val, model.predict(X_val))
            print(f"\nMARE on DEV: {mare_on_dev}")
            print(f"MARE on TRAIN: {mare_on_train}")
            print(f"MARE on VAL: {mare_on_val}")
            mlflow.log_metrics(
                {
                    "mare_on_dev": mare_on_dev,
                    "mare_on_train": mare_on_train,
                    "mare_on_val": mare_on_val,
                }
            )

            ##* Log custom plots
            if log_residuals:
                print("\nCalculate and log model's residuals...")
                fig = plot_residuals_errors(model, X_train, y_train, X_val, y_val)
                mlflow.log_figure(fig, "./plots/residuals_errors.png")
    else:
        ##* Fit without MLflow tracking
        model = model.fit(X_dev, y_dev)
        toc = time.time()
        exp_id, run_id = None, None

    ## Evaluate time spent
    mins, secs = divmod(toc - tic, 60)
    print(f"Model training took: {int(mins)}min {int(secs)}sec\n")

    ## Save trained pipeline
    if save_mlmodel_separately:
        folder = save_mlmodel_aside(model, run_id)
    else:
        print("The model was not saved separately...")
        folder = None

    print(f"\nExperiment ID: {exp_id}")
    print(f"Run ID: {run_id}")
    print(f"Folder: {folder}")

    return exp_id, run_id, folder
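# NOTE: the `mare` metric used above is defined elsewhere in the project.
# A plausible sketch consistent with its name (Mean Absolute Relative Error)
# and its use as an error score (lower is better); whether the real helper
# scales the result to percent is an assumption:
import numpy as np

def mare(y_true, y_pred):
    """Hypothetical sketch: mean absolute relative error, in percent."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.mean(np.abs(y_true - y_pred) / np.abs(y_true)) * 100)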
def choose_model(X, y, fraction, n_splits, n_repeats, n_jobs, mlflow_tracking):
    print("\nStart model selection...")

    # Define the dataset for modeling
    X_fit, y_fit = get_fractioned_data(X, y, fraction)

    # Get the list of basic models to be estimated
    basic_models = get_list_of_basic_models()

    # Create a dict for modeling results
    basic_results = {}

    # Define the custom scorer and the CV strategy (num. of splits and repeats)
    scorer, cv = set_custom_scorer_cv(n_splits, n_repeats)

    # Start MLflow tracking
    if mlflow_tracking:
        # Setup MLflow tracking server
        exp_id = mlflow_set_exp_id("Model:Choose")

    # Loop through the list of basic models
    for basic_model in basic_models:
        model_name = type(basic_model).__name__
        print(f"Modeling {model_name}...")

        # Fit each basic model via cross-validation
        tic = time.time()
        basic_model_scores = model_selection.cross_val_score(
            X=X_fit,
            y=y_fit,
            estimator=basic_model,
            scoring=scorer,
            cv=cv,
            n_jobs=n_jobs,  # -1 means using all processors
            verbose=0,  # The verbosity level, default=0
        )

        # Calculate time spent
        mins, secs = divmod(time.time() - tic, 60)
        time_spent = f"{int(mins)}min {int(secs)}sec"

        # Save results to the dict
        basic_results.update(
            {
                basic_model: {
                    "cv_score_mean": basic_model_scores.mean(),
                    "cv_score_std": basic_model_scores.std(),
                    "time_spent": time_spent,
                }
            }
        )

        ##* Log models with MLflow logging
        if mlflow_tracking:
            print(f"\tLogging {model_name} results to runs...")
            with mlflow.start_run(experiment_id=exp_id, run_name=model_name):
                mlflow.log_params(
                    {
                        "time_spent": time_spent,
                        "fraction": fraction,
                        "cv_n_splits": n_splits,
                        "cv_n_repeats": n_repeats,
                        "random_state": rnd_state,
                    }
                )
                mlflow.log_metrics(
                    {
                        "cv_score_mean": basic_model_scores.mean(),
                        "cv_score_std": basic_model_scores.std(),
                    }
                )

    # Sort the dict by score (ascending), then by std and time spent
    basic_results = dict(
        sorted(
            basic_results.items(),
            key=lambda x: (
                x[1]["cv_score_mean"],
                x[1]["cv_score_std"],
                x[1]["time_spent"],
            ),
        )
    )
    print()
    print("-------------- Models' rating --------------")
    pprint(basic_results, sort_dicts=False)

    # Pick the best model from the basic set
    chosen_model = list(basic_results.keys())[0]

    return basic_results, chosen_model
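# NOTE: `get_fractioned_data`, `get_list_of_basic_models`, and
# `set_custom_scorer_cv` are project helpers defined elsewhere. A minimal
# sketch of `get_fractioned_data` under the assumption that it subsamples
# the data to speed up model screening; the real helper may shuffle or
# stratify differently:
from sklearn.model_selection import train_test_split

def get_fractioned_data(X, y, fraction):
    """Hypothetical sketch: return a `fraction`-sized subsample of (X, y)."""
    if fraction >= 1.0:
        return X, y
    X_fit, _, y_fit, _ = train_test_split(
        X, y, train_size=fraction, random_state=rnd_state
    )
    return X_fit, y_fit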
def grid_search_cv(
    X_fit,
    y_fit,
    X_train,
    y_train,
    X_val,
    y_val,
    model,
    params_grid,
    scorer,
    cv,
    simple_grid_search,
    n_jobs,
    log_residuals,
):
    if simple_grid_search:
        print("\n-------------- Simple Grid SearchCV started...")
        pprint(f"Parameters' grid: {params_grid}")
        model_name = type(model).__name__

        # Setup MLflow tracking server
        exp_id = mlflow_set_exp_id("Model:Fit")
        run_name = f"{model_name}-grid"

        # Enable autologging
        mlflow.sklearn.autolog(log_model_signatures=False)

        # Define the SIMPLE grid search
        grid_search = model_selection.GridSearchCV(
            model,
            param_grid=params_grid,
            scoring=scorer,
            n_jobs=n_jobs,
            cv=cv,
            refit=True,
            return_train_score=True,
            verbose=3,
        )

        ##* Fit model with MLflow logging
        with mlflow.start_run(experiment_id=exp_id, run_name=run_name):
            tic = time.time()
            model_grid_search = grid_search.fit(X_fit, y_fit)
            mins, secs = divmod(time.time() - tic, 60)

            # Disable autologging
            mlflow.sklearn.autolog(disable=True)

            # Log custom metrics and data
            print(f"Simple grid search took: {int(mins)}min {int(secs)}sec")
            print("Log custom metrics...")
            log_custom_metrics(model_grid_search, X_train, y_train, X_val, y_val)
            if log_residuals:
                log_model_residuals(model_grid_search, X_train, y_train, X_val, y_val)

        print(f"Simple search: Best params are:\n {model_grid_search.best_params_}")
        print(f"{model_name}: Simple search:")
        print_custom_metrics(model_grid_search, X_train, y_train, X_val, y_val)

        winsound.Beep(frequency=2000, duration=300)

        return (
            model,
            model_grid_search.best_estimator_,
            model_grid_search.best_params_,
        )
    else:
        print("\nSkip the Simple Grid SearchCV...")
        return model, None, None
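# Example call (hypothetical names and values): a small grid for a
# RandomForestRegressor, reusing the scorer/CV pair from `set_custom_scorer_cv`:
#
#   from sklearn.ensemble import RandomForestRegressor
#
#   scorer, cv = set_custom_scorer_cv(n_splits=5, n_repeats=2)
#   params_grid = {"n_estimators": [100, 300], "max_depth": [None, 10]}
#   model, best_estimator, best_params = grid_search_cv(
#       X_fit, y_fit, X_train, y_train, X_val, y_val,
#       RandomForestRegressor(random_state=rnd_state),
#       params_grid, scorer, cv,
#       simple_grid_search=True, n_jobs=-1, log_residuals=True,
#   )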
def bayesian_search_cv(
    X_fit,
    y_fit,
    X_train,
    y_train,
    X_val,
    y_val,
    model,
    bayes_space,
    scorer,
    cv,
    n_jobs,
    bayesian_search_params,
    log_residuals,
):
    if bayesian_search_params[0]:
        print("\n-------------- Bayesian optimization of hyper-params started...")
        pprint(f"Parameters' space: {bayes_space}")
        model_name = type(model).__name__

        # Setup MLflow tracking server
        exp_id = mlflow_set_exp_id("Model:Fit")
        run_name = f"{model_name}-bayes"

        ## Enable autologging
        mlflow.sklearn.autolog(log_model_signatures=False)

        # Define the Bayesian search
        bayes_search = BayesSearchCV(
            model,
            search_spaces=bayes_space,
            n_iter=bayesian_search_params[1],  # default 50
            scoring=scorer,
            n_jobs=n_jobs,
            cv=cv,
            refit=True,
            return_train_score=True,
            verbose=3,
            random_state=rnd_state,
        )

        # Callback handler
        def on_step(optim_result):
            """Print the current best score after each iteration and
            interrupt the optimization once it reaches the threshold."""
            score = bayes_search.best_score_
            print(f"...current best score: {score}")
            if score <= 2:
                print("Interrupting!")
                return True

        ##* Fit model with MLflow logging
        with mlflow.start_run(experiment_id=exp_id, run_name=run_name):
            tic = time.time()
            model_bayes_search = bayes_search.fit(X_fit, y_fit, callback=on_step)
            mins, secs = divmod(time.time() - tic, 60)

            ## Disable autologging
            mlflow.sklearn.autolog(disable=True)

            # Log custom metrics and data
            print(f"Bayesian search took: {int(mins)}min {int(secs)}sec")
            print("Log custom metrics...")
            log_custom_metrics(model_bayes_search, X_train, y_train, X_val, y_val)
            if log_residuals:
                log_model_residuals(model_bayes_search, X_train, y_train, X_val, y_val)

        print(f"Bayesian search: Best params are:\n {model_bayes_search.best_params_}")
        print(f"{model_name}: Bayesian search:")
        print_custom_metrics(model_bayes_search, X_train, y_train, X_val, y_val)

        winsound.Beep(frequency=2000, duration=300)

        return (
            model,
            model_bayes_search.best_estimator_,
            model_bayes_search.best_params_,
        )
    else:
        print("\nSkip the Bayesian optimization...")
        return model, None, None
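# Example search space (hypothetical values): `BayesSearchCV` accepts skopt
# dimensions keyed by estimator parameter name; `bayesian_search_params` is
# a (run_flag, n_iter) pair:
#
#   from skopt.space import Integer, Real
#
#   bayes_space = {
#       "n_estimators": Integer(100, 1000),
#       "max_depth": Integer(3, 15),
#       "learning_rate": Real(1e-3, 0.3, prior="log-uniform"),
#   }
#   model, best_estimator, best_params = bayesian_search_cv(
#       X_fit, y_fit, X_train, y_train, X_val, y_val,
#       model, bayes_space, scorer, cv, n_jobs=-1,
#       bayesian_search_params=(True, 50), log_residuals=True,
#   )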
def randomized_search_cv(
    X_fit,
    y_fit,
    X_train,
    y_train,
    X_val,
    y_val,
    model,
    params_dist,
    scorer,
    cv,
    n_jobs,
    random_search_params,
    log_residuals,
):
    if random_search_params[0]:
        print("\n-------------- Randomized Grid SearchCV started...")
        pprint(f"Parameters' distributions: {params_dist}")
        model_name = type(model).__name__

        # Setup MLflow tracking server
        exp_id = mlflow_set_exp_id("Model:Fit")
        run_name = f"{model_name}-rand"

        ## Enable autologging
        mlflow.sklearn.autolog(log_model_signatures=False)
        print(f"Autologging {model_name} started...")

        # Define the RANDOMIZED grid search
        random_search = model_selection.RandomizedSearchCV(
            model,
            param_distributions=params_dist,
            n_iter=random_search_params[1],  # default 10
            scoring=scorer,
            n_jobs=n_jobs,
            cv=cv,
            refit=True,
            return_train_score=True,
            verbose=3,
            random_state=rnd_state,
        )

        ##* Fit model with MLflow logging
        with mlflow.start_run(experiment_id=exp_id, run_name=run_name):
            tic = time.time()
            model_random_search = random_search.fit(X_fit, y_fit)
            mins, secs = divmod(time.time() - tic, 60)

            ## Disable autologging
            mlflow.sklearn.autolog(disable=True)

            # Log custom metrics and data
            print(f"Randomized grid search took: {int(mins)}min {int(secs)}sec")
            print("Log custom metrics...")
            log_custom_metrics(model_random_search, X_train, y_train, X_val, y_val)
            if log_residuals:
                log_model_residuals(model_random_search, X_train, y_train, X_val, y_val)

        print(f"Randomized search: Best params are:\n {model_random_search.best_params_}")
        print(f"{model_name}: Random search:")
        print_custom_metrics(model_random_search, X_train, y_train, X_val, y_val)

        winsound.Beep(frequency=2000, duration=300)

        return (
            model,
            model_random_search.best_estimator_,
            model_random_search.best_params_,
        )
    else:
        print("\nSkip the Randomized Grid SearchCV...")
        return model, None, None
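# Example distributions (hypothetical values): `RandomizedSearchCV` accepts
# scipy.stats distributions or value lists; `random_search_params` is a
# (run_flag, n_iter) pair:
#
#   from scipy.stats import randint, uniform
#
#   params_dist = {
#       "n_estimators": randint(100, 1000),  # integers in [100, 1000)
#       "max_features": uniform(0.3, 0.7),   # floats in [0.3, 1.0]
#   }
#   model, best_estimator, best_params = randomized_search_cv(
#       X_fit, y_fit, X_train, y_train, X_val, y_val,
#       model, params_dist, scorer, cv, n_jobs=-1,
#       random_search_params=(True, 20), log_residuals=True,
#   )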