def training(grid_param, X_train, X_test, y_train, y_test):
    # Create cluster/client
    cluster = make_cluster()
    client = Client(cluster)

    # Construct Dask DataFrames
    X_train = dd.from_pandas(X_train, npartitions=4)
    y_train = dd.from_pandas(y_train, npartitions=4)
    X_test = dd.from_pandas(X_test, npartitions=4)
    y_test = dd.from_pandas(y_test, npartitions=4)

    estimator = RandomForestRegressor()

    # Train model
    train_time = time.time()
    grid_search = GridSearchCV_dask(estimator, grid_param, cv=2, n_jobs=-1)
    with joblib.parallel_backend("dask", scatter=[X_train, y_train]):
        grid_search.fit(X_train, y_train)
        grid_search.score(X_test, y_test)
    train_time = time.time() - train_time

    # Predictions
    acc_r2 = grid_search.best_estimator_.score(X_test, y_test)
    acc_mse = mean_squared_error(y_test, grid_search.best_estimator_.predict(X_test))
    return acc_r2, acc_mse, train_time
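# --- Usage sketch (not part of the original source). `make_cluster` is not
# defined in the snippet above; here it is assumed to wrap a single-machine
# Dask cluster, and `training` is assumed to receive pandas objects.
from dask.distributed import Client, LocalCluster


def make_cluster(n_workers=4):
    # Hypothetical helper: a local cluster; swap in a distributed deployment
    # (e.g. dask-jobqueue, dask-kubernetes) as needed.
    return LocalCluster(n_workers=n_workers, threads_per_worker=1)

# r2, mse, seconds = training({"n_estimators": [50, 100]},
#                             X_train, X_test, y_train, y_test)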
def train_evaluate(job_dir, training_dataset_path, search_space, scoring_measure):
    """Runs the training pipeline."""
    # Load and prepare training data
    df_train = pd.read_csv(training_dataset_path)
    num_features_type_map = {
        feature: 'float64' for feature in df_train.columns[NUMERIC_FEATURE_INDEXES]
    }
    df_train = df_train.astype(num_features_type_map)
    X_train = df_train.drop('Cover_Type', axis=1)
    y_train = df_train['Cover_Type']

    # Define the training pipeline
    preprocessor = ColumnTransformer(transformers=[
        ('num', StandardScaler(), NUMERIC_FEATURE_INDEXES),
        ('cat', OneHotEncoder(), CATEGORICAL_FEATURE_INDEXES)
    ])
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', SGDClassifier())
    ])

    # Configure hyperparameter tuning
    grid = GridSearchCV(pipeline, cv=5, param_grid=search_space,
                        scoring=scoring_measure)

    # Start training
    grid.fit(X_train, y_train)

    return grid
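# --- Usage sketch (not part of the original source). The search-space keys
# follow the 'classifier' step name above; NUMERIC_FEATURE_INDEXES,
# CATEGORICAL_FEATURE_INDEXES and the dataset path are assumed to be defined
# by the surrounding project, and the values below are illustrative only.
search_space = {
    'classifier__alpha': [1e-4, 1e-3, 1e-2],
    'classifier__max_iter': [500, 1000],
}
# grid = train_evaluate(job_dir='/tmp/job',               # hypothetical job dir
#                       training_dataset_path='train.csv',  # hypothetical path
#                       search_space=search_space,
#                       scoring_measure='accuracy')
# print(grid.best_params_, grid.best_score_)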
def RandomForestDask(param_grid, X_train, X_test, y_train, y_test):
    cluster = make_cluster()
    client = Client(cluster)

    # Preprocess data into Dask DataFrames
    dask_X_train = dd.from_pandas(X_train, npartitions=3)
    dask_y_train = dd.from_pandas(y_train, npartitions=3)
    dask_X_test = dd.from_pandas(X_test, npartitions=3)
    dask_y_test = dd.from_pandas(y_test, npartitions=3)

    estimator = RandomForestRegressor()
    grid_search_dask = GridSearchCV_dask(estimator, param_grid, cv=2, n_jobs=-1)
    with joblib.parallel_backend("dask", scatter=[dask_X_train, dask_y_train]):
        grid_search_dask.fit(dask_X_train, dask_y_train)
        grid_search_dask.score(dask_X_test, dask_y_test)

    r_2 = grid_search_dask.best_estimator_.score(dask_X_test, dask_y_test)
    mse = mean_squared_error(y_test, grid_search_dask.best_estimator_.predict(X_test))
    return r_2, mse
def run(self):
    self.load_data()
    self.split_data()

    # nulls = X_train.isnull().sum()
    # total_nulls = nulls.sum()
    # if total_nulls > 0:
    #     with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    #         print(nulls[nulls > 0], "total: ", total_nulls)

    reg = self._create_model()
    gs = GridSearchCV(reg, self.model_desc["params_grid"],
                      cv=self.model_desc["num_folds"], n_jobs=-1, refit=False)

    start_time = time.monotonic()
    with joblib.parallel_backend("dask", scatter=[self.X_train, self.y_train]):
        gs.fit(self.X_train, self.y_train)
    finish_time = time.monotonic()
    gs_time = finish_time - start_time

    print("Searching for parameters for [{}]".format(self.model_desc["algorithm_name"]))
    print("GridSearchCV time: {}".format(gs_time))

    cv_res, best_params, gs_score = _get_best_params_score(gs)
    print("GridSearchCV score: {}".format(gs_score))
    print("Best params: {}".format(best_params))

    start_time = time.monotonic()
    regr_best = self._create_model(**best_params).fit(self.X_train, self.y_train)
    finish_time = time.monotonic()
    test_time = finish_time - start_time
    print("Final training time: {}".format(test_time))

    test_score = float(regr_best.score(self.X_test, self.y_test))
    print("Test score: {}".format(test_score))

    ans = {
        "algorithm_name": self.model_desc["algorithm_name"],
        "gs_time": gs_time,
        "gs_score": gs_score,
        "test_time": test_time,
        "test_score": test_score
    }
    if "output_path" in self.model_desc:
        with open(self.model_desc["output_path"], "w") as fp:
            json.dump(ans, fp)
    return ans
def get_pipe(self):
    if self.inner_cv is None:
        inner_cv = RepeatedKFold(n_splits=self.cv_splits,
                                 n_repeats=self.cv_repeats, random_state=0)
    else:
        inner_cv = self.inner_cv

    gridpoints = self.gridpoints
    transformer_list = [None_T(), Log_T(), LogP1_T()]
    steps = [
        # retain a subset of the best original variables
        ('shrink_k1', ShrinkBigKTransformer(selector=LassoLarsCV(cv=inner_cv, max_iter=32))),
        # create interactions among them
        ('polyfeat', PolynomialFeatures(interaction_only=False, degree=2)),
        ('drop_constant', DropConst()),
        # pick from all of those options
        ('shrink_k2', ShrinkBigKTransformer(selector=LassoLarsCV(cv=inner_cv, max_iter=64))),
        ('reg', LinearRegression())
    ]
    if self.bestT:
        steps.insert(0, ('xtransform', ColumnBestTransformer(float_k=len(self.float_idx))))

    X_T_pipe = Pipeline(steps=steps)
    Y_T_X_T_pipe = Pipeline(steps=[('ttr', TransformedTargetRegressor(regressor=X_T_pipe))])
    Y_T__param_grid = {
        'ttr__transformer': transformer_list,
        'ttr__regressor__polyfeat__degree': [2],
    }
    outerpipe = GridSearchCV(Y_T_X_T_pipe, param_grid=Y_T__param_grid, cv=inner_cv)

    if self.do_prep:
        steps = [('prep', MissingValHandler(prep_dict=self.prep_dict)),
                 ('post', outerpipe)]
        outerpipe = Pipeline(steps=steps)

    return outerpipe
def grid_search_hyperparams(self, model, data, feature_names, hyperparam_grid, n_leave_out=1):
    cv_splits = LeavePGroupsOut(n_groups=n_leave_out).split(
        data[feature_names],
        np.ravel(data[self.target]),
        groups=data['group_id'])
    gs_models = GridSearchCV(model, hyperparam_grid, cv=cv_splits,
                             scoring=self.metric, n_jobs=-1)
    gs_models.fit(data[feature_names], np.ravel(data[self.target]))
    return gs_models.best_params_
def run(self):
    print("Running dataset: " + self.name)
    for exp_param in self.param_set:
        param = common_param.copy()
        param.update(exp_param["xgb_params"])
        cv_param = exp_param["search parameter"]
        print("Running cv grid for: " + str(param))
        grid_param = cv_param

        # param["reg_alpha"] = 0.001
        # model = xgb.XGBRegressor(**param)
        # model.fit(X, y)
        # print(model.get_booster().get_dump()[0])
        # exit(0)

        model = (xgb.XGBRegressor(**param) if self.objective == "reg:linear"
                 else xgb.XGBClassifier(**param))
        clf = GridSearchCV(model, grid_param, cv=5, n_jobs=-1)
        # with parallel_backend('dask'):
        #     clf.fit(X_train, y_train)
        clf.fit(self.X_train, self.y_train)

        param.update(clf.best_params_)
        model = (xgb.XGBRegressor(**param) if self.objective == "reg:linear"
                 else xgb.XGBClassifier(**param))
        model.fit(self.X_train, self.y_train)
        pred = model.predict(self.X_test)

        if isinstance(model, xgb.XGBRegressor):
            score = np.sqrt(metrics.mean_squared_error(self.y_test, pred))
        else:
            score = 1.0 - metrics.accuracy_score(self.y_test, pred)

        df_cv_results.at[exp_param['name'], (self.name, self.metric)] = "{0:.4g}".format(score)
        best_param_value = list(clf.best_params_.values())[0]
        best_param_string = ("{0:.4g}".format(best_param_value)
                             if best_param_value > 0.1 or best_param_value == 0.0
                             else "{0:.4e}".format(best_param_value))
        df_cv_results.at[exp_param['name'], (self.name, "param")] = best_param_string
        print(df_cv_results.to_latex())
def do_gbm_tuning(X, y, model, grid, n_cores, random_state=123,
                  save_path=None, verbose=True):
    start_time = timeit.default_timer()
    o_print('Finding best params with 10-fold CV', verbose)

    rf = DaskGridSearchCV(
        GradientBoostingClassifier(random_state=random_state),
        param_grid=grid, n_jobs=n_cores, cv=10)
    rf.fit(X, y)

    means = rf.cv_results_['mean_test_score']
    stds = rf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, rf.cv_results_['params']):
        o_print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params), verbose)

    elapsed_time = timeit.default_timer() - start_time
    o_print('CV time: ' + str(elapsed_time), verbose)
    o_print('', verbose)

    best_params = rf.best_params_
    best_params.update({'score': rf.best_score_})
    forest_performance = {'score': rf.best_score_, 'fitting_time': elapsed_time}

    save_tuning_results(save_path, random_state, best_params,
                        forest_performance, model=model)
    return (best_params, forest_performance)
def do_ada_tuning(X, y, model, grid, n_cores, random_state=123, save_path=None):
    start_time = timeit.default_timer()
    print('Finding best params with 10-fold CV')

    rf = DaskGridSearchCV(AdaBoostClassifier(random_state=random_state),
                          param_grid=grid, n_jobs=n_cores, cv=10)
    rf.fit(X, y)

    means = rf.cv_results_['mean_test_score']
    stds = rf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, rf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

    elapsed_time = timeit.default_timer() - start_time
    print('CV time: ' + str(elapsed_time))
    print()

    best_params = rf.best_params_
    best_params.update({'score': rf.best_score_})
    forest_performance = {'score': rf.best_score_, 'fitting_time': elapsed_time}

    print(best_params)
    best_params.update({'base_estimator': str(best_params['base_estimator'])})
    save_tuning_results(save_path, random_state, best_params,
                        forest_performance, model=model)
    return (best_params, forest_performance)
def __fit_and_score(x_train, y_train, x_test, y_test, clf, cv_repeat=None,
                    grid_search_k=None, param_grid=None, grid_scorer=None,
                    scorers=None):
    """Fit and score a classifier through cross-validated grid search
    over a parameter grid.
    """
    # copy classifier
    clf = clone(clf)

    # grid search
    grid_search_clf = GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        scoring=scorers,
        cv=grid_search_k,
        refit=grid_scorer,
    )
    grid_search_clf = grid_search_clf.fit(x_train, y_train)

    # save to results
    result = {}
    result['gridsearch'] = grid_search_clf.cv_results_
    result['best_params'] = grid_search_clf.best_params_
    result['scores'] = _multimetric_score(grid_search_clf, x_test, y_test, scorers)
    result['cv_repeat'] = cv_repeat
    result['clf'] = clf.__class__.__name__
    return result
def _fit_and_score(self, X, y, train, test):
    X_train = X.iloc[train]
    X_test = X.iloc[test]
    y_train = y.iloc[train]
    y_test = y.iloc[test]

    scores_by_params = dict.fromkeys(self.strategy_grid.keys())
    for i, (strategy, strategy_values) in enumerate(self.strategy_grid.items()):
        print(f'{strategy} ...')

        # inner-cv model selection
        gscv = DaskGridSearchCV(self.pipeline, param_grid=strategy_values,
                                scoring=self.inner_scoring, cv=self.inner_cv,
                                refit=True)
        gscv.fit(X_train, y_train)

        # outer-cv model evaluation
        scores_by_params[strategy] = self._score(gscv, X_test, y_test)

    return scores_by_params
def subcount_forecast(data, feature):
    """
    Creates a new column that is the predicted value of the input feature.
    Essentially an abstraction for 'prediction_forecasts'.

    :param data: a pandas dataframe where each row is an hour
    :param feature: a String containing the feature that should be forecasted
        (one of: casual, registered)
    :return: a pandas dataframe containing the new column
    """
    var_name = feature + "_forecast"
    print("\tAdding {} variable...".format(var_name))

    df = dd.get_dummies(data.copy().drop("cnt", axis=1))
    to_predict = dd.read_csv(PATH)[feature]
    df[feature] = to_predict
    train = get_train(df)

    model = RandomForestRegressor(random_state=SEED)
    model_params = {"n_estimators": list(range(10, 110, 10))}
    # tscv = TimeSeriesSplit(n_splits=5)
    grid_search = GridSearchCV(estimator=model, param_grid=model_params,
                               scoring="r2", cv=None, refit=True)
    grid_search.fit(train.drop(feature, axis=1), train[feature])
    print("\t\tPredictions for GridSearchCV on {}: {:.5f} +/- {:.5f}".format(
        feature, grid_search.best_score_,
        grid_search.cv_results_["std_test_score"][da.argmax(
            grid_search.cv_results_["mean_test_score"])]))

    data[var_name] = grid_search.best_estimator_.predict(
        dd.get_dummies(data.drop("cnt", axis=1)))
    return data
def search(model, X, y, params, method="randomized", n_iter=30, cv=5, **kwargs):
    """Run a cross-validated search for hyperparameters."""
    if method.lower() == "randomized":
        search = RandomizedSearchCV(model, param_distributions=params,
                                    n_iter=n_iter, cv=cv)
    elif method.lower() == "grid":
        search = GridSearchCV(model, param_grid=params, cv=cv)
    elif method.lower() == "bayes":
        search = BayesSearchCV(model, search_spaces=params, n_iter=n_iter, cv=cv)
    else:
        message = ("'method' must be either 'randomized', 'grid' or 'bayes'."
                   " Got method='{}'".format(method))
        LOGGER.error(message)
        raise ValueError(message)

    method_name = method.capitalize() + "SearchCV"
    LOGGER.info("Beginning " + method_name)

    when_started = time()
    progress(search.fit(X, y))
    total_time = time() - when_started

    n_settings = len(search.cv_results_['params'])
    LOGGER.warn(
        "{} took {:.2f} seconds for {} candidate parameter settings.".format(
            method_name, total_time, n_settings))
    return search
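# --- Usage sketch (not part of the original source). Assumes a configured
# LOGGER and, for the dask `progress` call, an active distributed Client;
# the estimator and grid below are illustrative only.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_demo, y_demo = make_classification(n_samples=500, n_features=10, random_state=0)
demo_grid = {"n_estimators": [50, 100], "max_depth": [3, None]}
# fitted = search(RandomForestClassifier(random_state=0), X_demo, y_demo,
#                 params=demo_grid, method="grid", cv=3)
# print(fitted.best_params_)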
def test_gridsearch():
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    grid = {"logisticregression__C": [1000, 100, 10, 2]}
    pipe = make_pipeline(DoNothingTransformer(), LogisticRegression())
    search = GridSearchCV(pipe, grid, cv=3)
    search.fit(X, y)
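# --- Supporting sketch (not part of the original source). The test above
# references DoNothingTransformer without defining it; a minimal pass-through
# transformer like the following is assumed.
from sklearn.base import BaseEstimator, TransformerMixin


class DoNothingTransformer(BaseEstimator, TransformerMixin):
    """Identity transformer used only to exercise the pipeline machinery."""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X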
def rbf_svr_tuning(c=[0.001, 0.01, 0.1, 1, 10], gamma=[0.001, 0.01, 0.1, 1, 10],
                   k=5, train_data_path='../data/training_data.csv',
                   save_model=False, tracking_uri="http://0.0.0.0:5000"):
    # Log the parameters with mlflow
    mlflow.log_param("c", c)
    mlflow.set_tag("k", k)

    # Set random seed for reproducibility
    np.random.seed(RANDOM_SEED)
    random.seed(RANDOM_SEED)

    # Get data shuffled and split into training and test sets
    mdr = MiningDataReader(path=train_data_path)
    (variable_names, X_train, X_test, y_train, y_test) = mdr.get_splitted_data()

    pipeline = Pipeline(steps=[('scaling', StandardScaler()),
                               ('regression', SVR(kernel='rbf'))])

    ### TRAINING ###
    ################

    # Generate grid search for hyperparam tuning
    hyperparams = {}
    hyperparams['regression__C'] = c
    hyperparams['regression__gamma'] = gamma

    print("Training started...\n")

    # Create an instance of the SVR pipeline and fit the data for the grid
    # parameters using all processors
    modelCV = GridSearchCV(estimator=pipeline, param_grid=hyperparams, cv=k,
                           scoring='neg_mean_squared_error', n_jobs=-1)
    with ProgressBar():
        modelCV.fit(X_train, y_train)

    # Iterate over the results storing training error for each hyperparameter combination
    results = modelCV.cv_results_
    param_list, training_err_list, training_dev_list = [], [], []
    for i in range(len(results['params'])):
        param = results['params'][i]
        score = (-1) * results['mean_test_score'][i]  # negative MSE
        std = results['std_test_score'][i]
        param_list.append(param)
        training_err_list.append(score)
        training_dev_list.append(std)

    print(f"\nBest parameter set found for the training set:\n{modelCV.best_params_}")

    # Store the index of the best combination
    best_index = param_list.index(modelCV.best_params_)

    # Get the best values for hyperparams
    best_c = modelCV.best_params_['regression__C']
    best_gamma = modelCV.best_params_['regression__gamma']

    print("\nTraining finished. Evaluating model...\n")

    ### EVALUATION ###
    ##################

    # Criteria is C
    criteria = 'c'
    mlflow.set_tag("criteria", criteria)
    param_values = c

    # Predict test data varying the criteria param and evaluate the models
    training_err_by_criteria, training_dev_by_criteria, test_err_list = [], [], []
    rmse_score, mae_score, r2_score = -1, -1, -1
    feature_names, feature_importances = [], []
    for param_value in tqdm(param_values):
        model = Pipeline(steps=[('scaler', StandardScaler()),
                                ('regression', SVR(C=param_value,
                                                   gamma=best_gamma,
                                                   kernel='rbf'))])
        param = {'regression__C': param_value, 'regression__gamma': best_gamma}

        # Fit model and evaluate results
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        index = param_list.index(param)
        training_err = training_err_list[index]
        training_dev = training_dev_list[index]
        (training_mse, test_mse, rmse, mae, r2) = get_test_metrics(training_err, y_test, prediction)
        # Store metrics
        training_err_by_criteria.append(training_mse)
        training_dev_by_criteria.append(training_dev)
        test_err_list.append(test_mse)
        # Set additional metrics for the best combination
        if index == best_index:
            rmse_score = rmse
            mae_score = mae
            r2_score = r2

    # Generate the plots
    empty_img_folder()
    plot_errors(criteria, param_values, training_err_by_criteria,
                training_dev_by_criteria, test_err_list)

    # Once hyperparameters are selected, train and save the best model
    if save_model:
        print("\nEvaluation finished. Training final model with train + test data "
              "with the best hyperparameters...")
        final_model = Pipeline(steps=[('scaler', StandardScaler()),
                                      ('regression', SVR(
                                          C=param_list[best_index]['regression__C'],
                                          gamma=best_gamma,
                                          kernel='rbf'))])

        # Train the best model with all the data (training + test)
        full_X = np.vstack((X_train, X_test))
        full_y = np.concatenate((y_train, y_test))
        final_model.fit(full_X, full_y)

        # Log plots and model with mlflow
        mlflow.log_artifacts('./img')
        mlflow.sklearn.log_model(final_model, 'model')

    # Log results with mlflow
    mlflow.log_metric("train_mse", training_err_list[best_index])
    mlflow.log_metric("test_mse", min(test_err_list))
    mlflow.log_metric("rmse", rmse_score)
    mlflow.log_metric("mae", mae_score)
    mlflow.log_metric("r2", r2_score)
    mlflow.set_tag("best_params", param_list[best_index])

    # Output the results
    print(f'''
-----------------------------------------------------------------------------------------------------------------------
RESULTS
-----------------------------------------------------------------------------------------------------------------------
Best params: {param_list[best_index]}
Training MSE: {training_err_list[best_index]}
Test MSE: {min(test_err_list)}
RMSE: {rmse_score}
MAE: {mae_score}
R2: {r2_score}
-----------------------------------------------------------------------------------------------------------------------
''')
with ProgressBar():
    parallel_nb.fit(X_train, y_train, classes=np.unique(y_train.compute()))

print('\n\nNaive Bayes Classifier Score : ', parallel_nb.score(X_test, y_test))

##### OUTPUT --------> Naive Bayes Classifier Score : 0.65

######################################################################################
# Performing GridSearch on the Logistic Regression Classifier
from dask_ml.model_selection import GridSearchCV

parameters = {'penalty': ['l1', 'l2'], 'C': [0.5, 1, 2]}
lr = LogisticRegression()
tuned_lr = GridSearchCV(lr, parameters)

with ProgressBar():
    tuned_lr.fit(X_train, y_train)

print('\n\nGrid Search Results for Logistic Regression')
print(pd.DataFrame(tuned_lr.cv_results_)[['params', 'mean_test_score']])

#### OUTPUT
#### Grid Search Results for Logistic Regression
####                         params  mean_test_score
#### 0  {'C': 0.5, 'penalty': 'l1'}         0.700778
#### 1  {'C': 0.5, 'penalty': 'l2'}         0.700306
#### 2    {'C': 1, 'penalty': 'l1'}         0.700806
#### 3    {'C': 1, 'penalty': 'l2'}         0.700500
#### 4    {'C': 2, 'penalty': 'l1'}         0.700972
def cart_tuning(max_depth=None, min_samples_leaf=[1, 2, 1],
                min_samples_split=[2, 3, 1], k=5,
                train_data_path='../data/training_data.csv',
                save_model=False, tracking_uri="http://0.0.0.0:5000"):
    # Log the parameters with mlflow
    mlflow.log_param('max_depth', max_depth)
    mlflow.log_param('min_samples_leaf', min_samples_leaf)
    mlflow.log_param('min_samples_split', min_samples_split)
    mlflow.set_tag("k", k)

    # Set random seed for reproducibility
    np.random.seed(RANDOM_SEED)
    random.seed(RANDOM_SEED)

    # Get data shuffled and split into training and test sets
    mdr = MiningDataReader(path=train_data_path)
    (variable_names, X_train, X_test, y_train, y_test) = mdr.get_splitted_data()

    pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('regression', DecisionTreeRegressor(random_state=RANDOM_SEED))
    ])

    ### TRAINING ###
    ################

    # Generate grid search for hyperparam tuning
    hyperparams = {}
    hyperparams['regression__max_depth'] = [None] if max_depth is None else \
        np.arange(max_depth[0], max_depth[1], max_depth[2])
    hyperparams['regression__min_samples_leaf'] = np.arange(
        min_samples_leaf[0], min_samples_leaf[1], min_samples_leaf[2])
    hyperparams['regression__min_samples_split'] = np.arange(
        min_samples_split[0], min_samples_split[1], min_samples_split[2])

    print("Training started...\n")

    # Create an instance of Decision Tree Regressor and fit the data for the
    # grid parameters using all processors
    modelCV = GridSearchCV(estimator=pipeline, param_grid=hyperparams, cv=k,
                           scoring='neg_mean_squared_error', n_jobs=-1)
    with ProgressBar():
        modelCV.fit(X_train, y_train)

    # Iterate over the results storing training error for each hyperparameter combination
    results = modelCV.cv_results_
    param_list, training_err_list, training_dev_list = [], [], []
    for i in range(len(results['params'])):
        param = results['params'][i]
        score = (-1) * results['mean_test_score'][i]  # negative MSE
        std = results['std_test_score'][i]
        param_list.append(param)
        training_err_list.append(score)
        training_dev_list.append(std)

    print(f"\nBest parameter set found for the training set:\n{modelCV.best_params_}")

    # Store the index of the best combination
    best_index = param_list.index(modelCV.best_params_)

    # Get the best values for hyperparams
    best_depth = modelCV.best_params_['regression__max_depth']
    best_samples_leaf = modelCV.best_params_['regression__min_samples_leaf']
    best_samples_split = modelCV.best_params_['regression__min_samples_split']

    print("\nTraining finished. Evaluating model...\n")

    ### EVALUATION ###
    ##################

    # Select the hyperparam with the most values as the criteria for the study and
    # calculate the test error with the best values obtained for the other
    # hyperparameters, so the individual effect of this parameter can be studied
    criteria = [('max_depth', len(hyperparams['regression__max_depth'])),
                ('min_samples_leaf', len(hyperparams['regression__min_samples_leaf'])),
                ('min_samples_split', len(hyperparams['regression__min_samples_split']))]
    criteria = sorted(criteria, key=lambda x: x[1], reverse=True)[0][0]
    mlflow.set_tag("criteria", criteria)

    param_values = []
    if criteria == 'max_depth':
        if max_depth is None:
            param_values = [None]
        else:
            param_values = range(max_depth[0], max_depth[1], max_depth[2])
    elif criteria == 'min_samples_leaf':
        param_values = range(min_samples_leaf[0], min_samples_leaf[1], min_samples_leaf[2])
    else:
        param_values = range(min_samples_split[0], min_samples_split[1], min_samples_split[2])

    # Predict test data varying the criteria param and evaluate the models
    training_err_by_criteria, training_dev_by_criteria, test_err_list = [], [], []
    rmse_score, mae_score, r2_score = -1, -1, -1
    feature_names, feature_importances = [], []
    for param_value in tqdm(param_values):
        if criteria == 'max_depth':
            model = Pipeline(steps=[
                ('scaler', StandardScaler()),
                ('regression', DecisionTreeRegressor(
                    max_depth=param_value,
                    min_samples_leaf=best_samples_leaf,
                    min_samples_split=best_samples_split,
                    random_state=RANDOM_SEED))
            ])
            param = {
                'regression__max_depth': param_value,
                'regression__min_samples_leaf': best_samples_leaf,
                'regression__min_samples_split': best_samples_split
            }
        elif criteria == 'min_samples_leaf':
            model = Pipeline(steps=[
                ('scaler', StandardScaler()),
                ('regression', DecisionTreeRegressor(
                    max_depth=best_depth,
                    min_samples_leaf=param_value,
                    min_samples_split=best_samples_split,
                    random_state=RANDOM_SEED))
            ])
            param = {
                'regression__max_depth': best_depth,
                'regression__min_samples_leaf': param_value,
                'regression__min_samples_split': best_samples_split
            }
        else:
            model = Pipeline(steps=[
                ('scaler', StandardScaler()),
                ('regression', DecisionTreeRegressor(
                    max_depth=best_depth,
                    min_samples_leaf=best_samples_leaf,
                    min_samples_split=param_value,
                    random_state=RANDOM_SEED))
            ])
            param = {
                'regression__max_depth': best_depth,
                'regression__min_samples_leaf': best_samples_leaf,
                'regression__min_samples_split': param_value
            }

        # Fit model and evaluate results
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        index = param_list.index(param)
        training_err = training_err_list[index]
        training_dev = training_dev_list[index]
        (training_mse, test_mse, rmse, mae, r2) = get_test_metrics(training_err, y_test, prediction)
        # Store metrics
        training_err_by_criteria.append(training_mse)
        training_dev_by_criteria.append(training_dev)
        test_err_list.append(test_mse)
        # Set additional metrics for the best combination
        if index == best_index:
            rmse_score = rmse
            mae_score = mae
            r2_score = r2

    # Generate the plots
    empty_img_folder()
    plot_errors(criteria, param_values, training_err_by_criteria,
                training_dev_by_criteria, test_err_list)

    # Once hyperparameters are selected, train and save the best model
    if save_model:
        print("\nEvaluation finished. Training final model with train + test data "
              "with the best hyperparameters...")
        final_model = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('regression', DecisionTreeRegressor(
                max_depth=param_list[best_index]['regression__max_depth'],
                min_samples_leaf=param_list[best_index]['regression__min_samples_leaf'],
                min_samples_split=param_list[best_index]['regression__min_samples_split']))
        ])

        # Train the best model with all the data (training + test)
        full_X = np.vstack((X_train, X_test))
        full_y = np.concatenate((y_train, y_test))
        final_model.fit(full_X, full_y)

        # Get a barplot with feature importances
        feature_importances = final_model.named_steps['regression'].feature_importances_
        plot_feature_importances(feature_importances, variable_names)

        # Create a visual representation of the tree and convert it to PNG
        tree_graph = tree.export_graphviz(final_model.named_steps['regression'],
                                          out_file='/tmp/tree.dot', max_depth=4)
        (graph,) = pydot.graph_from_dot_file('/tmp/tree.dot')
        graph.write_png('./img/tree.png')

        # Log plots and model with mlflow
        mlflow.log_artifacts('./img')
        mlflow.sklearn.log_model(final_model, 'model')

    # Log results with mlflow
    mlflow.log_metric("training_mse", training_err_list[best_index])
    mlflow.log_metric("test_mse", min(test_err_list))
    mlflow.log_metric("rmse", rmse_score)
    mlflow.log_metric("mae", mae_score)
    mlflow.log_metric("r2", r2_score)
    mlflow.set_tag("best_params", param_list[best_index])

    # Output the results
    print(f'''
-----------------------------------------------------------------------------------------------------------------------
RESULTS
-----------------------------------------------------------------------------------------------------------------------
Best params: {param_list[best_index]}
Training MSE: {training_err_list[best_index]}
Test MSE: {min(test_err_list)}
RMSE: {rmse_score}
MAE: {mae_score}
R2: {r2_score}
-----------------------------------------------------------------------------------------------------------------------
''')
def xgb_reg(n_estimators=100, max_depth=6, learning_rate=0.05, k=5,
            train_data_path='../data/training_data.csv',
            save_model=False, tracking_uri="http://0.0.0.0:5000"):
    # Log the parameters with mlflow
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("learning_rate", learning_rate)
    mlflow.set_tag("k", k)

    # Set random seed for reproducibility
    np.random.seed(RANDOM_SEED)
    random.seed(RANDOM_SEED)

    # Get data shuffled and split into training and test sets
    mdr = MiningDataReader(path=train_data_path)
    (variable_names, X_train, X_test, y_train, y_test) = mdr.get_splitted_data()

    pipeline = Pipeline(steps=[
        ('scaling', StandardScaler()),
        ('regression', xgb.XGBRegressor(objective="reg:squarederror", seed=RANDOM_SEED))
    ])

    ### TRAINING ###
    ################

    # Generate grid search for hyperparam tuning
    hyperparams = {}
    hyperparams['regression__n_estimators'] = np.arange(
        n_estimators[0], n_estimators[1], n_estimators[2])
    hyperparams['regression__max_depth'] = np.arange(
        max_depth[0], max_depth[1], max_depth[2])
    hyperparams['regression__learning_rate'] = learning_rate

    print("Training started...\n")

    # Create an instance of the XGBoost regressor pipeline and fit the data for
    # the grid parameters using all processors
    modelCV = GridSearchCV(estimator=pipeline, param_grid=hyperparams, cv=k,
                           scoring='neg_mean_squared_error', n_jobs=-1)
    with ProgressBar():
        modelCV.fit(X_train, y_train)

    # Iterate over the results storing training error for each hyperparameter combination
    results = modelCV.cv_results_
    param_list, training_err_list, training_dev_list = [], [], []
    for i in range(len(results['params'])):
        param = results['params'][i]
        score = (-1) * results['mean_test_score'][i]  # negative MSE
        std = results['std_test_score'][i]
        param_list.append(param)
        training_err_list.append(score)
        training_dev_list.append(std)

    best_params = modelCV.best_params_
    print(f"\nBest parameter set found for the training set:\n{best_params}")

    # Store the index of the best combination
    best_index = param_list.index(best_params)

    # Get the best values for hyperparams
    best_n_estimators = best_params['regression__n_estimators']
    best_max_depth = best_params['regression__max_depth']
    best_learning_rate = best_params['regression__learning_rate']

    print("\nTraining finished. Evaluating model...\n")

    ### EVALUATION ###
    ##################

    # Criteria is n_estimators
    criteria = 'n_estimators'
    mlflow.set_tag("criteria", criteria)
    param_values = hyperparams['regression__n_estimators']

    # Predict test data varying the criteria param and evaluate the models
    training_err_by_criteria, training_dev_by_criteria, test_err_list = [], [], []
    rmse_score, mae_score, r2_score = -1, -1, -1
    feature_names, feature_importances = [], []
    for param_value in tqdm(param_values):
        model = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('regression', xgb.XGBRegressor(objective="reg:squarederror",
                                            n_estimators=param_value,
                                            max_depth=best_max_depth,
                                            learning_rate=best_learning_rate))
        ])
        param = {
            'regression__n_estimators': param_value,
            'regression__max_depth': best_max_depth,
            'regression__learning_rate': best_learning_rate
        }

        # Fit model and evaluate results
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        index = param_list.index(param)
        training_err = training_err_list[index]
        training_dev = training_dev_list[index]
        (training_mse, test_mse, rmse, mae, r2) = get_test_metrics(training_err, y_test, prediction)
        # Store metrics
        training_err_by_criteria.append(training_mse)
        training_dev_by_criteria.append(training_dev)
        test_err_list.append(test_mse)
        # Set additional metrics for the best combination
        if index == best_index:
            rmse_score = rmse
            mae_score = mae
            r2_score = r2

    # Generate the plots
    empty_img_folder()
    plot_errors(criteria, param_values, training_err_by_criteria,
                training_dev_by_criteria, test_err_list)

    # Once hyperparameters are selected, train and save the best model
    if save_model:
        print("\nEvaluation finished. Training final model with train + test data "
              "with the best hyperparameters...")
        final_model = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('regression', xgb.XGBRegressor(objective="reg:squarederror",
                                            n_estimators=best_n_estimators,
                                            max_depth=best_max_depth,
                                            learning_rate=best_learning_rate))
        ])

        # Train the best model with all the data (training + test)
        full_X = np.vstack((X_train, X_test))
        full_y = np.concatenate((y_train, y_test))
        final_model.fit(full_X, full_y)

        # Plot importances and final tree
        ax = xgb.plot_importance(final_model.named_steps['regression'])
        fig = ax.figure
        fig.savefig('./img/importances.png', bbox_inches='tight')
        plt.close(fig)

        ax = xgb.plot_tree(final_model.named_steps['regression'], rankdir='LR')
        fig = ax.figure
        fig.set_size_inches(30, 15)
        fig.savefig('./img/tree.png', dpi=400, bbox_inches='tight')
        plt.close(fig)

        # Log plots and model with mlflow
        mlflow.log_artifacts('./img')
        mlflow.sklearn.log_model(final_model, 'model')

    # Log results with mlflow
    mlflow.log_metric("train_mse", training_err_list[best_index])
    mlflow.log_metric("test_mse", min(test_err_list))
    mlflow.log_metric("rmse", rmse_score)
    mlflow.log_metric("mae", mae_score)
    mlflow.log_metric("r2", r2_score)
    mlflow.set_tag("best_params", param_list[best_index])

    # Output the results
    print(f'''
-----------------------------------------------------------------------------------------------------------------------
RESULTS
-----------------------------------------------------------------------------------------------------------------------
Best params: {param_list[best_index]}
Training MSE: {training_err_list[best_index]}
Test MSE: {min(test_err_list)}
RMSE: {rmse_score}
MAE: {mae_score}
R2: {r2_score}
-----------------------------------------------------------------------------------------------------------------------
''')
def train_model(x_train, x_test, y_train, alphas, l1_ratios, n_folds=5, max_iter=1000):
    """
    Build the logic and sklearn pipelines to train x matrix based on input y

    Arguments:
    x_train - pandas DataFrame of feature matrix for training data
    x_test - pandas DataFrame of feature matrix for testing data
    y_train - pandas DataFrame of processed y matrix (output from align_matrices())
    alphas - list of alphas to perform cross validation over
    l1_ratios - list of l1 mixing parameters to perform cross validation over
    n_folds - int of how many folds of cross validation to perform
    max_iter - the maximum number of iterations to test until convergence

    Output:
    The full pipeline sklearn object and y matrix predictions for training,
    testing, and cross validation
    """
    # Setup the classifier parameters
    clf_parameters = {
        "classify__loss": ["log"],
        "classify__penalty": ["elasticnet"],
        "classify__alpha": alphas,
        "classify__l1_ratio": l1_ratios,
    }

    estimator = Pipeline(steps=[(
        "classify",
        SGDClassifier(
            random_state=0,
            class_weight="balanced",
            loss="log",
            max_iter=max_iter,
            tol=1e-3,
        ),
    )])

    cv_pipeline = GridSearchCV(
        estimator=estimator,
        param_grid=clf_parameters,
        n_jobs=-1,
        cv=n_folds,
        scoring="roc_auc",
        return_train_score=True,
    )

    # Fit the model
    cv_pipeline.fit(X=x_train, y=y_train.status)

    # Obtain cross validation results
    y_cv = cross_val_predict(
        cv_pipeline.best_estimator_,
        X=x_train,
        y=y_train.status,
        cv=n_folds,
        method="decision_function",
    )

    # Get all performance results
    y_predict_train = cv_pipeline.decision_function(x_train)
    y_predict_test = cv_pipeline.decision_function(x_test)

    return cv_pipeline, y_predict_train, y_predict_test, y_cv
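# --- Usage sketch (not part of the original source). Exercises train_model()
# on a synthetic binary problem; the target frame must expose a `status`
# column because the function accesses y_train.status. All names below are
# illustrative.
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

X_all, y_all = make_classification(n_samples=300, n_features=20, random_state=0)
x_train_demo = pd.DataFrame(X_all[:200])
x_test_demo = pd.DataFrame(X_all[200:])
y_train_demo = pd.DataFrame({"status": y_all[:200]})
# cv_pipeline, y_pred_train, y_pred_test, y_cv = train_model(
#     x_train_demo, x_test_demo, y_train_demo,
#     alphas=[0.1, 0.01], l1_ratios=[0.15, 0.5], n_folds=3)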
from sklearn.datasets import load_digits
import sklearn.metrics as skmetrics
from sklearn.svm import SVC

run = Run.get_context()

# For a bigger dataset, try
# x, y = make_classification(5000, n_classes=16, n_informative=13)
digits = load_digits()
x = digits.data
y = digits.target

# Obtain cross validation performance for a single parameter setting:
param_space = {'C': [1]}
model = SVC(kernel='rbf')

if args.cv:
    search = GridSearchCV(model, param_space, scoring=['accuracy'],
                          refit=False, cv=args.cv)
    search.fit(x, y)
    run.log('accuracy_mean', search.cv_results_['mean_test_accuracy'][0])
    run.log('accuracy_std', search.cv_results_['std_test_accuracy'][0])
else:
    x_train, x_test, y_train, y_test = train_test_split(x, y)
    model.fit(x_train, y_train)
    y_test_pred = model.predict(x_test)
    run.log('accuracy', skmetrics.accuracy_score(y_test, y_test_pred))
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from dask_ml.model_selection import GridSearchCV
from dask.distributed import Client
import joblib  # sklearn.externals.joblib has been removed from recent scikit-learn releases


def simple_nn(hidden_neurons):
    model = Sequential()
    model.add(Dense(hidden_neurons, activation='relu', input_dim=30))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop',
                  metrics=['accuracy'])
    return model


param_grid = {'hidden_neurons': [100, 200, 300]}

if __name__ == '__main__':
    client = Client()
    cv = GridSearchCV(KerasClassifier(build_fn=simple_nn, epochs=100), param_grid)
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    with joblib.parallel_backend("dask", scatter=[X_train, y_train]):
        cv.fit(X_train, y_train)
    print(f'Best Accuracy for {cv.best_score_:.4} using {cv.best_params_}')
def optimize_it(query, truthfile, db, function, taxlevel=6, n_jobs=-1,
                grid='simple', **kwargs):
    taxon = taxlevels[taxlevel]
    prefix = os.path.basename(query)
    prefix = os.path.splitext(prefix)[0]
    mock = prefix.split('_')[0]
    name = function.__name__.split('_')[1]
    pickle_pref = '%s_%s_%s' % (prefix, name, taxon)
    print('Optimizing %s in %s' % (name, taxon))

    if os.path.isfile('%s.pckl' % pickle_pref):
        print('Loading previous run')
        with open('%s.pckl' % pickle_pref, 'rb') as p:
            d = dill.load(p)
    else:
        X, y, truth = fasta2array(query, truthfile)
        params = product(*[kv for kv in kwargs.values()])
        pnames = [x[0] for x in product(kwargs)]

        if os.path.isfile('%s_split.pckl' % prefix):
            with open('%s_split.pckl' % prefix, 'rb') as f:
                X_train, X_test, y_train, y_test = dill.load(f)
        else:
            X_train, X_test, y_train, y_test = split_train(
                X.copy(deep=True), y.copy(deep=True))
            with open('%s_split.pckl' % prefix, 'wb') as f:
                dill.dump((X_train, X_test, y_train, y_test), f)

        query_train = array2fasta(X_train)
        query_test = array2fasta(X_test)
        prefix = query[query.rfind('.fa')]
        f_param = dict(db=db, taxlevel=taxlevel, asfile=False, prefix=prefix,
                       query=query_train)

        if grid == 'simple':
            asfile = False
            if os.path.isfile('%s_training.tsv' % pickle_pref):
                scores = pd.read_csv('%s_training.tsv' % pickle_pref, sep='\t')
            else:
                tax = taxlevels[taxlevel]
                delayed_results = [
                    dask.delayed(compute_simple_grid(function, i, pnames,
                                                     f_param, y_train, tax))
                    for i in tqdm(params)
                ]
                with ProgressBar():
                    scores = dask.compute(*delayed_results)
                scores = pd.concat(scores)
                scores.to_csv('%s_training.tsv' % pickle_pref, sep='\t')

            highcols = ['MCC']
            if "hmmufotu" in name:
                highcols = ['F1S']
            if 'blast' in name:
                highcols += ['evalue', 'max_target_seqs']
                lowcols = ['p_id']
            elif 'lca' in name:
                highcols += ['evalue']
                lowcols = ['p_id', 'm_hit', 'p_hit']
            else:
                lowcols = None

            best = scores.nlargest(1, columns=highcols, keep='all')
            if lowcols is not None:
                best = best.nsmallest(1, columns=lowcols)
            best = best.reset_index(drop=True)
            try:
                best = best.loc[0, pnames].to_dict()
            except TypeError:
                print(best)
                raise
        else:
            print(' Performing Grid Search CV')
            asfile = True
            with ProgressBar():
                tuned_parameters = [kwargs]
                p = dict(program=function, db=db, taxlevel=taxlevel,
                         asfile=False, prefix=prefix)
                c = GridSearchCV(Estimator(**p), tuned_parameters, cv=3,
                                 n_jobs=n_jobs, scoring=mathews_scorer).fit(X, y)
            with open('%s_trainingres.pckl' % prefix, 'wb') as f:
                dill.dump(c.cv_results_, f)
            best = c.best_params_

        print('Processing best parameters in %s' % name)
        print(best)
        with ResourceProfiler(dt=0.01) as prof:
            start = time.time()
            results = function(prefix=prefix, db=db, query=query_test,
                               asfile=asfile, taxlevel=taxlevel, **best)
            elapsed = time.time() - start

        results = results.replace(r'^\s*$', np.nan, regex=True)
        with open('%s_cvresults.pckl' % pickle_pref, 'wb') as q:
            dill.dump((results, y), q)

        # Rename the profiler's own 'time' column before averaging so the
        # wall-clock time recorded below does not clobber it
        resources = pd.DataFrame(data=prof.results).rename(
            columns={'time': 'timestamp'}).mean()
        resources['time'] = elapsed

        sc = [score(results, y_test, resources, taxa) for taxa in taxlevels.values()]
        d = pd.DataFrame(data=sc)
        d['Method'] = name
        d['Mock'] = mock
        print(d)
        with open('%s.pckl' % pickle_pref, 'wb') as p:
            dill.dump(d, p)
    return d
n_sig = y_train[y_train == 1].shape[0]
n_bkg = y_train[y_train == 0].shape[0]
spw = n_bkg / n_sig

n_sig = y[y == 1].shape[0]
n_bkg = y[y == 0].shape[0]
spw = n_bkg / n_sig
print(spw)

search_parameters = {
    "learning_rate": [0.02, 0.05, 0.1],
    "num_leaves": [20, 50, 150, 200],
    "min_child_samples": [40, 60, 100, 160, 240],
    "max_depth": [3, 4, 5, 6, 7, 8],
}

clf = lgbm.LGBMClassifier(boosting_type="gbdt", scale_pos_weight=spw,
                          n_estimators=1000)

fit_params = {
    "early_stopping_rounds": 15,
    "eval_metric": "auc",
    "eval_set": [(X_test, y_test)],
    "eval_sample_weight": [w_test],
}

search = GridSearchCV(clf, param_grid=search_parameters, cv=2)
search.fit(df, y, **fit_params)
# modified from https://github.com/amueller/scipy-2018-sklearn/blob/master/notebooks/15.Pipelining_Estimators.ipynb
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from dask_ml.model_selection import GridSearchCV
from dask.distributed import Client
from sklearn.pipeline import make_pipeline
from dask_ml.preprocessing import StandardScaler
from dask_ml.linear_model import LogisticRegression

if __name__ == "__main__":
    client = Client()

    data = Path('./data')
    df = pd.read_csv(data / "01_heights_weights_genders.csv")
    y = 1 * (df.Gender == "Male").values
    X = df[['Height', 'Weight']].values
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    pipeline = make_pipeline(StandardScaler(), LogisticRegression())
    grid = GridSearchCV(pipeline,
                        param_grid={'logisticregression__C': [.1, 1, 10, 100]},
                        cv=5)
    grid.fit(X_train, y_train)
    print("Score", grid.score(X_test, y_test))