def train(fpath, max_depth, max_features, n_estimators):
    """
    Fit a random forest on the data at ``fpath``, score it on the held-out
    split and record the run with MLflow.

    :param fpath: Path or URL for the training data; handed to ``load_data``,
                  which supplies the train/test features and labels.
    :param max_depth: RF max_depth parameter
    :param max_features: RF max_features parameter
    :param n_estimators: RF n_estimators parameter
    :return: trained model
    """
    hyperparams = {
        "max_depth": max_depth,
        "max_features": max_features,
        "n_estimators": n_estimators,
    }

    features_fit, features_eval, labels_fit, labels_eval = load_data(fpath)

    model = RandomForestClassifier(**hyperparams)
    model.fit(features_fit, labels_fit)
    accuracy = accuracy_score(labels_eval, model.predict(features_eval))

    # Hyperparameters are logged as strings, then the test accuracy and the
    # fitted model itself (stored under "saved_models").
    mlflow.log_params({key: str(value) for key, value in hyperparams.items()})
    mlflow.log_metric("accuracy", accuracy)
    mlflow.sklearn.log_model(model, "saved_models")

    return model
def GridSearch_random_forest(X_train, y_train):
    """
    Grid-search random forest hyperparameters with 5-fold cross-validation.

    Every combination of ``n_estimators`` in (128, 256, 512, 1024) and
    ``max_features`` in ('sqrt', 'log2') is evaluated. For each combination
    the out-of-fold predictions of all five folds are pooled and accuracy,
    F1 and ROC AUC are computed once on the pooled predictions (the column
    names say ``mean_*`` for historical reasons; they are not per-fold
    averages).

    :param X_train: feature table exposing ``to_numpy()``
    :param y_train: label column exposing ``to_numpy()``
    :return: DataFrame with one row per hyperparameter combination and the
             columns ``n_estimators`` (int), ``max_features`` (str),
             ``mean_accuracy``, ``mean_f1`` and ``mean_auc`` (floats)
    """
    # Encode as float32
    features = X_train.to_numpy().astype('float32')
    labels = y_train.to_numpy().astype('float32')

    # Init Kfolds
    folds = KFold(n_splits=5)

    # Init hyperparam vals
    n_estimators_lst = [128, 256, 512, 1024]
    max_features_lst = ['sqrt', 'log2']

    columns = [
        'n_estimators', 'max_features', 'mean_accuracy', 'mean_f1', 'mean_auc'
    ]
    records = []

    # Run GridSearch for all hyperparam combos
    for n_estimators in n_estimators_lst:
        for max_features in max_features_lst:
            clf = RandomForestClassifier(n_estimators=n_estimators,
                                         max_features=max_features)
            predicted_y = []
            true_y = []

            # Run CV, collecting the out-of-fold predictions of every fold
            for train, holdout in folds.split(features):
                clf.fit(features[train], labels[train])
                predicted_y.append(clf.predict(features[holdout]))
                true_y.append(labels[holdout])

            predicted_y = np.concatenate(predicted_y)
            true_y = np.concatenate(true_y)

            # NOTE(review): AUC is computed from hard class predictions, not
            # from scores/probabilities -- confirm this is intended.
            records.append({
                'n_estimators': n_estimators,
                'max_features': max_features,
                'mean_accuracy': float(accuracy_score(true_y, predicted_y)),
                'mean_f1': float(f1_score(true_y, predicted_y)),
                'mean_auc': float(roc_auc_score(true_y, predicted_y)),
            })

    # Build the frame from records rather than from a single NumPy array:
    # packing ints, strings and floats into one array turns every value into
    # a string, which made the metric columns sort lexicographically.
    results = pd.DataFrame(records, columns=columns)
    results['n_estimators'] = results['n_estimators'].astype(int)
    return results
def train_and_eval(X_param, y_param, max_depth=16, n_estimators=100):
    """
    Fit a random forest on a train split and report validation accuracy.

    The data is divided with ``train_test_split`` using its default split
    proportions and a fixed ``random_state`` of 77, so the same rows land in
    the validation set on every call.

    :param X_param: feature data
    :param y_param: labels aligned with ``X_param``
    :param max_depth: RF max_depth parameter
    :param n_estimators: RF n_estimators parameter
    :return: accuracy of the fitted model on the validation split
    """
    splits = train_test_split(X_param, y_param, random_state=77)
    features_fit, features_holdout, labels_fit, labels_holdout = splits

    forest = RandomForestClassifier(max_depth=max_depth,
                                    n_estimators=n_estimators)
    forest.fit(features_fit, labels_fit)

    return accuracy_score(labels_holdout, forest.predict(features_holdout))
def _train(params, fpath, hyperopt=False):
    """
    Train a random forest, log the run to MLflow and report the outcome.

    :param params: hyperparameters, unpacked as
                   ``(max_depth, max_features, n_estimators)``. Its structure
                   is consistent with how the search space is defined.
    :param fpath: Path or URL for the training data used with the model.
    :param hyperopt: Use hyperopt for hyperparameter search during training.
    :return: the fitted model when ``hyperopt`` is false; otherwise a dict
             with fields 'loss' (negated test accuracy, because hyperopt
             minimizes the loss) and 'status' (success/failure status of run)
    """
    raw_depth, raw_features, raw_estimators = params
    max_depth = int(raw_depth)
    max_features = float(raw_features)
    n_estimators = int(raw_estimators)

    # Log all of our training parameters for this run.
    pyver = sys.version_info
    mlparams = {
        'cudf_version': str(cudf.__version__),
        'cuml_version': str(cuml.__version__),
        'max_depth': str(max_depth),
        'max_features': str(max_features),
        'n_estimators': str(n_estimators),
        'python_version': f"{pyver[0]}.{pyver[1]}.{pyver[2]}.{pyver[3]}",
    }
    mlflow.log_params(mlparams)

    X_train, X_test, y_train, y_test = load_data(fpath)

    mod = RandomForestClassifier(max_depth=max_depth,
                                 max_features=max_features,
                                 n_estimators=n_estimators)
    mod.fit(X_train, y_train)
    acc = accuracy_score(y_test, mod.predict(X_test))

    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(mod, "saved_models")

    if not hyperopt:
        return mod

    # hyperopt minimizes 'loss'. Returning the raw accuracy would steer the
    # search toward the *least* accurate model, so negate it.
    return {"loss": -acc, "status": STATUS_OK}
def _train(params, fpath, hyperopt=False):
    """
    Train a random forest, log the run to MLflow and report the outcome.

    :param params: hyperparameters, unpacked as
                   ``(max_depth, max_features, n_estimators)``. Its structure
                   is consistent with how the search space is defined.
    :param fpath: Path or URL for the training data used with the model.
    :param hyperopt: Use hyperopt for hyperparameter search during training.
    :return: the fitted model when ``hyperopt`` is false; otherwise a dict
             with fields 'loss' (negated test accuracy, because hyperopt
             minimizes the loss) and 'status' (success/failure status of run)
    """
    raw_depth, raw_features, raw_estimators = params
    max_depth = int(raw_depth)
    max_features = float(raw_features)
    n_estimators = int(raw_estimators)

    X_train, X_test, y_train, y_test = load_data(fpath)

    mod = RandomForestClassifier(max_depth=max_depth,
                                 max_features=max_features,
                                 n_estimators=n_estimators)
    mod.fit(X_train, y_train)
    acc = accuracy_score(y_test, mod.predict(X_test))

    # Record the (already cast) hyperparameters, the score and the model.
    mlparams = {
        "max_depth": str(max_depth),
        "max_features": str(max_features),
        "n_estimators": str(n_estimators),
    }
    mlflow.log_params(mlparams)
    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(mod, "saved_models")

    if not hyperopt:
        return mod

    # hyperopt minimizes 'loss'. Returning the raw accuracy would steer the
    # search toward the *least* accurate model, so negate it.
    return {'loss': -acc, 'status': STATUS_OK}
# Fit the forest on the training data; `t0` is expected to have been set
# just before this point so the elapsed training time can be reported.
clf_rf = RandomForestClassifier(max_depth=max_depth,
                                n_estimators=n_estimators)
print(clf_rf)
clf_rf.fit(cu_X_train, cu_y_train)
print('Learning done in {:.2f} seconds'.format(time() - t0))

# ### Inference
#
# GPU-based inference is used to predict the classes of the test data.

print()
print('Inference begins')
t0 = time()

# The model returns numeric class codes; shifting them by ord('A') and
# applying chr() turns each code into its letter label.
pred_rf = clf_rf.predict(X_test, predict_model='GPU')
pred_rf = np.array([chr(code) for code in pred_rf + ord('A')])

print('Inference done in {:.2f} seconds'.format(time() - t0))
print()

summary_template = 'Predicted {} digits with accuracy: {:.4f}'
print(summary_template.format(len(pred_rf), accuracy_score(y_test, pred_rf)))
print()

# #### Confusion matrix, accuracy, precision, and recall
#
# The confusion matrix shows which classes get mixed up the most, and the
# classification accuracies can be inspected separately for each class:
def _best_forest(results, metric):
    """Build an unfitted forest from the grid-search row that maximizes ``metric``."""
    # Coerce to float before ranking: the grid-search frame may carry its
    # metric columns as strings, which would otherwise sort lexicographically.
    ranked = results.assign(**{metric: results[metric].astype(float)})
    best = ranked.sort_values(by=metric, ascending=False).iloc[0]
    return RandomForestClassifier(n_estimators=int(best.n_estimators),
                                  max_features=best.max_features)


def run_random_forest(scaled_df, n_trials=5, train_size=5000):
    """
    Repeatedly tune, fit and score random forests on random splits.

    For each trial the data is split, ``GridSearch_random_forest`` is run on
    the training part, and three classifiers are built from the rows with
    the best accuracy, F1 and AUC respectively. Each classifier is refit on
    the full training part and scored with "its" metric on both the training
    and the test part.

    :param scaled_df: DataFrame whose label column is ``y``; every column but
                      the last one is used as a feature
                      (assumes ``y`` is the last column -- TODO confirm)
    :param n_trials: number of independent split/tune/fit rounds
    :param train_size: ``train_size`` forwarded to ``train_test_split``
    :return: tuple ``(raw_train_df, raw_test_df)``; each has one row per
             trial and the columns 'accuracy', 'f1' and 'auc'
    """
    metric_columns = ['accuracy', 'f1', 'auc']
    raw_train_arr = []
    raw_test_arr = []

    for _ in range(n_trials):
        # Split data into train and test
        X_train, X_test, y_train, y_test = train_test_split(
            scaled_df.iloc[:, :-1], scaled_df.y, train_size=train_size)

        # Run GridSearch and build one classifier per selection metric
        results = GridSearch_random_forest(X_train, y_train)
        opt_acc_clf = _best_forest(results, 'mean_accuracy')
        opt_f1_clf = _best_forest(results, 'mean_f1')
        opt_auc_clf = _best_forest(results, 'mean_auc')

        # Encode as float32 for cuML
        X_train_np = X_train.to_numpy().astype('float32')
        y_train_np = y_train.to_numpy().astype('float32')
        X_test_np = X_test.to_numpy().astype('float32')
        y_test_np = y_test.to_numpy().astype('float32')

        # Fit clfs
        for clf in (opt_acc_clf, opt_f1_clf, opt_auc_clf):
            clf.fit(X_train_np, y_train_np)

        # Get train and test metrics
        raw_train_arr.append([
            float(opt_acc_clf.score(X_train_np, y_train_np)),
            float(f1_score(y_train_np, opt_f1_clf.predict(X_train_np))),
            float(roc_auc_score(y_train_np, opt_auc_clf.predict(X_train_np))),
        ])
        raw_test_arr.append([
            float(opt_acc_clf.score(X_test_np, y_test_np)),
            float(f1_score(y_test_np, opt_f1_clf.predict(X_test_np))),
            float(roc_auc_score(y_test_np, opt_auc_clf.predict(X_test_np))),
        ])

    raw_train_df = pd.DataFrame(data=raw_train_arr, columns=metric_columns)
    raw_test_df = pd.DataFrame(data=raw_test_arr, columns=metric_columns)
    return raw_train_df, raw_test_df
class CUMLTrainable(tune.Trainable):
    """Tune trainable that fits a GPU random forest and reports its accuracy.

    The training and test data are read from the module-level names
    ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    """

    @staticmethod
    def _first_visible_gpu():
        """Return the first entry of CUDA_VISIBLE_DEVICES as an integer index.

        Environment values are strings (possibly a comma-separated list), and
        a string cannot index the list returned by ``GPUtil.getGPUs()``.
        Falls back to 0 when the variable is unset, empty or not numeric.
        """
        visible = str(os.environ.get("CUDA_VISIBLE_DEVICES", 0))
        first = visible.split(",")[0].strip()
        try:
            return int(first)
        except ValueError:
            return 0

    @staticmethod
    def _build_model(config):
        """Create the classifier, casting every hyperparameter to ``int``."""
        return GPURandomForestClassifier(
            **{k: int(v) for k, v in config.items()})

    def _gpu_memory_used(self):
        """Return a fresh ``memoryUsed`` reading for this trainable's GPU."""
        import GPUtil

        # GPUtil objects are snapshots, so query again for every reading.
        return GPUtil.getGPUs()[self._gpu_id].memoryUsed

    def _setup(self, config):
        """Move the data to the GPU and build the model from ``config``."""
        self._gpu_id = self._first_visible_gpu()
        print("Starting new trainable on {}.".format(self._gpu_id))

        # The file lock lets only one trainable at a time run this setup.
        with FileLock(os.path.expanduser("~/.tune.gpulock")):
            X_cudf_train = cudf.DataFrame.from_pandas(X_train)
            self.train_mat = X_cudf_train.as_gpu_matrix(order="F")
            del X_cudf_train
            self.X_cudf_test = cudf.DataFrame.from_pandas(X_test)
            self.y_cudf_train = cudf.Series(y_train.values)
            self.y_test = y_test
            self.cuml_model = self._build_model(config)

    def _train(self):
        """Fit on the training data and return the test accuracy."""
        self.cuml_model.fit(self.train_mat, self.y_cudf_train)
        fil_preds_orig = self.cuml_model.predict(self.X_cudf_test)
        accuracy = accuracy_score(self.y_test, fil_preds_orig)
        return {"mean_accuracy": accuracy}

    def _stop(self):
        """Drop the model and data, printing GPU memory use before each step."""
        print("Deleting the model. Mem: {:0.3f}".format(
            self._gpu_memory_used()))
        del self.cuml_model
        print("Deleting the test set. Mem: {:0.3f}".format(
            self._gpu_memory_used()))
        del self.X_cudf_test
        print("Deleting the test labels. Mem: {:0.3f}".format(
            self._gpu_memory_used()))
        del self.y_test
        print("Deleting the training labels. Mem: {:0.3f}".format(
            self._gpu_memory_used()))
        del self.y_cudf_train
        print("Deleting the training matrix. Mem: {:0.3f}".format(
            self._gpu_memory_used()))
        del self.train_mat

    def _wait_for_gpus(self, retry=10):
        """Poll until the GPU reports (almost) no used memory.

        Checks at most ``retry`` times, five seconds apart, and stops early
        once the reading is at or below 0.1. A final five-second pause is
        kept in every case.
        """
        import time

        for _ in range(int(retry)):
            used = self._gpu_memory_used()
            if used <= 0.1:
                break
            print("Waiting for GPU memory to free. Mem: {:0.3f}".format(used))
            time.sleep(5)
        time.sleep(5)

    def reset_config(self, config):
        """Replace the model with one built from ``config``; returns True."""
        # Validate the new hyperparameters before discarding the old model so
        # a bad config does not leave the trainable without a model.
        int_config = {k: int(v) for k, v in config.items()}
        del self.cuml_model
        self.cuml_model = GPURandomForestClassifier(**int_config)
        return True
import pickle

from datasets import prepare_dataset
from cuml.ensemble import RandomForestClassifier as GPURandomForestClassifier

# Load the "airline" dataset; the returned object carries the train/test
# features and labels as attributes.
data = prepare_dataset("/data", "airline", None)
X_train, X_test = data.X_train, data.X_test
y_train = data.y_train.astype(np.int32)
y_test = data.y_test.astype(np.int32)

# NOTE: despite its name, QUARTER is *half* the number of training rows;
# only the second half of the training data is kept.
QUARTER = len(X_train) // 2
X_train, y_train = X_train[QUARTER:], y_train[QUARTER:]

# Move the data to the GPU. The intermediate training DataFrame is not kept;
# only its column-major ("F" order) matrix is.
train_mat = cudf.DataFrame.from_pandas(X_train).as_gpu_matrix(order="F")
X_cudf_test = cudf.DataFrame.from_pandas(X_test)
y_cudf_train = cudf.Series(y_train.values)

# Fit with fixed hyperparameters and score on the test split.
cuml_model = GPURandomForestClassifier(max_features=1.0,
                                       max_depth=19,
                                       n_estimators=467)
cuml_model.fit(train_mat, y_cudf_train)

fil_preds_orig = cuml_model.predict(X_cudf_test)
fil_acc_orig = accuracy_score(y_test, fil_preds_orig)