def evaluate_test_perf(self, trained_model, X_test, y_test, threshold=0.5):
    """
    Evaluates model performance on the test set.

    For XGBoost we need to build a DMatrix before we can evaluate the
    model. For single-GPU Random Forest we can simply call the .score
    function; multi-GPU Random Forest needs to predict with the model
    and then compute the accuracy score.

    Parameters
    ----------
    trained_model :
        The trained model object, either XGBoost or RandomForest
    X_test : dataframe
        The data used for testing
    y_test : dataframe
        The labels used for testing
    threshold : float, default 0.5
        Probability cutoff for converting XGBoost scores to class labels

    Returns
    -------
    test_accuracy : float
        The accuracy achieved on the test set
    duration : float
        The time it took to evaluate the model
    """
    self.log_to_file('\n> Inferencing on test set')
    test_accuracy = None

    with PerfTimer() as inference_timer:
        try:
            if self.model_type == 'XGBoost':
                if 'multi' in self.compute_type:
                    test_DMatrix = xgboost.dask.DaskDMatrix(self.client,
                                                            data=X_test,
                                                            label=y_test)
                    xgb_pred = xgboost.dask.predict(self.client,
                                                    trained_model,
                                                    test_DMatrix).compute()
                    xgb_pred = (xgb_pred > threshold) * 1.0
                    test_accuracy = accuracy_score(y_test.compute(), xgb_pred)
                elif 'single' in self.compute_type:
                    test_DMatrix = xgboost.DMatrix(data=X_test, label=y_test)
                    xgb_pred = trained_model.predict(test_DMatrix)
                    xgb_pred = (xgb_pred > threshold) * 1.0
                    test_accuracy = accuracy_score(y_test, xgb_pred)

            elif self.model_type == 'RandomForest':
                if 'multi' in self.compute_type:
                    cuml_pred = trained_model.predict(X_test).compute()
                    self.log_to_file('\n\tPrediction complete')
                    test_accuracy = accuracy_score(y_test.compute(), cuml_pred,
                                                   convert_dtype=True)
                elif 'single' in self.compute_type:
                    test_accuracy = trained_model.score(X_test,
                                                        y_test.astype('int32'))

        except Exception as error:
            self.log_to_file('\n\n!error during inference: ' + str(error))

    self.log_to_file(f'\n\tFinished inference in {inference_timer.duration:.4f} s')
    self.log_to_file(f'\n\tTest-accuracy: {test_accuracy}')
    return test_accuracy, inference_timer.duration
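# The method above relies on a PerfTimer context manager that isn't shown
# here. A minimal sketch of what it presumably looks like, assuming all it
# needs to do is record the wall-clock duration of its block and expose it
# as a .duration attribute once the block exits:
import time

class PerfTimer:
    """Context manager that records the wall-clock duration of its block."""

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Set on exit, so .duration is read after the `with` block closes.
        self.duration = time.perf_counter() - self.start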
def train(fpath, max_depth, max_features, n_estimators):
    """
    :param fpath: Path or URL for the training data used with the model.
    :param max_depth: RF max_depth parameter
    :param max_features: RF max_features parameter
    :param n_estimators: RF n_estimators parameter
    :return: trained model
    """
    X_train, X_test, y_train, y_test = load_data(fpath)

    mod = RandomForestClassifier(max_depth=max_depth,
                                 max_features=max_features,
                                 n_estimators=n_estimators)
    mod.fit(X_train, y_train)
    preds = mod.predict(X_test)
    acc = accuracy_score(y_test, preds)

    mlparams = {
        'max_depth': str(max_depth),
        'max_features': str(max_features),
        'n_estimators': str(n_estimators),
    }
    mlflow.log_params(mlparams)
    mlflow.log_metric('accuracy', acc)
    mlflow.sklearn.log_model(mod, 'saved_models')

    return mod
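# A minimal usage sketch for train() above, assuming an MLflow tracking URI
# is already configured and load_data() is importable in the same module.
# The data path, run name, and hyperparameter values are illustrative
# assumptions, not from the original source.
import mlflow

with mlflow.start_run(run_name='rf-baseline'):
    model = train('data/airline_small.parquet',
                  max_depth=10,
                  max_features=0.5,
                  n_estimators=200)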
def score(self, X, y, **kwargs):
    """
    Scoring function based on mean accuracy.

    Parameters
    ----------
    X : cudf.DataFrame
        Test samples on which we predict
    y : cudf.Series, device array, or numpy array
        Ground truth values for predict(X)

    Returns
    -------
    score : float
        Accuracy of self.predict(X) wrt. y (fraction where y == pred_y)
    """
    from cuml.metrics.accuracy import accuracy_score
    from cuml.utils import input_to_dev_array
    from numba import cuda  # needed for cuda.to_device below

    X_m = input_to_dev_array(X)[0]
    y_m = input_to_dev_array(y)[0]

    # Reuse the estimator's cuML handle when one exists.
    handle = self.handle if hasattr(self, 'handle') else None

    return accuracy_score(y_m, cuda.to_device(self.predict(X_m)),
                          handle=handle)
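# Hypothetical usage of score() above: evaluate a fitted estimator against a
# small cudf DataFrame with known labels. `clf` stands in for any fitted cuML
# estimator exposing this method; the feature values are made up.
import cudf

X_eval = cudf.DataFrame({'f0': [0.1, 0.9, 0.4], 'f1': [1.2, 0.3, 0.8]})
y_eval = cudf.Series([0, 1, 0])
print('mean accuracy:', clf.score(X_eval, y_eval))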
def _train(params, fpath, hyperopt=False):
    """
    :param params: hyperparameters. Its structure is consistent with how
                   search space is defined. See below.
    :param fpath: Path or URL for the training data used with the model.
    :param hyperopt: Use hyperopt for hyperparameter search during training.
    :return: trained model, or a dict with fields 'loss' (scalar loss) and
             'status' (success/failure status of run) when hyperopt is used
    """
    max_depth, max_features, n_estimators = params
    max_depth, max_features, n_estimators = (int(max_depth),
                                             float(max_features),
                                             int(n_estimators))

    # Log all of our training parameters for this run.
    # Note: sys.version_info[3] is the release level ('final', etc.), not a
    # version component, so only major.minor.micro is logged here.
    pyver = sys.version_info
    mlparams = {
        'cudf_version': str(cudf.__version__),
        'cuml_version': str(cuml.__version__),
        'max_depth': str(max_depth),
        'max_features': str(max_features),
        'n_estimators': str(n_estimators),
        'python_version': f'{pyver[0]}.{pyver[1]}.{pyver[2]}',
    }
    mlflow.log_params(mlparams)

    X_train, X_test, y_train, y_test = load_data(fpath)

    mod = RandomForestClassifier(max_depth=max_depth,
                                 max_features=max_features,
                                 n_estimators=n_estimators)
    mod.fit(X_train, y_train)
    preds = mod.predict(X_test)
    acc = accuracy_score(y_test, preds)

    mlflow.log_metric('accuracy', acc)
    mlflow.sklearn.log_model(mod, 'saved_models')

    if not hyperopt:
        return mod

    # hyperopt minimizes the loss, so negate accuracy before returning it.
    return {'loss': -acc, 'status': STATUS_OK}
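# The docstring's "see below" refers to the hyperopt search space. A sketch
# of one consistent definition -- a positional list matching the
# (max_depth, max_features, n_estimators) unpacking above. The ranges are
# illustrative assumptions, not from the original source.
from hyperopt import hp

search_space = [
    hp.uniform('max_depth', 5, 20),         # cast to int inside _train
    hp.uniform('max_features', 0.1, 1.0),   # cast to float inside _train
    hp.uniform('n_estimators', 150, 1000),  # cast to int inside _train
]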
def _train(params, fpath, hyperopt=False):
    """
    :param params: hyperparameters. Its structure is consistent with how
                   search space is defined. See below.
    :param fpath: Path or URL for the training data used with the model.
    :param hyperopt: Use hyperopt for hyperparameter search during training.
    :return: trained model, or a dict with fields 'loss' (scalar loss) and
             'status' (success/failure status of run) when hyperopt is used
    """
    max_depth, max_features, n_estimators = params
    max_depth, max_features, n_estimators = (int(max_depth),
                                             float(max_features),
                                             int(n_estimators))

    X_train, X_test, y_train, y_test = load_data(fpath)

    mod = RandomForestClassifier(max_depth=max_depth,
                                 max_features=max_features,
                                 n_estimators=n_estimators)
    mod.fit(X_train, y_train)
    preds = mod.predict(X_test)
    acc = accuracy_score(y_test, preds)

    mlparams = {
        'max_depth': str(max_depth),
        'max_features': str(max_features),
        'n_estimators': str(n_estimators),
    }
    mlflow.log_params(mlparams)
    mlflow.log_metric('accuracy', acc)
    mlflow.sklearn.log_model(mod, 'saved_models')

    if not hyperopt:
        return mod

    # hyperopt minimizes the loss, so negate accuracy before returning it.
    return {'loss': -acc, 'status': STATUS_OK}
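# Sketch of driving _train() with hyperopt's fmin, assuming the search_space
# list defined above and a local data path; the path and max_evals value are
# illustrative. partial() pins fpath and hyperopt so fmin can call the
# objective with only the sampled params tuple.
from functools import partial
from hyperopt import fmin, tpe, Trials

trials = Trials()
best = fmin(fn=partial(_train, fpath='data/airline_small.parquet',
                       hyperopt=True),
            space=search_space,
            algo=tpe.suggest,
            max_evals=25,
            trials=trials)
print('best hyperparameters:', best)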
def main():
    start_script = time.time()

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str,
                        help='location of data')
    parser.add_argument('--n_estimators', type=int, default=100,
                        help='Number of trees in RF')
    parser.add_argument('--max_depth', type=int, default=16,
                        help='Max depth of each tree')
    parser.add_argument('--n_bins', type=int, default=8,
                        help='Number of bins used in split point calculation')
    parser.add_argument('--max_features', type=float, default=1.0,
                        help='Number of features for best split')
    args = parser.parse_args()
    data_dir = args.data_dir

    print('\n---->>>> cuDF version <<<<----\n', cudf.__version__)
    print('\n---->>>> cuML version <<<<----\n', cuml.__version__)

    t1 = time.time()
    df = cudf.read_parquet(os.path.join(data_dir, 'airline_20m.parquet'))
    # df = cudf.read_orc(os.path.join(data_dir, 'airline_20000000.orc'))
    t2 = time.time()
    print('\n---->>>> cuDF time: {:.2f} <<<<----\n'.format(t2 - t1))

    X = df[df.columns.difference(['ArrDelay', 'ArrDelayBinary'])]
    y = df['ArrDelayBinary'].astype(np.int32)
    del df

    # np.int / np.float / np.str were removed from NumPy; use the builtins.
    n_estimators = args.n_estimators
    run.log('n_estimators', int(args.n_estimators))
    max_depth = args.max_depth
    run.log('max_depth', int(args.max_depth))
    n_bins = args.n_bins
    run.log('n_bins', int(args.n_bins))
    max_features = args.max_features
    run.log('max_features', str(args.max_features))

    print('\n---->>>> Training using GPUs <<<<----\n')

    # ------------------------------------------------------------------
    # cross-validation folds
    # ------------------------------------------------------------------
    n_folds = 5
    accuracy_per_fold = []
    train_time_per_fold = []
    infer_time_per_fold = []
    global_best_test_accuracy = 0

    traintime = time.time()
    for i_train_fold in range(n_folds):
        print(f'\n CV fold {i_train_fold} of {n_folds}\n')

        # split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=i_train_fold, shuffle=True)

        # train model
        cu_rf = cuRF(n_estimators=n_estimators,
                     max_depth=max_depth,
                     n_bins=n_bins,
                     max_features=max_features)
        start1 = time.time()
        cu_rf.fit(X_train, y_train)
        training_time = time.time() - start1
        train_time_per_fold += [round(training_time, 4)]

        # evaluate perf (y_true comes first in accuracy_score)
        start2 = time.time()
        cuml_pred = cu_rf.predict(X_test)
        infer_time = time.time() - start2
        cuml_accuracy = accuracy_score(y_test, cuml_pred) * 100

        accuracy_per_fold += [round(cuml_accuracy, 4)]
        infer_time_per_fold += [round(infer_time, 4)]

        # update best accuracy [assumes maximization of the perf metric]
        if cuml_accuracy > global_best_test_accuracy:
            global_best_test_accuracy = cuml_accuracy

    total_train_inference_time = time.time() - traintime
    run.log('Total training inference time', float(total_train_inference_time))
    run.log('Accuracy', float(global_best_test_accuracy))

    print('\n Accuracy             :', global_best_test_accuracy)
    print('\n accuracy per fold    :', accuracy_per_fold)
    print('\n train-time per fold  :', train_time_per_fold)
    print('\n train-time all folds :', sum(train_time_per_fold))
    print('\n infer-time per fold  :', infer_time_per_fold)
    print('\n infer-time all folds :', sum(infer_time_per_fold))

    end_script = time.time()
    print('Total runtime: {:.2f}'.format(end_script - start_script))
    run.log('Total runtime', float(end_script - start_script))

    print('\n Exiting script')
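# Entry-point sketch: main() logs metrics through an Azure ML `run` object
# that the snippet above leaves undefined. One minimal way to supply it with
# the Azure ML SDK v1, plus the usual script guard; the module-level
# placement is an assumption about the surrounding script.
from azureml.core.run import Run

run = Run.get_context()

if __name__ == '__main__':
    main()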