def main(args):

    # Hyper-parameters
    hyperparams = {
        'n_estimators': args.n_estimators,
        'max_depth': args.max_depth,
        'n_bins': args.n_bins,
        'split_criterion': args.split_criterion,
        'split_algo': args.split_algo,
        'bootstrap': args.bootstrap,
        'bootstrap_features': args.bootstrap_features,
        'max_leaves': args.max_leaves,
        'max_features': args.max_features
    }

    # SageMaker options
    model_dir = args.model_dir
    data_dir = args.data_dir

    # Assign column names and dtypes: one int32 label plus 28 float32 features
    col_names = ['label'] + ["col-{}".format(i) for i in range(2, 30)]
    dtypes_ls = ['int32'] + ['float32' for _ in range(2, 30)]

    data = cudf.read_csv(data_dir + 'HIGGS.csv', names=col_names, dtype=dtypes_ls)

    X_train, X_test, y_train, y_test = train_test_split(data, 'label', train_size=0.70)

    cu_rf = cuRF(**hyperparams)
    cu_rf.fit(X_train, y_train)

    print("test_acc:", accuracy_score(cu_rf.predict(X_test), y_test.to_gpu_array()))
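# --------------------------------------------------------------------------
# A minimal sketch of the entrypoint that would feed main() above; it is not
# part of the original listing. The argument names mirror the hyperparameters
# main() reads, the defaults are illustrative, and the container paths are
# the standard locations SageMaker mounts for training scripts.
# --------------------------------------------------------------------------
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--n_estimators', type=int, default=20)
    parser.add_argument('--max_depth', type=int, default=10)
    parser.add_argument('--n_bins', type=int, default=8)
    parser.add_argument('--split_criterion', type=int, default=0)
    parser.add_argument('--split_algo', type=int, default=0)
    parser.add_argument('--bootstrap', type=bool, default=True)
    parser.add_argument('--bootstrap_features', type=bool, default=False)
    parser.add_argument('--max_leaves', type=int, default=-1)
    parser.add_argument('--max_features', type=float, default=0.2)
    parser.add_argument('--model_dir', type=str, default='/opt/ml/model')
    # trailing slash because main() concatenates the filename directly
    parser.add_argument('--data_dir', type=str, default='/opt/ml/input/data/training/')

    main(parser.parse_args())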
def main(args):

    # Load hyperparameters
    hyperparams = get_hyperparameters()
    hyperparams = {
        'n_estimators': int(hyperparams.get("n_estimators", 20)),
        'max_depth': int(hyperparams.get("max_depth", 10)),
        'n_bins': int(hyperparams.get("n_bins", 8)),
        'split_criterion': int(hyperparams.get("split_criterion", 0)),  # GINI:0, ENTROPY:1
        'split_algo': int(hyperparams.get("split_algo", 0)),            # HIST:0, GLOBAL_QUANTILE:1
        'bootstrap': hyperparams.get("bootstrap", 'true') == 'true',    # sample rows with replacement
        'bootstrap_features': hyperparams.get("bootstrap_features", 'false') == 'true',  # sample features without replacement
        'max_leaves': int(hyperparams.get("max_leaves", -1)),           # -1 means unlimited leaves
        'max_features': float(hyperparams.get("max_features", 0.2))
    }

    # SageMaker options
    model_dir = args.model_dir
    data_dir = args.data_dir

    # Assign column names and dtypes: one int32 label plus 28 float32 features
    col_names = ['label'] + ["col-{}".format(i) for i in range(2, 30)]
    dtypes_ls = ['int32'] + ['float32' for _ in range(2, 30)]

    data = cudf.read_csv(data_dir + 'HIGGS.csv', names=col_names, dtype=dtypes_ls)

    X_train, X_test, y_train, y_test = train_test_split(data, 'label', train_size=0.70)

    cu_rf = cuRF(**hyperparams)
    cu_rf.fit(X_train, y_train)

    print("test_acc:", accuracy_score(cu_rf.predict(X_test), y_test.to_gpu_array()))
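# --------------------------------------------------------------------------
# get_hyperparameters() is referenced but not defined in the listing above.
# A minimal sketch, assuming SageMaker script mode: the service serializes
# every hyperparameter as a string into hyperparameters.json inside the
# container, which is why main() casts each value after loading.
# --------------------------------------------------------------------------
import json

def get_hyperparameters():
    try:
        with open('/opt/ml/input/config/hyperparameters.json') as f:
            return json.load(f)
    except FileNotFoundError:
        return {}  # running outside SageMaker: fall back to the defaults in main()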
def main():

    start_script = time.time()

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, help='location of data')
    parser.add_argument('--n_estimators', type=int, default=100, help='Number of trees in RF')
    parser.add_argument('--max_depth', type=int, default=16, help='Max depth of each tree')
    parser.add_argument('--n_bins', type=int, default=8, help='Number of bins used in split point calculation')
    parser.add_argument('--max_features', type=float, default=1.0, help='Number of features for best split')
    args = parser.parse_args()
    data_dir = args.data_dir

    print('\n---->>>> cuDF version <<<<----\n', cudf.__version__)
    print('\n---->>>> cuML version <<<<----\n', cuml.__version__)

    t1 = time.time()
    df = cudf.read_parquet(os.path.join(data_dir, 'airline_20m.parquet'))
    # df = cudf.read_orc(os.path.join(data_dir, 'airline_20000000.orc'))
    t2 = time.time()
    print('\n---->>>> cuDF time: {:.2f} <<<<----\n'.format(t2 - t1))

    X = df[df.columns.difference(['ArrDelay', 'ArrDelayBinary'])]
    y = df['ArrDelayBinary'].astype(np.int32)
    del df

    n_estimators = args.n_estimators
    run.log('n_estimators', int(args.n_estimators))
    max_depth = args.max_depth
    run.log('max_depth', int(args.max_depth))
    n_bins = args.n_bins
    run.log('n_bins', int(args.n_bins))
    max_features = args.max_features
    run.log('max_features', str(args.max_features))

    print('\n---->>>> Training using GPUs <<<<----\n')

    # ------------------------------------------------------------------
    # cross-validation folds
    # ------------------------------------------------------------------
    accuracy_per_fold = []
    train_time_per_fold = []
    infer_time_per_fold = []
    trained_model = []
    global_best_model = None
    global_best_test_accuracy = 0

    traintime = time.time()

    # optional cross-validation w/ model_params['n_train_folds'] > 1
    for i_train_fold in range(5):
        print(f"\n CV fold {i_train_fold} of 5\n")

        # split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=i_train_fold, shuffle=True)

        # train model
        cu_rf = cuRF(n_estimators=n_estimators, max_depth=max_depth,
                     n_bins=n_bins, max_features=max_features)
        start1 = time.time()
        trained_model = cu_rf.fit(X_train, y_train)
        training_time = time.time() - start1
        train_time_per_fold += [round(training_time, 4)]

        # evaluate perf
        start2 = time.time()
        cuml_pred = cu_rf.predict(X_test)
        infer_time = time.time() - start2
        cuml_accuracy = accuracy_score(cuml_pred, y_test) * 100
        accuracy_per_fold += [round(cuml_accuracy, 4)]
        infer_time_per_fold += [round(infer_time, 4)]

        # update best model [assumes maximization of perf metric]
        if cuml_accuracy > global_best_test_accuracy:
            global_best_test_accuracy = cuml_accuracy

    total_train_inference_time = time.time() - traintime
    run.log('Total training inference time', float(total_train_inference_time))
    run.log('Accuracy', float(global_best_test_accuracy))

    print('\n Accuracy             :', global_best_test_accuracy)
    print('\n accuracy per fold    :', accuracy_per_fold)
    print('\n train-time per fold  :', train_time_per_fold)
    print('\n train-time all folds :', sum(train_time_per_fold))
    print('\n infer-time per fold  :', infer_time_per_fold)
    print('\n infer-time all folds :', sum(infer_time_per_fold))

    end_script = time.time()
    print('Total runtime: {:.2f}'.format(end_script - start_script))
    run.log('Total runtime', float(end_script - start_script))

    print('\n Exiting script')
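# --------------------------------------------------------------------------
# Assumed preamble for the script above, which uses `run` without defining
# it. A minimal sketch for Azure ML, where the run context supplies the
# logger; exact cuML import paths vary by release (recent versions expose
# train_test_split under cuml.model_selection).
# --------------------------------------------------------------------------
import os
import time
import argparse

import numpy as np
import cudf
import cuml
from cuml.ensemble import RandomForestClassifier as cuRF
from cuml.model_selection import train_test_split
from cuml.metrics import accuracy_score
from azureml.core.run import Run

run = Run.get_context()  # offline no-op locally; logs to the workspace when submitted

if __name__ == '__main__':
    main()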
args = parser.parse_args()

# process data
df = pd.read_csv(args.data_path, index_col=None, header=None)  # read it
df = process_data(df)

## TRAIN
# get parameters
label_map = {'normal.': 0, 'anomaly.': 1}
params = {'random_state': RAND_STATE,
          'n_estimators': 2500,
          'max_depth': 200,
          'n_bins': 20,
          'max_samples': 1.0,
          'max_features': 0.4,
          'n_streams': 1}
mlflow.log_params(params)
if DEBUG:
    print(f'Random Forest with {params}')

# train model
model = cuRF(**params)

# pseudo-cross-validation
subsample_perc = 0.75  # use 75% of the data for each training subset
f1_train_norms, f1_train_anoms, f1_test_norms, f1_test_anoms, run_times = [], [], [], [], []
for random_state in range(1000):  # number of cross-validations
    np.random.seed(random_state)
    valid_idxs = np.random.choice(df.index, size=round(subsample_perc * df.shape[0]), replace=False)

    # split data
    train, train_norm, train_anom, test_norm, test_anom = split_data(df.loc[valid_idxs])
    X_train, y_train = train  # unpack training data

    # train model
    start_time = time.time()  # mark start
    model.fit(X_train, np.vectorize(label_map.get)(y_train))
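# --------------------------------------------------------------------------
# process_data() and split_data() are called above but not shown. One
# plausible shape, inferred purely from the call sites; the column layout,
# RAND_STATE value, and the per-class splitting are assumptions, not the
# original helpers.
# --------------------------------------------------------------------------
import pandas as pd

RAND_STATE = 42  # assumed; defined elsewhere in the original script

def process_data(df):
    # assume the last column carries the string label ('normal.' / 'anomaly.')
    return df.dropna().rename(columns={df.columns[-1]: 'label'})

def split_data(df, test_perc=0.2):
    # hold out a test set, then expose per-class frames for separate scoring
    test = df.sample(frac=test_perc, random_state=RAND_STATE)
    train = df.drop(test.index)
    X_train = train.drop(columns='label')
    y_train = train['label']
    return ((X_train, y_train),
            train[train['label'] == 'normal.'],
            train[train['label'] == 'anomaly.'],
            test[test['label'] == 'normal.'],
            test[test['label'] == 'anomaly.'])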
X_train, y_train = train  # unpack training data

# score data
label_map = {'normal.': 0, 'anomaly.': 1}
score_df = pd.DataFrame(columns=['n_estimators', 'max_depth', 'n_bins', 'max_samples', 'max_features',
                                 'run_time', 'f1_train_norm', 'f1_train_anom', 'f1_test_norm', 'f1_test_anom'])

for n_estimators in range(25, 251, 25):
    for max_depth in range(2, 21, 2):
        for n_bins in range(2, 21, 2):
            for max_samples in range(2, 11, 2):
                max_samples /= 10  # convert parameter to a fraction
                for max_features in range(2, 11, 2):
                    max_features /= 10  # convert parameter to a fraction

                    # train model
                    model = cuRF(random_state=RAND_STATE,
                                 n_estimators=n_estimators,
                                 max_depth=max_depth,
                                 n_bins=n_bins,
                                 max_samples=max_samples,
                                 max_features=max_features,
                                 n_streams=1)
                    start_time = time.time()  # mark start
                    model.fit(X_train, np.vectorize(label_map.get)(y_train))

                    # score model
                    f1_train_norm = compute_f1(model, train_norm, 0)
                    f1_train_anom = compute_f1(model, train_anom, 1)
                    f1_test_norm = compute_f1(model, test_norm, 0)
                    f1_test_anom = compute_f1(model, test_anom, 1)

                    # log time
                    end_time = time.time()  # mark end
                    run_time = end_time - start_time  # runtime covers fitting and scoring

                    # save metrics
                    idx = score_df.shape[0]
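# --------------------------------------------------------------------------
# compute_f1() is called above but not shown. A hypothetical helper matching
# the call signature compute_f1(model, frame, label): each frame holds rows
# of a single class, so the true labels are constant. The feature/label
# column split mirrors the split_data() sketch above and is an assumption.
# --------------------------------------------------------------------------
import numpy as np
from sklearn.metrics import f1_score

def compute_f1(model, df, label):
    X = df.drop(columns='label')           # features only
    y_true = np.full(len(df), label)       # every row in df belongs to `label`
    y_pred = np.asarray(model.predict(X))  # bring predictions back to host
    return f1_score(y_true, y_pred, pos_label=label, average='binary', zero_division=0)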