# Wrap each (label, features) pair in a lazy DMatrix constructor, then
# persist so the matrices are materialized on the workers before training.
make_dmatrix = dask.delayed(xgb.DMatrix)
gpu_dfs = [make_dmatrix(pair[1], pair[0]) for pair in gpu_dfs]
gpu_dfs = [delayed_dmat.persist() for delayed_dmat in gpu_dfs]
gc.collect()
wait(gpu_dfs)
end = time.time()
print("****Data Convertion done. Time used: ", end - start)


# #### Train the Gradient Boosted Decision Tree with a single call to
# ```python
# dask_xgboost.train(client, params, data, labels,
#                    num_boost_round=dxgb_gpu_params['nround'])
# ```

# In[ ]:


start = time.time()
print("starting training----")
# %%time
# Labels are already embedded in the DMatrix objects, so none are passed here.
labels = None
nround = dxgb_gpu_params['nround']
bst = dxgb_gpu.train(client, dxgb_gpu_params, gpu_dfs, labels,
                     num_boost_round=nround)
end = time.time()
print("****Training done. Time used: ", end - start)
def process(self, inputs):
    """Build per-worker XGBoost DMatrix objects from distributed dataframe
    futures and train a model with dask-xgboost.

    Parameters
    ----------
    inputs : sequence
        ``inputs[0]`` is a list of futures, one per worker, each resolving to
        a ``(mortgage_feat_df, delinq_df)`` tuple (features, labels).

    Returns
    -------
    The booster returned by ``dxgb_gpu.train``.
    """
    import gc  # python standard lib garbage collector
    import xgboost as xgb
    from dask.delayed import delayed
    from dask.distributed import (wait, get_worker)
    import dask_xgboost as dxgb_gpu

    logmgr = MortgagePluginsLoggerMgr()
    logger = logmgr.get_logger()

    filter_dask_logger = self.conf.get('filter_dask_logger')

    client = self.conf['client']
    # Swap in custom loggers on every worker; restored further below.
    client.run(init_workers_logger)

    dxgb_gpu_params = self.conf['dxgb_gpu_params']
    delete_dataframes = self.conf.get('delete_dataframes')
    create_dmatrix_serially = self.conf.get('create_dmatrix_serially')

    mortgage_feat_df_delinq_df_pandas_futures = inputs[0]

    def make_xgb_dmatrix(mortgage_feat_df_delinq_df_pandas_tuple,
                         delete_dataframes=None):
        # Runs on a dask worker: converts one (features, labels) tuple into
        # an xgb.DMatrix, optionally freeing the source frames to reduce
        # peak memory on the worker.
        worker = get_worker()
        logname = 'make_xgb_dmatrix'
        logmgr = MortgagePluginsLoggerMgr(worker, logname)
        logger = logmgr.get_logger()
        logger.info('CREATING DMATRIX ON WORKER {}'.format(worker.name))
        (mortgage_feat_df, delinq_df) = \
            mortgage_feat_df_delinq_df_pandas_tuple
        dmat = xgb.DMatrix(mortgage_feat_df, delinq_df)
        if delete_dataframes:
            # Drop local references so the frames can be collected now that
            # their data lives in the DMatrix.
            del (mortgage_feat_df)
            del (delinq_df)
            # del(mortgage_feat_df_delinq_df_pandas_tuple)
            gc.collect()
        logmgr.cleanup()
        return dmat

    dmatrix_delayed_list = []
    nworkers = len(mortgage_feat_df_delinq_df_pandas_futures)
    if create_dmatrix_serially:
        logger.info(
            'CREATING DMATRIX SERIALLY ACROSS {} WORKERS'.format(nworkers))
    else:
        logger.info(
            'CREATING DMATRIX IN PARALLEL ACROSS {} WORKERS'.format(
                nworkers))
    for ifut in mortgage_feat_df_delinq_df_pandas_futures:
        dmat_delayed = delayed(make_xgb_dmatrix)(ifut, delete_dataframes)
        dmat_delayed_persist = dmat_delayed.persist()
        if create_dmatrix_serially:
            # Waiting inside the loop forces one DMatrix to finish before
            # the next starts (serial creation limits peak memory).
            # TODO: For multinode efficiency need to poll the futures
            # such that only doing serial dmatrix creation on the
            # same node, but across nodes should be in parallel.
            wait(dmat_delayed_persist)
        dmatrix_delayed_list.append(dmat_delayed_persist)

    wait(dmatrix_delayed_list)

    if filter_dask_logger:
        # Pull the hijacked per-worker logs back to the client-side logger.
        wlogs = client.get_worker_logs()
        print_distributed_dask_hijacked_logs(
            wlogs, logger, ('make_xgb_dmatrix', ))

    client.run(restore_workers_logger)

    logger.info('JUST AFTER DMATRIX')
    print_ram_usage()

    logger.info('RUNNING XGBOOST TRAINING USING DASK-XGBOOST')
    # Labels are already inside each DMatrix, hence None here.
    labels = None
    bst = dxgb_gpu.train(client, dxgb_gpu_params, dmatrix_delayed_list,
                         labels, num_boost_round=dxgb_gpu_params['nround'])
    logmgr.cleanup()
    return bst
# In[5]:


# Hold out a third of the data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)


# In[6]:


params = {
    'objective': 'reg:squarederror',
    'n_estimators': 100000,
    'max_depth': 4,
    'eta': 0.01,
    'subsample': 0.5,
    'min_child_weight': 0.5
}

bst = dask_xgboost.train(client, params, X_train, y_train,
                         num_boost_round=100)


# In[7]:


y_hat = dask_xgboost.predict(client, bst, X_test).persist()
y_hat


# In[8]:


# FIX: compute each dask collection once instead of three times — the
# original called y_test.compute() / y_hat.compute() for every metric,
# re-executing the task graphs on each call.
y_test_local = y_test.compute()
y_hat_local = y_hat.compute()
r = r2_score(y_test_local, y_hat_local)
mae = mean_absolute_error(y_test_local, y_hat_local)
mse = mean_squared_error(y_test_local, y_hat_local)
print("R^2:", r)
print("MAE:", mae)
print("MSE:", mse)
for col in vars_cat: dx_all[col] = preprocessing.LabelEncoder().fit_transform(dx_all[col]) X_all = dx_all[vars_cat+vars_num].to_dask_array(lengths=True) y_all = da.where((dx_all["dep_delayed_15min"]=="Y").to_dask_array(lengths=True),1,0) X_train = X_all[0:d_train.shape[0],] y_train = y_all[0:d_train.shape[0]] X_test = X_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0]),] y_test = y_all[d_train.shape[0]:(d_train.shape[0]+d_test.shape[0])] X_train.persist() y_train.persist() client.has_what() param = {'objective':'binary:logistic', 'tree_method':'hist', 'max_depth':10, 'eta':0.1} %time md = dxgb.train(client, param, X_train, y_train, num_boost_round = 100) y_pred = dxgb.predict(client, md, X_test) y_pred_loc = y_pred.compute() y_test_loc = y_test.compute() print(metrics.roc_auc_score(y_test_loc, y_pred_loc)) ## m5.4xlarge 16c (8+8HT) ## Wall time: 34.3 s ## 0.7928378346764724
def main():
    """Benchmark GridSearchCV-tuned GradientBoostingRegressor against
    dask-xgboost on the flights delay data, writing timings and MSEs to
    ``results.txt``.

    Side effects: starts a local dask ``Client``, prints progress, and
    writes ``results.txt`` in the working directory.
    """
    print("Setting up data directory")
    print("-------------------------")
    # flights(args.url)
    columns = ['Year', 'Month', 'DayOfWeek', 'Distance', 'DepDelay', 'Origin']
    target = 'DepDelay'
    log = ''
    results = {}
    df = get_df(columns).dropna()
    is_dask = True
    client = None
    if is_dask:
        client = Client(n_workers=20, threads_per_worker=20,
                        memory_limit='1GB')

    model = GradientBoostingRegressor(random_state=18)
    params = {'max_depth': [2, 3], 'n_estimators': [1, 2, 3]}
    X_train, X_test, y_train, y_test = get_data(df.copy(), target,
                                                is_dask=False, chunksize=200)
    # BUG FIX: the original passed StratifiedKFold here, which requires
    # discrete class labels and raises ValueError on a continuous
    # regression target like DepDelay. An integer cv makes GridSearchCV
    # choose the appropriate KFold splitter for a regressor.
    clf_cv = GridSearchCV(model, param_grid=params, cv=10,
                          scoring='neg_mean_squared_error')
    with joblib.parallel_backend("dask" if is_dask else 'loky'):
        start = time.time()
        clf_cv.fit(X_train, y_train)
        end = time.time()

    y_predict_train = clf_cv.best_estimator_.predict(X_train)
    y_predict_test = clf_cv.best_estimator_.predict(X_test)
    train_error = mean_squared_error(
        y_train,
        y_predict_train,
    )
    test_error = mean_squared_error(
        y_test,
        y_predict_test,
    )
    results['Scikit XGBoost'] = {
        'train_error': train_error,
        'test_error': test_error,
        'time': end - start
    }
    log += 'Scikit XGBoost train_error: %.2f, test_error: %.2f, took: %.2f\n' % (
        train_error, test_error, end - start)

    is_dask = True
    X_train, X_test, y_train, y_test = get_data(df.copy(), target,
                                                is_dask=is_dask,
                                                chunksize=200)
    params = {
        'objective': 'reg:squarederror',
        'max_depth': 3,
        'eta': 0.01,
        'subsample': 0.5,
        'min_child_weight': 0.2
    }
    start = time.time()
    bst = dask_xgboost.train(client, params, X_train, y_train,
                             num_boost_round=10)
    end = time.time()

    # Persist the lazy predictions, then materialize everything at once.
    y_train_pred = dask_xgboost.predict(client, bst, X_train).persist()
    y_test_pred = dask_xgboost.predict(client, bst, X_test).persist()
    y_train, y_train_pred = dask.compute(y_train, y_train_pred)
    y_test, y_test_pred = dask.compute(y_test, y_test_pred)
    train_error = mean_squared_error(y_train, y_train_pred)
    test_error = mean_squared_error(y_test, y_test_pred)
    log += 'Dask XGBoost train_error: %.2f, test_error: %.2f, took: %.2f' % (
        train_error, test_error, end - start)
    results['Dask XGBoost'] = {
        'train_error': train_error,
        'test_error': test_error,
        'time': end - start
    }
    with open('results.txt', 'w') as outfile:
        json.dump(results, outfile)
    print('Finished!')
def task(df, ram_to_use, is_dask):
    """Run the benchmark models on *df*, saving timing/error records.

    With ``is_dask`` true, a local dask Client is started (memory capped at
    ``ram_to_use`` GB), only the Ridge model goes through the grid search,
    and an additional dask-xgboost model is trained; otherwise both sklearn
    models run on the local backend.
    """
    client = None
    if is_dask:
        client = Client(threads_per_worker=10, n_workers=10,
                        memory_limit=''.join([str(ram_to_use), 'GB']))

    # Dask runs only the first model; the local path runs both.
    n_models = 1 if is_dask else 2
    models = [
        Ridge(random_state=42),
        GradientBoostingRegressor(random_state=42),
    ][:n_models]
    params = [
        {
            "alpha": [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
        },
        {
            'max_depth': [2, 3, 4, 6],
            'n_estimators': [2, 3, 4, 5],
        },
    ][:n_models]

    if is_dask:
        X_train, X_test, y_train, y_test = get_dask_data(df.copy(),
                                                         'DepDelay')
    else:
        X_train, X_test, y_train, y_test = get_normal_data(df.copy(),
                                                           'DepDelay')

    for model, param in zip(models, params):
        started = time.time()
        results, _, _ = run_single_model(model, param, X_train, X_test,
                                         y_train, y_test, is_dask=is_dask)
        model_name = type(model).__name__
        train_error, test_error = results[model_name]['metric'][
            'mean_squared_error']
        elapsed = round(time.time() - started, 3)
        record = {
            'model_name': model_name + ('_dask' if is_dask else ''),
            'train_error(MSE)': train_error,
            'test_error(MSE)': test_error,
            'time': elapsed,
        }
        save_to_file(file_to_save_path, record)
        print(model_name, ':\t took ->', elapsed,
              '\t with error (train, test)', (train_error, test_error))

    if is_dask:
        params = {
            'objective': 'reg:squarederror',
            'max_depth': 4,
            'eta': 0.01,
            'subsample': 0.5,
            'min_child_weight': 0.5
        }
        started = time.time()
        bst = dask_xgboost.train(client, params, X_train, y_train,
                                 num_boost_round=10)
        elapsed = round(time.time() - started, 3)
        # Persist the lazy predictions, then pull everything to the client
        # in a single compute per split.
        y_train_hat = dask_xgboost.predict(client, bst, X_train).persist()
        y_test_hat = dask_xgboost.predict(client, bst, X_test).persist()
        y_train, y_train_hat = dask.compute(y_train, y_train_hat)
        y_test, y_test_hat = dask.compute(y_test, y_test_hat)
        train_error = mean_squared_error(y_train, y_train_hat)
        test_error = mean_squared_error(y_test, y_test_hat)
        record = {
            'model_name': 'Dask XGBoost' + '_dask',
            'train_error(MSE)': train_error,
            'test_error(MSE)': test_error,
            'time': elapsed,
        }
        save_to_file(file_to_save_path, record)
        print('Dask XGBoost', ':\t took ->', elapsed,
              '\t with error (train, test)', (train_error, test_error))