def main(client):
    """Train a distributed regressor on synthetic data with custom early stopping.

    A regression problem is generated with ``make_regression``, split into
    train and test parts with ``train_test_split``, and each part is wrapped
    in a ``DaskDMatrix``.  ``xgb.dask.train`` is then run for up to 1000
    boosting rounds while a ``CustomEarlyStopping`` callback watches the
    ``rmse`` metric of the evaluation set named ``"test"``.

    :param client: forwarded as the first argument to ``DaskDMatrix`` and
        ``xgb.dask.train``.
    """
    n_rows = 100000
    n_cols = 100
    X, y = make_regression(
        n_samples=n_rows, n_features=n_cols, chunks=200, random_state=0
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    dtrain = DaskDMatrix(client, X_train, y_train)
    dtest = DaskDMatrix(client, X_test, y_test)

    booster_params = {
        "verbosity": 1,
        "tree_method": "hist",
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "max_depth": 6,
        "learning_rate": 1.0,
    }
    # Both sets are evaluated every round; the callback below only looks at
    # the one registered under the name "test".
    watchlist = [(dtrain, "train"), (dtest, "test")]
    early_stopping = CustomEarlyStopping(
        validation_set="test", target_metric="rmse", maximize=False, seed=0
    )

    output = xgb.dask.train(
        client,
        booster_params,
        dtrain,
        num_boost_round=1000,
        evals=watchlist,
        callbacks=[early_stopping],
    )
def using_dask_matrix(client: Client, X, y):
    """Train with ``gpu_hist`` through a ``DaskDMatrix`` and predict on it.

    :param client: dask client forwarded to ``DaskDMatrix``,
        ``xgb.dask.train`` and ``xgb.dask.predict``.
    :param X: feature data handed to ``DaskDMatrix``.
    :param y: label data handed to ``DaskDMatrix``.
    :return: the value returned by ``xgb.dask.predict`` for the training
        matrix.
    """
    # DaskDMatrix behaves like a regular DMatrix; it is a proxy for the
    # local DMatrix objects scattered around the workers.
    dtrain = DaskDMatrix(client, X, y)

    gpu_params = {
        'verbosity': 2,
        # Golden line for GPU training
        'tree_method': 'gpu_hist',
    }

    # The distributed train() from xgboost.dask is used instead of
    # xgboost.train(); its result is a dictionary holding the trained
    # booster and the evaluation history.
    output = xgb.dask.train(
        client,
        gpu_params,
        dtrain,
        num_boost_round=4,
        evals=[(dtrain, 'train')],
    )
    history = output['history']

    # `output` itself could be passed to `predict` as well.
    prediction = xgb.dask.predict(client, output['booster'], dtrain)
    print('Evaluation history:', history)
    return prediction
def test_from_dask_dataframe():
    """Check training and prediction when the inputs are dask dataframes.

    Runs on its own five-worker ``LocalCluster``.
    """
    with LocalCluster(n_workers=5) as cluster, Client(cluster) as client:
        features, labels = generate_array()
        features = dd.from_dask_array(features)
        labels = dd.from_dask_array(labels)

        dtrain = DaskDMatrix(client, features, labels)
        trained = xgb.dask.train(client, {}, dtrain, num_boost_round=2)
        booster = trained['booster']

        prediction = xgb.dask.predict(client, model=booster, data=dtrain)
        assert prediction.ndim == 1
        assert isinstance(prediction, da.Array)
        assert prediction.shape[0] == kRows

        # evals_result is not supported in dask interface.
        with pytest.raises(ValueError):
            xgb.dask.train(
                client, {}, dtrain, num_boost_round=2, evals_result={}
            )

        # Force the lazy prediction to actually be computed.
        prediction = prediction.compute()
def main(client):
    """Train a CPU ``hist`` model on random dask arrays and predict on them.

    :param client: dask client forwarded to ``DaskDMatrix``,
        ``xgb.dask.train`` and ``xgb.dask.predict``.
    :return: the value returned by ``xgb.dask.predict`` for the training
        matrix.
    """
    # Random data, for demonstration purposes only.
    n_rows = 100000
    n_cols = 100
    X = da.random.random(size=(n_rows, n_cols), chunks=100)
    y = da.random.random(size=(n_rows, ), chunks=100)

    # DaskDMatrix behaves like a regular DMatrix; it is a proxy for the
    # local DMatrix objects scattered around the workers.
    dtrain = DaskDMatrix(client, X, y)

    cpu_params = {
        'verbosity': 1,
        'nthread': 1,
        'tree_method': 'hist',
    }

    # xgboost.dask.train returns a dictionary with the trained booster and
    # the evaluation history collected from the evaluation metrics.
    output = xgb.dask.train(
        client,
        cpu_params,
        dtrain,
        num_boost_round=4,
        evals=[(dtrain, 'train')],
    )
    history = output['history']

    # `output` itself could be passed to `predict` as well.
    prediction = xgb.dask.predict(client, output['booster'], dtrain)
    print('Evaluation history:', history)
    return prediction
def main(client):
    """Train a ``gpu_hist`` model on random dask arrays and predict on them.

    :param client: dask client forwarded to ``DaskDMatrix``,
        ``xgb.dask.train`` and ``xgb.dask.predict``.
    :return: the value returned by ``xgb.dask.predict`` for the training
        matrix.
    """
    n_cols = 100
    n_rows = 100000
    partition_size = 1000
    X = da.random.random((n_rows, n_cols), partition_size)
    y = da.random.random(n_rows, partition_size)

    # DaskDMatrix behaves like a regular DMatrix; it is a proxy for the
    # local DMatrix objects scattered around the workers.
    dtrain = DaskDMatrix(client, X, y)

    gpu_params = {
        'verbosity': 2,
        'nthread': 1,
        'tree_method': 'gpu_hist',
    }

    # xgboost.dask.train returns a dictionary with the trained booster and
    # the evaluation history collected from the evaluation metrics.
    output = xgb.dask.train(
        client,
        gpu_params,
        dtrain,
        num_boost_round=4,
        evals=[(dtrain, 'train')],
    )
    history = output['history']

    prediction = xgb.dask.predict(client, output['booster'], dtrain)
    print('Evaluation history:', history)
    return prediction
def test_predict():
    """Check the shapes of normal, margin and contribution predictions.

    Runs on its own ``LocalCluster`` with ``kWorkers`` workers.
    """
    with LocalCluster(n_workers=kWorkers) as cluster, Client(cluster) as client:
        X, y = generate_array()
        dtrain = DaskDMatrix(client, X, y)
        trained = xgb.dask.train(client, {}, dtrain, num_boost_round=2)
        booster = trained['booster']

        # Plain prediction: one value per row.
        pred = xgb.dask.predict(client, model=booster, data=dtrain)
        assert pred.ndim == 1
        assert pred.shape[0] == kRows

        # Margin output: also one value per row.
        margin = xgb.dask.predict(
            client, model=booster, data=dtrain, output_margin=True
        )
        assert margin.ndim == 1
        assert margin.shape[0] == kRows

        # Contributions: a 2-D result with one column per feature plus one
        # extra column (presumably the bias term).
        shap = xgb.dask.predict(
            client, model=booster, data=dtrain, pred_contribs=True
        )
        assert shap.ndim == 2
        assert shap.shape[0] == kRows
        assert shap.shape[1] == kCols + 1
def test_from_dask_dataframe() -> None:
    """Check training and the prediction entry points on dask dataframes.

    Compares predictions made from a ``DaskDMatrix``, from the dataframe
    itself, and from ``inplace_predict``.  Runs on its own ``LocalCluster``
    with ``kWorkers`` workers.
    """
    with LocalCluster(n_workers=kWorkers) as cluster, Client(cluster) as client:
        features, labels, _ = generate_array()
        features = dd.from_dask_array(features)
        labels = dd.from_dask_array(labels)

        dtrain = DaskDMatrix(client, features, labels)
        trained = xgb.dask.train(client, {}, dtrain, num_boost_round=2)
        booster = trained['booster']

        prediction = xgb.dask.predict(client, model=booster, data=dtrain)
        assert prediction.ndim == 1
        assert isinstance(prediction, da.Array)
        assert prediction.shape[0] == kRows

        # evals_result is not supported in dask interface.
        with pytest.raises(TypeError):
            xgb.dask.train(  # type:ignore
                client, {}, dtrain, num_boost_round=2, evals_result={}
            )

        # Force the DMatrix-based prediction to be computed.
        from_dmatrix = prediction.compute()

        # Predicting straight from the dataframe must yield a dask Series
        # holding the same values.
        prediction = xgb.dask.predict(client, model=booster, data=features)
        from_df = prediction.compute()
        assert isinstance(prediction, dd.Series)
        assert np.all(prediction.compute().values == from_dmatrix)
        assert np.all(from_dmatrix == from_df.to_numpy())

        # In-place prediction on the dataframe must agree as well.
        series_predictions = xgb.dask.inplace_predict(client, booster, features)
        assert isinstance(series_predictions, dd.Series)
        np.testing.assert_allclose(
            series_predictions.compute().values, from_dmatrix
        )
def test_from_dask_array():
    """Check distributed training on dask arrays against single-node prediction.

    Runs on its own ``LocalCluster`` with ``kWorkers`` workers and five
    threads per worker.
    """
    with LocalCluster(n_workers=kWorkers, threads_per_worker=5) as cluster, \
            Client(cluster) as client:
        X, y = generate_array()
        dtrain = DaskDMatrix(client, X, y)

        # result is {'booster': Booster, 'history': {...}}
        result = xgb.dask.train(client, {}, dtrain)

        # The whole training result is accepted by `predict`.
        prediction = xgb.dask.predict(client, result, dtrain)
        assert prediction.shape[0] == kRows
        assert isinstance(prediction, da.Array)

        # Force the lazy prediction to be computed.
        prediction = prediction.compute()

        booster = result['booster']
        local_matrix = xgb.DMatrix(X.compute())
        single_node_predt = booster.predict(local_matrix)
        np.testing.assert_allclose(prediction, single_node_predt)

        # The saved configuration is expected to report the same thread
        # count as the threads_per_worker=5 used for the cluster above.
        config = json.loads(booster.save_config())
        nthread = config['learner']['generic_param']['nthread']
        assert int(nthread) == 5

        # Predicting directly from the dask array gives a dask array with
        # the single-node values.
        from_arr = xgb.dask.predict(client, model=booster, data=X)
        assert isinstance(from_arr, da.Array)
        assert np.all(single_node_predt == from_arr.compute())
def test_from_dask_array(client):
    """Check that predicting from a training result yields a dask array.

    :param client: dask client forwarded to ``DaskDMatrix``,
        ``xgb.dask.train`` and ``xgb.dask.predict``.
    """
    features, labels = generate_array()
    dtrain = DaskDMatrix(client, features, labels)

    # result is {'booster': Booster, 'history': {...}}
    result = xgb.dask.train(client, {}, dtrain)

    prediction = xgb.dask.predict(client, result, dtrain)
    assert isinstance(prediction, da.Array)
def load_higgs_for_dask(client, X_t, X_v, y_t, y_v, npartitions=8):
    """Wrap pandas training/validation data in ``DaskDMatrix`` objects.

    Each pandas input is first converted to a dask collection with
    ``dask.dataframe.from_pandas``; the feature/label pairs are then
    combined into one ``DaskDMatrix`` per split.

    :param client: dask client forwarded to ``DaskDMatrix`` (the original
        documentation described it as the GPU device).
    :param X_t: training set features (pandas object).
    :param X_v: validation set features (pandas object).
    :param y_t: training set labels (pandas object).
    :param y_v: validation set labels (pandas object).
    :param npartitions: number of partitions requested from
        ``from_pandas`` for every input.  Defaults to 8, the value that
        used to be hard-coded.
    :return: tuple ``(ddtrain, ddtest)`` of ``DaskDMatrix`` objects for the
        training and validation data respectively.
    """
    import dask.dataframe as dd

    def to_dask(frame):
        # 1. Create a dask collection from the pandas object.
        return dd.from_pandas(frame, npartitions=npartitions)

    # 2. Create the DaskDMatrix objects from the dask collections.
    ddtrain = DaskDMatrix(client, to_dask(X_t), to_dask(y_t))
    ddtest = DaskDMatrix(client, to_dask(X_v), to_dask(y_v))
    return ddtrain, ddtest
def test_from_dask_array(client):
    """Check type and length of predictions made from a training result.

    :param client: dask client forwarded to ``DaskDMatrix``,
        ``xgb.dask.train`` and ``xgb.dask.predict``.
    """
    features, labels = generate_array()
    dtrain = DaskDMatrix(client, features, labels)

    # result is {'booster': Booster, 'history': {...}}
    result = xgb.dask.train(client, {}, dtrain)

    prediction = xgb.dask.predict(client, result, dtrain)
    assert prediction.shape[0] == kRows
    assert isinstance(prediction, da.Array)

    # Force the lazy prediction to be computed.
    prediction = prediction.compute()
def main(client):
    """Train an AFT survival model on the veterans lung cancer CSV.

    :param client: dask client forwarded to ``DaskDMatrix``,
        ``xgb.dask.train`` and ``xgb.dask.predict``.
    :return: the value returned by ``xgb.dask.predict`` for the training
        matrix.
    """
    # Example survival data, loaded from CSV into a dask data frame:
    # The Veterans' Administration Lung Cancer Trial
    # The Statistical Analysis of Failure Time Data by Kalbfleisch J. and Prentice R (1980)
    CURRENT_DIR = os.path.dirname(__file__)
    csv_path = os.path.join(
        CURRENT_DIR, os.pardir, 'data', 'veterans_lung_cancer.csv'
    )
    df = dd.read_csv(csv_path)

    # DaskDMatrix behaves like a regular DMatrix; it is a proxy for the
    # local DMatrix objects scattered around the workers.
    # For AFT survival the lower and upper label bounds are extracted and
    # passed to DaskDMatrix as separate arguments.
    bound_columns = ['Survival_label_lower_bound', 'Survival_label_upper_bound']
    y_lower_bound = df['Survival_label_lower_bound']
    y_upper_bound = df['Survival_label_upper_bound']
    X = df.drop(bound_columns, axis=1)
    dtrain = DaskDMatrix(
        client,
        X,
        label_lower_bound=y_lower_bound,
        label_upper_bound=y_upper_bound,
    )

    params = {
        'verbosity': 1,
        'objective': 'survival:aft',
        'eval_metric': 'aft-nloglik',
        'learning_rate': 0.05,
        'aft_loss_distribution_scale': 1.20,
        'aft_loss_distribution': 'normal',
        'max_depth': 6,
        'lambda': 0.01,
        'alpha': 0.02,
    }

    # xgboost.dask.train returns a dictionary with the trained booster and
    # the evaluation history collected from the evaluation metrics.
    output = xgb.dask.train(
        client,
        params,
        dtrain,
        num_boost_round=100,
        evals=[(dtrain, 'train')],
    )
    history = output['history']

    # `output` itself could be passed to `predict` as well.
    prediction = xgb.dask.predict(client, output['booster'], dtrain)
    print('Evaluation history: ', history)

    # Uncomment the following line to save the model to the disk
    # output['booster'].save_model('survival_model.json')

    return prediction
def train(seed, epochs, n_gpus, dataset):
    """Train a ``gpu_hist`` model on a named sklearn dataset and time it.

    A ``LocalCUDACluster`` with ``n_gpus`` workers is started for the
    duration of the call.  The chosen dataset is fetched, chunked into dask
    arrays, split 0.75/0.25 into train and test parts and trained with
    ``xgb.dask.train``, evaluating on the test part.  The training result
    and the elapsed wall-clock time are printed; nothing is returned.

    :param seed: forwarded to ``random_seed`` together with the parameter
        dict (``random_seed`` is defined elsewhere; presumably it stores
        the seed in the parameters -- verify against its definition).
    :param epochs: number of boosting rounds.
    :param n_gpus: number of workers for the ``LocalCUDACluster``.
    :param dataset: dataset name, either ``'boston'`` or ``'covertype'``.
    :raises ValueError: if ``dataset`` is not one of the supported names.
        Previously such a value only failed later, with an unbound
        ``param`` variable, after the cluster had already been started.
    """
    supported = ('boston', 'covertype')
    if dataset not in supported:
        raise ValueError(
            f'Unsupported dataset {dataset!r}; expected one of {supported}'
        )

    with LocalCUDACluster(n_workers=n_gpus, threads_per_worker=4) as cluster:
        with Client(cluster) as client:
            # Fetch dataset using sklearn
            if dataset == 'boston':
                data = load_boston()
                param = {}
            else:
                data = fetch_covtype()
                param = {
                    'objective': 'multi:softmax',
                    'num_class': 8,
                    # 'single_precision_histogram': True
                }

            param['verbosity'] = 2
            param['tree_method'] = 'gpu_hist'

            # Rechunking is required for the covertype dataset
            X = da.from_array(data.data, chunks=1000)
            y = da.from_array(data.target, chunks=1000)

            # Create 0.75/0.25 train/test split
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.25, train_size=0.75, random_state=0)

            dtrain = DaskDMatrix(client, X_train, y_train)
            dtest = DaskDMatrix(client, X_test, y_test)

            random_seed(seed, param)

            start_time = time.time()
            model_training_results = xgb.dask.train(
                client,
                param,
                dtrain,
                num_boost_round=epochs,
                evals=[(dtest, 'test')],
            )

            print(model_training_results)
            print(f'GPU Run Time: {time.time() - start_time} seconds')
def main(client, train_dir, model_file, fs, do_wait=False):
    """Train a regression model on header-less CSV data and upload it.

    The CSV files are read with ``dd.read_csv`` using the column names
    ``label`` and ``feature-01`` .. ``feature-28``.  A ``DaskDMatrix`` is
    built from them, a ``hist`` model is trained for 100 rounds, and the
    booster is saved to a temporary local file which is then handed to
    ``fs.put``.

    :param client: dask client forwarded to ``DaskDMatrix`` and
        ``xgb.dask.train``.
    :param train_dir: path specification passed to ``dd.read_csv``.
    :param model_file: destination passed as second argument to ``fs.put``.
    :param fs: object providing ``put(local_path, remote_path)``.
    :param do_wait: when true, the data frame and the feature frame are
        persisted and waited for before the timing starts.
    """
    import os
    import tempfile

    colnames = ['label'] + [f'feature-{i:02d}' for i in range(1, 29)]
    df = dd.read_csv(train_dir, header=None, names=colnames)
    X = df[df.columns.difference(['label'])]
    y = df['label']

    print("[INFO]: ------ CSV files are read")

    if do_wait:
        # Materialise the data up front so loading time is not attributed
        # to the steps timed below.
        df = df.persist()
        X = X.persist()
        wait(df)
        wait(X)
        print("[INFO]: ------ Long waited but the data is ready now")

    start_time = time.time()
    dtrain = DaskDMatrix(client, X, y)
    # NOTE(review): the message says QuantileDMatrix although a DaskDMatrix
    # is built here; the text is left unchanged in case log consumers
    # depend on it.
    print("[INFO]: ------ QuantileDMatrix is formed in {} seconds ---".format(
        (time.time() - start_time)))

    # Drop the local references to the source collections.
    del df, X, y

    booster_params = {
        'verbosity': 2,
        'learning_rate': 0.1,
        'max_depth': 8,
        'objective': 'reg:squarederror',
        'subsample': 0.5,
        'gamma': 0.9,
        'tree_method': 'hist',
    }

    start_time = time.time()
    # verbose_eval is an argument of train(), not a booster parameter, so
    # it is passed as a keyword instead of inside the parameter dict.
    output = xgb.dask.train(
        client,
        booster_params,
        dtrain,
        num_boost_round=100,
        evals=[(dtrain, 'train')],
        verbose_eval=True,
    )
    print("[INFO]: ------ Training is completed in {} seconds ---".format(
        (time.time() - start_time)))

    history = output['history']
    print('[INFO]: ------ Training evaluation history:', history)

    # Stage the model in a private temporary directory: a fixed path under
    # /tmp would collide between concurrent runs and would be left behind.
    with tempfile.TemporaryDirectory() as staging_dir:
        local_model = os.path.join(staging_dir, 'tmp.model')
        output['booster'].save_model(local_model)
        fs.put(local_model, model_file)

    print("[INFO]: ------ Model saved here:{}".format(model_file))
def test_from_dask_dataframe(client):
    """Check training and prediction when the inputs are dask dataframes.

    :param client: dask client forwarded to ``DaskDMatrix``,
        ``xgb.dask.train`` and ``xgb.dask.predict``.
    """
    features, labels = generate_array()
    features = dd.from_dask_array(features)
    labels = dd.from_dask_array(labels)

    dtrain = DaskDMatrix(client, features, labels)
    trained = xgb.dask.train(client, {}, dtrain, num_boost_round=2)
    booster = trained['booster']

    prediction = xgb.dask.predict(client, model=booster, data=dtrain)
    assert isinstance(prediction, da.Array)
    assert prediction.shape[0] == kRows, prediction

    # evals_result is not supported in dask interface.
    with pytest.raises(ValueError):
        xgb.dask.train(
            client, {}, dtrain, num_boost_round=2, evals_result={}
        )
def test_global_config(self, client: "Client") -> None:
    """Check that ``verbosity=0`` from the global config is seen in callbacks.

    The global configuration is set before training.  A callback asserts
    the value at every hook and writes it to two files, which are read back
    after training and then removed.

    :param client: dask client forwarded to ``DaskDMatrix`` and
        ``xgb.dask.train``.
    """
    X, y, _ = generate_array()
    xgb.config.set_config(verbosity=0)
    dtrain = DaskDMatrix(client, X, y)

    before_fname = './before_training-test_global_config'
    after_fname = './after_training-test_global_config'

    class TestCallback(xgb.callback.TrainingCallback):
        """Record and assert the verbosity observed during training."""

        def write_file(self, fname: str) -> None:
            # Dump the verbosity currently visible to this process.
            verbosity = xgb.config.get_config()['verbosity']
            with open(fname, 'w') as handle:
                handle.write(str(verbosity))

        def before_training(self, model: xgb.Booster) -> xgb.Booster:
            self.write_file(before_fname)
            assert xgb.config.get_config()['verbosity'] == 0
            return model

        def after_training(self, model: xgb.Booster) -> xgb.Booster:
            assert xgb.config.get_config()['verbosity'] == 0
            return model

        def before_iteration(
            self, model: xgb.Booster, epoch: int, evals_log: Dict
        ) -> bool:
            assert xgb.config.get_config()['verbosity'] == 0
            return False

        def after_iteration(
            self, model: xgb.Booster, epoch: int, evals_log: Dict
        ) -> bool:
            self.write_file(after_fname)
            assert xgb.config.get_config()['verbosity'] == 0
            return False

    training_output = xgb.dask.train(
        client, {}, dtrain, num_boost_round=4, callbacks=[TestCallback()]
    )
    training_output['booster']

    # Both files must contain the verbosity that was configured above.
    with open(before_fname, 'r') as before, open(after_fname, 'r') as after:
        assert before.read() == '0'
        assert after.read() == '0'

    os.remove(before_fname)
    os.remove(after_fname)
def test_predict_with_meta(client):
    """Check that weight and base margin are honoured by distributed predict.

    :param client: dask client used for the matrices, training, prediction
        and for computing the final result.
    """
    X, y, w = generate_array(with_weights=True)

    # A large base margin makes its effect on the predictions easy to see.
    partition_size = 20
    margin = da.random.random(kRows, partition_size) + 1e4

    dtrain = DaskDMatrix(client, X, y, weight=w, base_margin=margin)
    trained = xgb.dask.train(client, {}, dtrain, num_boost_round=4)
    booster = trained['booster']

    prediction = xgb.dask.predict(client, model=booster, data=dtrain)
    assert prediction.ndim == 1
    assert prediction.shape[0] == kRows

    prediction = client.compute(prediction).result()
    assert np.all(prediction > 1e3)

    # Single-node reference built from the same data and meta information.
    local_matrix = xgb.DMatrix(X.compute())
    local_matrix.set_info(
        label=y.compute(),
        weight=w.compute(),
        base_margin=margin.compute(),
    )
    single = booster.predict(local_matrix)

    # Make sure the ordering is correct.
    assert np.all(prediction == single)
def main(client):
    """Train a CPU ``hist`` model on random dask arrays and predict on them.

    :param client: dask client forwarded to ``DaskDMatrix``,
        ``xgb.dask.train`` and ``xgb.dask.predict``.
    :return: the value returned by ``xgb.dask.predict`` for the training
        matrix.
    """
    n_cols = 100
    n_rows = 100000
    partition_size = 1000
    X = da.random.random((n_rows, n_cols), partition_size)
    y = da.random.random(n_rows, partition_size)

    dtrain = DaskDMatrix(client, X, y)

    cpu_params = {'verbosity': 2, 'nthread': 1, 'tree_method': 'hist'}
    output = xgb.dask.train(
        client,
        cpu_params,
        dtrain,
        num_boost_round=4,
        evals=[(dtrain, 'train')],
    )
    history = output['history']

    prediction = xgb.dask.predict(client, output['booster'], dtrain)
    print('Evaluation history:', history)
    return prediction
def test_xgboost_covtype_multi_gpu():
    """Time multi-GPU dask training against single-node CPU training.

    The data returned by ``fetch_data`` is split 0.75/0.25.  The model is
    first trained with ``gpu_hist`` through ``xgb.dask.train`` on a
    ``LocalCUDACluster`` with one worker per detected GPU, then trained
    again with ``hist`` through plain ``xgb.train``.  Both wall-clock times
    are printed.
    """
    import time

    from dask import array as da
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster
    from h2o4gpu.util.gpu import device_count
    from sklearn.model_selection import train_test_split
    import xgboost as xgb
    from xgboost.dask import DaskDMatrix

    # Fetch dataset using sklearn
    cov = fetch_data()
    X = cov.data
    y = cov.target
    print(X.shape, y.shape)

    # Create 0.75/0.25 train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, train_size=0.75, random_state=42)

    # Specify sufficient boosting iterations to reach a minimum
    num_round = 10

    # Leave most parameters as default
    param = {
        'objective': 'multi:softmax',  # Specify multiclass classification
        'num_class': 8,  # Number of possible output classes
        'tree_method': 'gpu_hist',  # Use GPU accelerated algorithm
    }

    n_gpus, _ = device_count(-1)

    with LocalCUDACluster(n_workers=n_gpus, threads_per_worker=1) as cluster:
        with Client(cluster) as client:
            partition_size = 100000

            def to_balanced_dask(array):
                """Chunk ``array``, persist it and rebalance it on the cluster."""
                # remove when https://github.com/dmlc/xgboost/issues/4987 is fixed
                dask_array = da.from_array(array, partition_size).persist()
                client.rebalance(dask_array)
                return dask_array

            # Convert input data from numpy to XGBoost format
            dtrain = DaskDMatrix(
                client=client,
                data=to_balanced_dask(X_train),
                label=to_balanced_dask(y_train),
            )
            dtest = DaskDMatrix(
                client=client,
                data=to_balanced_dask(X_test),
                label=to_balanced_dask(y_test),
            )

            # Train model
            tmp = time.time()
            xgb.dask.train(
                client,
                param,
                dtrain,
                num_boost_round=num_round,
                evals=[(dtest, 'test')],
            )
            print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))

            # TODO: https://github.com/dmlc/xgboost/issues/4518
            dtrain = xgb.DMatrix(X_train, label=y_train, nthread=-1)
            dtest = xgb.DMatrix(X_test, label=y_test, nthread=-1)

            # Repeat for CPU algorithm
            tmp = time.time()
            param['tree_method'] = 'hist'
            cpu_res = {}
            xgb.train(
                param,
                dtrain,
                num_round,
                evals=[(dtest, 'test')],
                evals_result=cpu_res,
            )
            print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))
def train_xgboost_gpu(X,
                      y,
                      data_chunksize=None,
                      n_gpus=None,
                      n_threads_per_gpu=1,
                      params=None,
                      xgboost_model=None,
                      gpu_cluster=None,
                      client=None):
    """Train an XGBoost model on the GPU.

    :param X: a 2D matrix object of either type numpy ndarray or pandas
        DataFrame;
    :param y: a 1D array of one of the following types: numpy ndarray,
        pandas Series or pandas DataFrame;
    :param data_chunksize: number of rows to partition input data (both X
        and y simultaneously) to split among multiple GPU devices. Default
        value None splits evenly among devices;
    :param n_gpus: number of GPUs to be used. Default value None selects
        all available devices;
    :param n_threads_per_gpu: number of threads per GPU;
    :param params: xgboost training params as a python dict, refer to
        https://xgboost.readthedocs.io/en/latest/parameter.html
    :param xgboost_model: booster object to continue training from; it may
        be either a regular XGBoost model or a dask xgboost result dict
        (its ``'booster'`` entry is used);
    :param gpu_cluster: an existing dask cluster object to use. This param
        should be used if you call this function many times in quick
        succession. Note that this function doesn't close an externally
        created cluster.
    :param client: an existing dask client object to use. This param
        should be used if you call this function many times in quick
        succession. Note that this function doesn't close an externally
        created client.
    :return: A dictionary containing 2 keys:
        * 'booster': maps to an XGBoost model
        * 'history': maps to another dict which describes the history of
          the training process, as in the following example:
          {'train': {'logloss': ['0.48253', '0.35953']},
           'eval': {'logloss': ['0.480385', '0.357756']}}
    """
    # Only resources created here are closed here; anything supplied by
    # the caller stays open.
    owns_cluster = gpu_cluster is None
    owns_client = client is None

    if owns_cluster:
        local_gpus = LocalCUDACluster(n_workers=n_gpus,
                                      threads_per_worker=n_threads_per_gpu)
    else:
        local_gpus = gpu_cluster

    try:
        if owns_client:
            # NOTE(review): in dask.distributed the second positional
            # parameter of Client is `loop`; confirm that passing this dict
            # positionally is intended.
            local_dask_client = Client(local_gpus, {'verbose': 0})
        else:
            local_dask_client = client

        try:
            if data_chunksize is None:
                n_devices = len(local_gpus.cuda_visible_devices)
                # Never request empty chunks when there are fewer rows
                # than devices.
                data_chunksize = max(1, X.shape[0] // n_devices)

            if params is None:
                params = {
                    'learning_rate': 0.3,
                    'max_depth': 8,
                    'objective': 'reg:squarederror',
                    'verbosity': 0,
                    'tree_method': 'gpu_hist'
                }

            if isinstance(X, pd.DataFrame):
                X = from_pandas(X, chunksize=data_chunksize)
            else:
                X = from_array(X, chunksize=data_chunksize)

            # A pandas Series is a documented input for y and is handled
            # like a DataFrame so it is partitioned the same way as a
            # pandas X.
            if isinstance(y, (pd.DataFrame, pd.Series)):
                y = from_pandas(y, chunksize=data_chunksize)
            else:
                y = from_array(y, chunksize=data_chunksize)

            dtrain = DaskDMatrix(local_dask_client, X, y)

            # Accept the dict returned by a previous dask training run.
            if isinstance(xgboost_model, dict):
                xgboost_model = xgboost_model['booster']

            return dask_xgboost_train(local_dask_client,
                                      params,
                                      dtrain,
                                      num_boost_round=100,
                                      evals=[(dtrain, 'train')],
                                      xgb_model=xgboost_model)
        finally:
            # Runs on failure too, so the client is not leaked.
            if owns_client:
                local_dask_client.close()
    finally:
        if owns_cluster:
            local_gpus.close()