def train_ray(num_workers, num_boost_rounds, num_files=0, use_gpu=False):
    """Benchmark distributed XGBoost-on-Ray training on a fixed parquet dataset.

    Args:
        num_workers: Number of Ray actors to train with.
        num_boost_rounds: Number of boosting rounds.
        num_files: If > 0, train on exactly this many parquet files from the
            dataset directory, repeating the file list if fewer exist.
        use_gpu: If True, use ``gpu_hist`` and one GPU per actor.

    Returns:
        Wall-clock seconds spent in ``train()``.
    """
    path = "/data/parted.parquet"
    if num_files:
        parquet_files = sorted(glob.glob(f"{path}/**/*.parquet"))
        # Duplicate the file list until it is long enough to slice.
        while num_files > len(parquet_files):
            parquet_files = parquet_files + parquet_files
        path = parquet_files[0:num_files]

    # A device (GPU) quantile matrix is only usable when cupy is importable.
    use_device_matrix = False
    if use_gpu:
        try:
            import cupy  # noqa: F401
            use_device_matrix = True
        except ImportError:
            use_device_matrix = False

    shared_matrix_kwargs = dict(
        num_actors=num_workers,
        label="labels",
        ignore=["partition"],
        filetype=RayFileType.PARQUET,
    )
    if use_device_matrix:
        dtrain = RayDeviceQuantileDMatrix(path, **shared_matrix_kwargs)
    else:
        dtrain = RayDMatrix(path, **shared_matrix_kwargs)

    config = {
        "tree_method": "hist" if not use_gpu else "gpu_hist",
        "eval_metric": ["logloss", "error"],
    }

    start = time.time()
    evals_result = {}
    bst = train(
        config,
        dtrain,
        evals_result=evals_result,
        max_actor_restarts=2,
        num_boost_round=num_boost_rounds,
        num_actors=num_workers,
        cpus_per_actor=4,
        checkpoint_path="/tmp/checkpoint/",
        gpus_per_actor=0 if not use_gpu else 1,
        resources_per_actor={
            "actor_cpus": 4,
            "actor_gpus": 0 if not use_gpu else 1
        },
        evals=[(dtrain, "train")],
    )
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    bst.save_model("benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu"))
    print("Final training error: {:.4f}".format(
        evals_result["train"]["error"][-1]))
    return taken
def train_ray(path,
              num_workers,
              num_boost_rounds,
              num_files=0,
              regression=False,
              use_gpu=False,
              ray_params=None,
              xgboost_params=None,
              **kwargs):
    """Train an XGBoost-on-Ray model on parquet data and report timing.

    Args:
        path: Directory containing (possibly partitioned) parquet files.
        num_workers: Number of Ray actors to train with.
        num_boost_rounds: Number of boosting rounds.
        num_files: If > 0, train on exactly this many parquet files,
            repeating the available files if there are fewer.
        regression: If True, train a squared-error regressor; otherwise a
            binary classifier.
        use_gpu: If True, use ``gpu_hist`` and one GPU per actor.
        ray_params: Optional ``RayParams`` overriding the defaults below.
        xgboost_params: Extra XGBoost config merged over the defaults.
        **kwargs: Forwarded to ``xgboost_ray.train()``.

    Returns:
        Tuple of (booster, additional_results dict, seconds taken).

    Raises:
        ValueError: If ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise ValueError(f"Path does not exist: {path}")

    if num_files:
        files = sorted(glob.glob(f"{path}/**/*.parquet"))
        # Duplicate the file list until it is long enough to slice.
        while num_files > len(files):
            files = files + files
        path = files[0:num_files]

    # A device (GPU) quantile matrix is only usable when cupy is importable.
    use_device_matrix = False
    if use_gpu:
        try:
            import cupy  # noqa: F401
            use_device_matrix = True
        except ImportError:
            use_device_matrix = False

    if use_device_matrix:
        dtrain = RayDeviceQuantileDMatrix(
            path,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET)
    else:
        dtrain = RayDMatrix(
            path,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET)

    config = {"tree_method": "hist" if not use_gpu else "gpu_hist"}

    if not regression:
        # Classification
        config.update({
            "objective": "binary:logistic",
            "eval_metric": ["logloss", "error"],
        })
    else:
        # Regression
        config.update({
            "objective": "reg:squarederror",
            "eval_metric": ["logloss", "rmse"],
        })

    # Caller-supplied params take precedence over the defaults above.
    if xgboost_params:
        config.update(xgboost_params)

    start = time.time()
    evals_result = {}
    additional_results = {}
    bst = train(
        config,
        dtrain,
        evals_result=evals_result,
        additional_results=additional_results,
        num_boost_round=num_boost_rounds,
        ray_params=ray_params or RayParams(
            max_actor_restarts=2,
            num_actors=num_workers,
            cpus_per_actor=1,
            # Bug fix: was "1 if not use_gpu else 1", which requested a GPU
            # per actor even for CPU-only runs.
            gpus_per_actor=0 if not use_gpu else 1),
        evals=[(dtrain, "train")],
        **kwargs)
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    bst.save_model("benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu"))

    # Bug fix: regression runs record "rmse", not "error" — the original
    # raised KeyError when regression=True.
    final_metric = "rmse" if regression else "error"
    print("Final training {}: {:.4f}".format(
        final_metric, evals_result["train"][final_metric][-1]))
    return bst, additional_results, taken
def train_ray(train_files,
              eval_files,
              num_workers,
              num_boost_round,
              regression=False,
              use_gpu=False,
              ray_params=None,
              xgboost_params=None,
              ft_manager=None,
              aws=None,
              **kwargs):
    """Train an XGBoost-on-Ray model with separate train/eval matrices and
    optional fault-tolerance and AWS-credential callbacks.

    Args:
        train_files: Parquet files backing the training matrix.
        eval_files: Parquet files backing the evaluation matrix.
        num_workers: Number of Ray actors to train with.
        num_boost_round: Number of boosting rounds.
        regression: If True, train a squared-error regressor; otherwise a
            binary classifier.
        use_gpu: If True, use ``gpu_hist`` and one GPU per actor.
        ray_params: Optional ``RayParams``; its ``num_actors``,
            ``gpus_per_actor`` and ``distributed_callbacks`` fields are
            overridden below.
        xgboost_params: Optional base XGBoost config; copied, not mutated.
        ft_manager: Optional fault-tolerance manager driving the delayed
            loading and "die" callbacks.
        aws: Optional dict of AWS credential env vars, applied locally and
            propagated to the actors.
        **kwargs: Forwarded to ``xgboost_ray.train()``.

    Returns:
        Tuple of (booster, dict of final train/eval metrics and ``total_n``).
    """
    # A device (GPU) quantile matrix is only usable when cupy is importable.
    use_device_matrix = False
    if use_gpu:
        try:
            import cupy  # noqa: F401
            use_device_matrix = True
        except ImportError:
            use_device_matrix = False

    # use_device_matrix is only ever True when use_gpu is True, so the
    # original "use_gpu and use_device_matrix" test reduces to this.
    if use_device_matrix:
        dtrain = RayDeviceQuantileDMatrix(
            train_files,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET)
        deval = RayDeviceQuantileDMatrix(
            eval_files,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET)
    else:
        dtrain = RayDMatrix(
            train_files,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET)
        deval = RayDMatrix(
            eval_files,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET)

    # Bug fix: the original did `config = xgboost_params or {...}` and then
    # updated it in place, mutating the caller's dict so repeated calls
    # accumulated objective/eval_metric entries. Copy instead.
    config = dict(xgboost_params) if xgboost_params else {"tree_method": "hist"}

    if use_gpu:
        config.update({"tree_method": "gpu_hist"})

    if not regression:
        # Classification
        config.update({
            "objective": "binary:logistic",
            "eval_metric": ["logloss", "error"],
        })
        return_metric = "error"
    else:
        # Regression
        config.update({
            "objective": "reg:squarederror",
            "eval_metric": ["logloss", "rmse"],
        })
        return_metric = "rmse"

    xgboost_callbacks = []
    distributed_callbacks = []
    if ft_manager:
        delay_callback = DelayedLoadingCallback(
            ft_manager, reload_data=True, sleep_time=0.1)
        distributed_callbacks.append(delay_callback)

        die_callback = DieCallback(ft_manager, training_delay=0.1)
        xgboost_callbacks.append(die_callback)

    if aws:
        aws_callback = EnvironmentCallback(aws)
        distributed_callbacks.append(aws_callback)
        os.environ.update(aws)

    # NOTE(review): this mutates the caller's ray_params in place — confirm
    # callers expect their RayParams object to be modified.
    ray_params = ray_params or RayParams()
    ray_params.num_actors = num_workers
    ray_params.gpus_per_actor = 0 if not use_gpu else 1
    ray_params.distributed_callbacks = distributed_callbacks

    evals_result = {}
    additional_results = {}
    bst = train(
        config,
        dtrain,
        evals_result=evals_result,
        additional_results=additional_results,
        num_boost_round=num_boost_round,
        ray_params=ray_params,
        evals=[(dtrain, "train"), (deval, "eval")],
        callbacks=xgboost_callbacks,
        **kwargs)

    bst.save_model("benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu"))
    print("Final training error: {:.4f}".format(
        evals_result["train"][return_metric][-1]))

    results = {
        "train-logloss": evals_result["train"]["logloss"][-1],
        f"train-{return_metric}": evals_result["train"][return_metric][-1],
        "eval-logloss": evals_result["eval"]["logloss"][-1],
        f"eval-{return_metric}": evals_result["eval"][return_metric][-1],
        "total_n": additional_results["total_n"]
    }
    return bst, results
def train_ray(
    path,
    num_workers,
    num_boost_rounds,
    num_files=0,
    regression=False,
    use_gpu=False,
    ray_params=None,
    xgboost_params=None,
    **kwargs,
):
    """Train an XGBoost-on-Ray model on parquet data and report timing.

    Args:
        path: Parquet directory, or an explicit list of parquet files.
        num_workers: Number of Ray actors to train with.
        num_boost_rounds: Number of boosting rounds.
        num_files: Passed to ``get_parquet_files`` when ``path`` is not
            already a list.
        regression: If True, train a squared-error regressor; otherwise a
            binary classifier.
        use_gpu: If True, use ``gpu_hist`` and one GPU per actor.
        ray_params: Optional ``RayParams`` overriding the defaults below.
        xgboost_params: Extra XGBoost config merged over the defaults.
        **kwargs: Forwarded to ``xgboost_ray.train()``.

    Returns:
        Tuple of (booster, additional_results dict, seconds taken).
    """
    if not isinstance(path, list):
        path = get_parquet_files(path, num_files=num_files)

    # A device (GPU) quantile matrix is only usable when cupy is importable.
    use_device_matrix = False
    if use_gpu:
        try:
            import cupy  # noqa: F401

            use_device_matrix = True
        except ImportError:
            use_device_matrix = False

    if use_device_matrix:
        dtrain = RayDeviceQuantileDMatrix(
            path,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET,
        )
    else:
        dtrain = RayDMatrix(
            path,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET,
        )

    config = {"tree_method": "hist" if not use_gpu else "gpu_hist"}

    if not regression:
        # Classification
        config.update(
            {
                "objective": "binary:logistic",
                "eval_metric": ["logloss", "error"],
            }
        )
    else:
        # Regression
        config.update(
            {
                "objective": "reg:squarederror",
                "eval_metric": ["logloss", "rmse"],
            }
        )

    # Caller-supplied params take precedence over the defaults above.
    if xgboost_params:
        config.update(xgboost_params)

    start = time.time()
    evals_result = {}
    additional_results = {}
    bst = train(
        config,
        dtrain,
        evals_result=evals_result,
        additional_results=additional_results,
        num_boost_round=num_boost_rounds,
        ray_params=ray_params
        or RayParams(
            max_actor_restarts=2,
            num_actors=num_workers,
            cpus_per_actor=1,
            # Bug fix: was "1 if not use_gpu else 1", which requested a GPU
            # per actor even for CPU-only runs.
            gpus_per_actor=0 if not use_gpu else 1,
        ),
        evals=[(dtrain, "train")],
        **kwargs,
    )
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    out_file = os.path.expanduser(
        "~/benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu")
    )
    bst.save_model(out_file)

    # Bug fix: regression runs record "rmse", not "error" — the original
    # raised KeyError when regression=True.
    final_metric = "rmse" if regression else "error"
    print(
        "Final training {}: {:.4f}".format(
            final_metric, evals_result["train"][final_metric][-1]
        )
    )
    return bst, additional_results, taken