from ray import tune
from xgboost_ray import RayParams


def tune_xgboost(train_df, test_df, target_column):
    # Set XGBoost config.
    config = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }

    ray_params = RayParams(
        max_actor_restarts=1, gpus_per_actor=0, cpus_per_actor=4,
        num_actors=4)

    analysis = tune.run(
        tune.with_parameters(
            train_xgboost,
            train_df=train_df,
            test_df=test_df,
            target_column=target_column,
            ray_params=ray_params),
        # Use the `get_tune_resources` helper function to set the resources.
        resources_per_trial=ray_params.get_tune_resources(),
        config=config,
        num_samples=1,
        metric="eval-error",
        mode="min",
        verbose=1)

    accuracy = 1. - analysis.best_result["eval-error"]
    print(f"Best model parameters: {analysis.best_config}")
    print(f"Best model total accuracy: {accuracy:.4f}")

    return analysis.best_config
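# `train_xgboost` is referenced above but defined elsewhere. A minimal sketch
# of what such a trainable might look like (hypothetical: the signature,
# `num_boost_round`, and the reliance on xgboost_ray's automatic Tune
# reporting are assumptions, not the snippet's actual implementation):
def train_xgboost(config, train_df=None, test_df=None, target_column=None,
                  ray_params=None):
    from xgboost_ray import RayDMatrix, train

    train_set = RayDMatrix(train_df, label=target_column)
    test_set = RayDMatrix(test_df, label=target_column)

    evals_result = {}
    # When executed inside a Tune trial, xgboost_ray reports evaluation
    # results such as "eval-error" back to Tune, which is what the
    # `metric="eval-error"` argument above keys on.
    train(
        params=config,
        dtrain=train_set,
        evals=[(test_set, "eval")],
        evals_result=evals_result,
        verbose_eval=False,
        ray_params=ray_params,
        num_boost_round=10)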
import os

import xgboost_ray.tune
from ray import tune
from xgboost_ray import RayParams


def main(cpus_per_actor, num_actors, num_samples):
    # Set XGBoost config.
    config = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }

    ray_params = RayParams(
        max_actor_restarts=1,
        gpus_per_actor=0,
        cpus_per_actor=cpus_per_actor,
        num_actors=num_actors)

    analysis = tune.run(
        tune.with_parameters(train_breast_cancer, ray_params=ray_params),
        # Use the `get_tune_resources` helper function to set the resources.
        resources_per_trial=ray_params.get_tune_resources(),
        config=config,
        num_samples=num_samples,
        metric="eval-error",
        mode="min")

    # Load the best model checkpoint.
    best_bst = xgboost_ray.tune.load_model(
        os.path.join(analysis.best_logdir, "tuned.xgb"))
    best_bst.save_model("best_model.xgb")

    accuracy = 1. - analysis.best_result["eval-error"]
    print(f"Best model parameters: {analysis.best_config}")
    print(f"Best model total accuracy: {accuracy:.4f}")
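# `main` above loads "tuned.xgb" from `analysis.best_logdir`, which implies
# the trainable saved its booster under that name. A hedged sketch of such a
# trainable (`train_breast_cancer` is assumed; it relies on Tune running each
# trial with the trial logdir as its working directory, so a relative
# `save_model` path lands where `load_model` looks):
def train_breast_cancer(config, ray_params=None):
    from sklearn.datasets import load_breast_cancer
    from xgboost_ray import RayDMatrix, train

    x, y = load_breast_cancer(return_X_y=True)
    train_set = RayDMatrix(x, y)

    evals_result = {}
    bst = train(
        params=config,
        dtrain=train_set,
        evals=[(train_set, "eval")],
        evals_result=evals_result,
        verbose_eval=False,
        ray_params=ray_params)
    # Saved relative to the trial directory, i.e. under
    # `analysis.best_logdir` for the best trial.
    bst.save_model("tuned.xgb")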
import json
import os
import time
from collections import Counter

import ray
from ray import tune
from xgboost_ray import RayParams


def main():
    name = "large xgboost sweep"

    ray.init(address="auto")

    num_samples = 31  # So that we fit on 1024 CPUs with 1 head bundle
    num_actors_per_sample = 32

    max_runtime = 3500

    config = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": 4,
    }

    ray_params = RayParams(
        max_actor_restarts=1,
        gpus_per_actor=0,
        cpus_per_actor=1,
        num_actors=num_actors_per_sample,
    )

    start_time = time.monotonic()

    # `xgboost_train` is the benchmark's trainable, defined elsewhere.
    analysis = tune.run(
        tune.with_parameters(
            xgboost_train, ray_params=ray_params, num_boost_round=100),
        config=config,
        num_samples=num_samples,
        resources_per_trial=ray_params.get_tune_resources(),
    )

    time_taken = time.monotonic() - start_time

    result = {
        "time_taken": time_taken,
        "trial_states": dict(
            Counter([trial.status for trial in analysis.trials])),
        "last_update": time.time(),
    }

    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
                                      "/tmp/tune_test.json")
    with open(test_output_json, "wt") as f:
        json.dump(result, f)

    if time_taken > max_runtime:
        print(f"The {name} test took {time_taken:.2f} seconds, but should "
              f"not have exceeded {max_runtime:.2f} seconds. Test failed.\n\n"
              f"--- FAILED: {name.upper()} ::: "
              f"{time_taken:.2f} > {max_runtime:.2f} ---")
    else:
        print(f"The {name} test took {time_taken:.2f} seconds, which "
              f"is below the budget of {max_runtime:.2f} seconds. "
              f"Test successful.\n\n"
              f"--- PASSED: {name.upper()} ::: "
              f"{time_taken:.2f} <= {max_runtime:.2f} ---")
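# A rough sanity check of the sizing comment above (hedged: it assumes
# `get_tune_resources()` reserves one CPU bundle for the trial runner plus
# one bundle per training actor, which matches the RayParams given):
trial_cpus = 32 * 1 + 1            # num_actors * cpus_per_actor + trial bundle
total_cpus = 31 * trial_cpus + 1   # num_samples trials + 1 head bundle
assert total_cpus == 1024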
def testElasticFails(self):
    """Test that an error is raised when using Tune with elastic training."""
    ray_params = RayParams(
        cpus_per_actor=1, num_actors=1, elastic_training=True)
    with self.assertRaises(TuneError):
        tune.run(
            self.train_func(ray_params),
            config=self.params,
            resources_per_trial=ray_params.get_tune_resources(),
            num_samples=1)
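# For reference, a Tune-compatible counterpart of the RayParams above
# (a sketch: Tune handles trial-level fault tolerance itself, so elastic
# training must stay disabled and recovery is left to `max_actor_restarts`):
ray_params = RayParams(
    cpus_per_actor=1,
    num_actors=1,
    elastic_training=False,  # required when running under Ray Tune
    max_actor_restarts=1)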
def testNumIters(self):
    """Test that the number of reported Tune results is correct."""
    ray_params = RayParams(cpus_per_actor=1, num_actors=2)
    analysis = tune.run(
        self.train_func(ray_params),
        config=self.params,
        resources_per_trial=ray_params.get_tune_resources(),
        num_samples=2)

    self.assertSequenceEqual(
        list(analysis.results_df["training_iteration"]),
        list(analysis.results_df["config.num_boost_round"]))
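# The assertion above holds because the Tune integration reports one result
# per boosting round, making Tune's `training_iteration` counter equal to the
# trial's `num_boost_round`. A hypothetical `self.params` fixture that would
# exercise this (an illustration, not the suite's actual fixture):
params = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "error"],
    "num_boost_round": tune.randint(1, 9),
}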
def test_tune_pack(self):
    """Tests whether workers are packed when using Tune."""
    try:
        from ray import tune
    except ImportError:
        self.skipTest("Tune is not installed.")
        return

    with self.ray_start_cluster() as cluster:
        num_actors = 2
        cluster.add_node(num_cpus=3)
        cluster.add_node(num_cpus=3)
        ray.init(address=cluster.address)

        ray_params = RayParams(
            max_actor_restarts=1, num_actors=num_actors, cpus_per_actor=1)

        def _mock_train(*args, _training_state, **kwargs):
            try:
                return _train(*args, _training_state=_training_state,
                              **kwargs)
            finally:
                # All actors of a trial should have been scheduled onto the
                # same node.
                assert len(_training_state.actors) == num_actors
                if not any(a is None for a in _training_state.actors):
                    actor_infos = ray.state.actors()
                    actor_nodes = []
                    for a in _training_state.actors:
                        actor_info = actor_infos.get(a._actor_id.hex())
                        actor_node = actor_info["Address"]["NodeID"]
                        actor_nodes.append(actor_node)
                    assert actor_nodes[0] == actor_nodes[1]

        def train_func(params, x, y, ray_params):
            def inner_func(config):
                with patch("xgboost_ray.main._train", _mock_train):
                    train(
                        params,
                        RayDMatrix(x, y),
                        num_boost_round=4,
                        ray_params=ray_params)

            return inner_func

        tune.run(
            train_func(self.params, self.x, self.y, ray_params),
            resources_per_trial=ray_params.get_tune_resources(),
            num_samples=1,
        )
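# The packing assertion above relies on the placement group that
# `get_tune_resources()` requests for each trial. Roughly (an assumption
# about the bundle layout, not the library's exact output), it corresponds to:
from ray.tune.utils.placement_groups import PlacementGroupFactory

pgf = PlacementGroupFactory(
    [{"CPU": 1}] + [{"CPU": 1}] * 2,  # trial runner + one bundle per actor
    strategy="PACK")  # "PACK" co-locates bundles, hence both actors per node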
def testEndToEndCheckpointingOrigTune(self):
    ray_params = RayParams(cpus_per_actor=1, num_actors=2)
    analysis = tune.run(
        self.train_func(
            ray_params, callbacks=[OrigTuneReportCheckpointCallback()]),
        config=self.params,
        resources_per_trial=ray_params.get_tune_resources(),
        num_samples=1,
        metric="train-mlogloss",
        mode="min",
        log_to_file=True,
        local_dir=self.experiment_dir)

    self.assertTrue(os.path.exists(analysis.best_checkpoint))
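# A hedged guess at the alias used above: "Orig" presumably refers to Ray
# Tune's own XGBoost integration callback, as opposed to xgboost_ray's
# distributed-aware wrapper of the same name:
from ray.tune.integration.xgboost import (
    TuneReportCheckpointCallback as OrigTuneReportCheckpointCallback)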
def readme_tune():
    from xgboost_ray import RayDMatrix, RayParams, train
    from sklearn.datasets import load_breast_cancer

    num_actors = 4
    num_cpus_per_actor = 1

    ray_params = RayParams(
        num_actors=num_actors, cpus_per_actor=num_cpus_per_actor)

    def train_model(config):
        train_x, train_y = load_breast_cancer(return_X_y=True)
        train_set = RayDMatrix(train_x, train_y)

        evals_result = {}
        bst = train(
            params=config,
            dtrain=train_set,
            evals_result=evals_result,
            evals=[(train_set, "train")],
            verbose_eval=False,
            ray_params=ray_params)
        bst.save_model("model.xgb")

    from ray import tune

    # Specify the hyperparameter search space.
    config = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }

    # Make sure to use the `get_tune_resources` method to set the
    # `resources_per_trial`.
    analysis = tune.run(
        train_model,
        config=config,
        metric="train-error",
        mode="min",
        num_samples=4,
        resources_per_trial=ray_params.get_tune_resources())

    print("Best hyperparameters", analysis.best_config)
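# A possible way to run the README example above (hedged: plain `ray.init()`
# assumes a local machine with at least num_actors * num_cpus_per_actor = 4
# CPUs; on an existing cluster, `ray.init(address="auto")` would be used):
if __name__ == "__main__":
    import ray

    ray.init()
    readme_tune()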
import time

import ray
from ray import tune
from xgboost_ray import RayParams


def main():
    name = "large xgboost sweep"

    ray.init(address="auto")

    num_samples = 32
    num_actors_per_sample = 32

    max_runtime = 3500

    config = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": 4
    }

    ray_params = RayParams(
        max_actor_restarts=1,
        gpus_per_actor=0,
        cpus_per_actor=1,
        num_actors=num_actors_per_sample)

    start_time = time.monotonic()

    # `xgboost_train` is the benchmark's trainable, defined elsewhere.
    tune.run(
        tune.with_parameters(
            xgboost_train, ray_params=ray_params, num_boost_round=100),
        config=config,
        num_samples=num_samples,
        resources_per_trial=ray_params.get_tune_resources())

    time_taken = time.monotonic() - start_time

    assert time_taken < max_runtime, \
        f"The {name} test took {time_taken:.2f} seconds, but should not " \
        f"have exceeded {max_runtime:.2f} seconds. Test failed.\n\n" \
        f"--- FAILED: {name.upper()} ::: " \
        f"{time_taken:.2f} > {max_runtime:.2f} ---"

    print(f"The {name} test took {time_taken:.2f} seconds, which "
          f"is below the budget of {max_runtime:.2f} seconds. "
          f"Test successful.\n\n"
          f"--- PASSED: {name.upper()} ::: "
          f"{time_taken:.2f} <= {max_runtime:.2f} ---")
def tune_test(path,
              num_trials,
              num_workers,
              num_boost_rounds,
              num_files=0,
              regression=False,
              use_gpu=False,
              fake_data=False,
              smoke_test=False):
    ray_params = RayParams(
        elastic_training=False,
        max_actor_restarts=0,
        num_actors=num_workers,
        cpus_per_actor=1,
        gpus_per_actor=0 if not use_gpu else 1)

    def local_train(config):
        temp_dir = None
        if fake_data or smoke_test:
            temp_dir = "/tmp/release_test_data"
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)
            os.makedirs(temp_dir, 0o755)

            local_path = os.path.join(temp_dir, "smoketest.parquet")

            create_parquet(
                filename=local_path,
                num_rows=num_workers * 500,
                num_features=4,
                num_classes=2,
                num_partitions=num_workers * 10)
        else:
            if not os.path.exists(path):
                raise ValueError(
                    f"Benchmarking data not found: {path}."
                    f"\nFIX THIS by running `python create_test_data.py` "
                    f"on all nodes first.")
            local_path = path

        xgboost_params = {
            "tree_method": "hist" if not use_gpu else "gpu_hist",
            "objective": "binary:logistic",
            "eval_metric": ["logloss", "error"],
        }
        xgboost_params.update(config)

        additional_results = {}

        bst, time_taken = train_ray(
            path=local_path,
            num_workers=num_workers,
            num_boost_rounds=num_boost_rounds,
            num_files=num_files,
            regression=regression,
            use_gpu=use_gpu,
            smoke_test=smoke_test,
            ray_params=ray_params,
            xgboost_params=xgboost_params,
            # kwargs
            additional_results=additional_results,
            callbacks=[PlacementCallback(),
                       TuneReportCallback()])

        bst.save_model("tuned.xgb")

        # Collect the node IPs that hosted this trial's workers.
        trial_ips = []
        for ips in additional_results["callback_returns"]:
            trial_ips.extend(ips)

        tune_trial = get_trial_id()
        with tune.checkpoint_dir(num_boost_rounds + 1) as checkpoint_dir:
            with open(
                    os.path.join(checkpoint_dir, "callback_returns.json"),
                    "wt") as f:
                json.dump({tune_trial: trial_ips}, f)

        if temp_dir:
            shutil.rmtree(temp_dir)

    search_space = {
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }

    analysis = tune.run(
        local_train,
        config=search_space,
        num_samples=num_trials,
        sync_config=tune.SyncConfig(sync_to_driver=DockerSyncer),
        resources_per_trial=ray_params.get_tune_resources())

    # With PACK scheduling, we expect each node IP to host workers for only
    # one Ray Tune trial.
    ip_to_trials = defaultdict(list)
    for trial in analysis.trials:
        with open(
                os.path.join(trial.checkpoint.value,
                             "callback_returns.json"), "rt") as f:
            trial_to_ips = json.load(f)
        for tune_trial, ips in trial_to_ips.items():
            for node_ip in ips:
                ip_to_trials[node_ip].append(tune_trial)

    fail = False
    for ip, trial_ids in ip_to_trials.items():
        print(f"For IP {ip} got trial IDs {trial_ids}")
        fail = fail or any(
            trial_id != trial_ids[0] for trial_id in trial_ids)

    if fail:
        raise ValueError("Different trial IDs found on same node.")
    else:
        print("Success.")
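# Hedged guesses at the non-local imports the benchmark above relies on
# (module paths as of Ray 1.x; `train_ray`, `create_parquet`, and
# `PlacementCallback` are project-local helpers assumed to be defined
# alongside this function):
import json
import os
import shutil
from collections import defaultdict

from ray import tune
from ray.tune.integration.docker import DockerSyncer
from ray.tune.session import get_trial_id
from xgboost_ray import RayParams
from xgboost_ray.tune import TuneReportCallback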
        num_files=25,
        regression=False,
        use_gpu=False,
        ray_params=ray_params,
        xgboost_params=config,
    )


if __name__ == "__main__":
    search_space = {
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9)
    }

    ray.init(address="auto")

    ray_params = RayParams(
        elastic_training=False,
        max_actor_restarts=2,
        num_actors=4,
        cpus_per_actor=1,
        gpus_per_actor=0)

    analysis = tune.run(
        tune.with_parameters(train_wrapper, ray_params=ray_params),
        config=search_space,
        num_samples=4,
        resources_per_trial=ray_params.get_tune_resources())

    print("PASSED.")