def test_tuner_with_torch_trainer(self):
    """Test a successful run using torch trainer."""
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_torch"), ignore_errors=True
    )
    # The following two should be tunable.
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 10}
    scaling_config = {"num_workers": 1, "use_gpu": False}
    trainer = TorchTrainer(
        train_loop_per_worker=linear_train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
    )
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        },
        "train_loop_config": {
            "batch_size": tune.grid_search([4, 8]),
            "epochs": tune.grid_search([5, 10]),
        },
    }
    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(name="test_tuner"),
        param_space=param_space,
        tune_config=TuneConfig(mode="min", metric="loss"),
    )
    results = tuner.fit()
    assert len(results) == 8
def __init__(
    self,
    restore_path: str = None,
    trainable: Optional[
        Union[
            str,
            Callable,
            Type[Trainable],
            BaseTrainer,
        ]
    ] = None,
    param_space: Optional[Dict[str, Any]] = None,
    tune_config: Optional[TuneConfig] = None,
    run_config: Optional[RunConfig] = None,
    _tuner_kwargs: Optional[Dict] = None,
):
    # Restored from Tuner checkpoint.
    if restore_path:
        trainable_ckpt = os.path.join(restore_path, _TRAINABLE_PKL)
        with open(trainable_ckpt, "rb") as fp:
            trainable = pickle.load(fp)

        tuner_ckpt = os.path.join(restore_path, _TUNER_PKL)
        with open(tuner_ckpt, "rb") as fp:
            tuner = pickle.load(fp)
            self.__dict__.update(tuner.__dict__)

        self._is_restored = True
        self._trainable = trainable
        self._experiment_checkpoint_dir = restore_path
        return

    # Start from fresh
    if not trainable:
        raise TuneError("You need to provide a trainable to tune.")

    # If no run config was passed to Tuner directly, use the one from the
    # Trainer, if available.
    if not run_config and isinstance(trainable, BaseTrainer):
        run_config = trainable.run_config

    self._is_restored = False
    self._trainable = trainable
    self._tune_config = tune_config or TuneConfig()
    self._run_config = run_config or RunConfig()
    self._tuner_kwargs = copy.deepcopy(_tuner_kwargs) or {}
    self._experiment_checkpoint_dir = self._setup_create_experiment_checkpoint_dir(
        self._run_config
    )

    # Not used for restored Tuner.
    self._param_space = param_space or {}

    # This needs to happen before `tune.run()` is kicked in.
    # This is because currently tune does not exit gracefully if
    # run in ray client mode - if a crash happens, it just exits immediately
    # without allowing for checkpointing the tuner and trainable.
    # Thus this has to happen before tune.run() so that we have something
    # to restore from.
    tuner_ckpt = os.path.join(self._experiment_checkpoint_dir, _TUNER_PKL)
    with open(tuner_ckpt, "wb") as fp:
        pickle.dump(self, fp)

    trainable_ckpt = os.path.join(self._experiment_checkpoint_dir, _TRAINABLE_PKL)
    with open(trainable_ckpt, "wb") as fp:
        pickle.dump(self._trainable, fp)
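# A minimal usage sketch (not part of the original file) of how the pickled
# checkpoints written above are consumed: the `restore_path` branch of
# `__init__` is reached through `Tuner.restore`, after which `fit()` resumes
# the interrupted experiment. The checkpoint directory path below is a
# placeholder.
tuner = Tuner.restore("/path/to/experiment_checkpoint_dir")
results = tuner.fit()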
def tune_linear(num_workers, num_samples, use_gpu):
    train_dataset, val_dataset = get_datasets()

    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    scaling_config = {"num_workers": num_workers, "use_gpu": use_gpu}

    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
        datasets={"train": train_dataset, "validation": val_dataset},
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([4, 16, 32]),
                "epochs": 3,
            }
        },
        tune_config=TuneConfig(num_samples=num_samples, metric="loss", mode="min"),
    )
    result_grid = tuner.fit()
    best_result = result_grid.get_best_result()
    print(best_result)
    return best_result
def tune_horovod(num_workers, num_samples, use_gpu, mode="square", x_max=1.0):
    horovod_trainer = HorovodTrainer(
        train_loop_per_worker=train_loop_per_worker,
        scaling_config={"num_workers": num_workers, "use_gpu": use_gpu},
        train_loop_config={"mode": mode, "x_max": x_max},
    )

    tuner = Tuner(
        horovod_trainer,
        param_space={"train_loop_config": {"lr": tune.uniform(0.1, 1)}},
        tune_config=TuneConfig(mode="min", metric="loss", num_samples=num_samples),
        _tuner_kwargs={"fail_fast": True},
    )

    result_grid = tuner.fit()

    print("Best hyperparameters found were: ", result_grid.get_best_result().config)
def test_data_parallel_trainer(ray_start_8_cpus):
    num_workers = 2
    trainer = AssertingDataParallelTrainer(
        train_fn, scaling_config=ScalingConfig(num_workers=num_workers)
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "num_epochs": 100,
                "metric": tune.grid_search([1, 2, 3, 4, 5]),
            }
        },
        tune_config=TuneConfig(
            mode="max",
            metric="metric",
            scheduler=ResourceChangingScheduler(
                ASHAScheduler(),
                resources_allocation_function=DistributeResources(
                    add_bundles=True, reserve_resources={"CPU": 1}
                ),
            ),
        ),
        run_config=RunConfig(failure_config=FailureConfig(fail_fast=True)),
    )
    result_grid = tuner.fit()
    assert not any(x.error for x in result_grid)
    # + 1 for Trainable
    assert result_grid.get_dataframe()["num_cpus"].max() > num_workers + 1
def test_tuner_trainer_fail(self):
    trainer = FailingTrainer()
    param_space = {
        "scaling_config": ScalingConfig(num_workers=tune.grid_search([1, 2]))
    }
    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(name="test_tuner_trainer_fail"),
        param_space=param_space,
        tune_config=TuneConfig(mode="max", metric="iteration"),
    )
    results = tuner.fit()
    assert len(results) == 2
    for i in range(2):
        assert results[i].error
def test_tuner_with_xgboost_trainer(self):
    """Test a successful run."""
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner"), ignore_errors=True
    )
    trainer = XGBoostTrainer(
        label_column="target",
        params={},
        datasets={"train": gen_dataset_func_eager()},
    )
    # prep_v1 = StandardScaler(["worst radius", "worst area"])
    # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        },
        # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
        "datasets": {
            "train": tune.grid_search(
                [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
            ),
        },
        "params": {
            "objective": "binary:logistic",
            "tree_method": "approx",
            "eval_metric": ["logloss", "error"],
            "eta": tune.loguniform(1e-4, 1e-1),
            "subsample": tune.uniform(0.5, 1.0),
            "max_depth": tune.randint(1, 9),
        },
    }
    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(name="test_tuner"),
        param_space=param_space,
        tune_config=TuneConfig(mode="min", metric="train-error"),
        # Limit the number of trials running at one time, as the unit test
        # only has access to 4 CPUs on Buildkite.
        _tuner_kwargs={"max_concurrent_trials": 1},
    )
    results = tuner.fit()
    assert not isinstance(results.get_best_result().checkpoint, TrialCheckpoint)
    assert len(results) == 4
def test_tuner_with_xgboost_trainer(self):
    """Test a successful run."""
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner"), ignore_errors=True
    )
    trainer = XGBoostTrainer(
        label_column="target",
        params={},
        # TODO(xwjiang): change when dataset out-of-band ser/des is landed.
        datasets={"train": gen_dataset_func_eager()},
    )
    # prep_v1 = StandardScaler(["worst radius", "worst area"])
    # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        },
        # TODO(xwjiang): Add when https://github.com/ray-project/ray/issues/23363
        # is resolved.
        # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
        # "datasets": {
        #     "train": tune.choice(
        #         [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
        #     ),
        # },
        "params": {
            "objective": "binary:logistic",
            "tree_method": "approx",
            "eval_metric": ["logloss", "error"],
            "eta": tune.loguniform(1e-4, 1e-1),
            "subsample": tune.uniform(0.5, 1.0),
            "max_depth": tune.randint(1, 9),
        },
    }
    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(name="test_tuner"),
        param_space=param_space,
        tune_config=TuneConfig(mode="min", metric="train-error"),
    )
    results = tuner.fit()
    assert not isinstance(results.get_best_result().checkpoint, TrialCheckpoint)
    assert len(results) == 2
def tune_tensorflow_mnist(num_workers, num_samples):
    trainer = TensorflowTrainer(
        train_func, scaling_config=ScalingConfig(num_workers=num_workers)
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([32, 64, 128]),
                "epochs": 3,
            },
        },
        tune_config=TuneConfig(num_samples=num_samples),
    )
    analysis = tuner.fit()
    best_loss = analysis.get_best_result(metric="loss", mode="min")
    best_accuracy = analysis.get_best_result(metric="accuracy", mode="max")
    print(f"Best loss result: {best_loss}")
    print(f"Best accuracy result: {best_accuracy}")
    return analysis
def test_tune(ray_start_4_cpus):
    def train_func(config):
        session.report({"loss": config["x"]})

    trainer = DataParallelTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={"x": 100},
        scaling_config=scale_config,
    )
    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"x": tune.choice([200, 300])}},
        tune_config=TuneConfig(num_samples=2),
    )
    result_grid = tuner.fit()
    assert result_grid[0].metrics["loss"] in [200, 300]

    # Make sure the original Trainer is not affected.
    assert trainer._train_loop_config["x"] == 100
def test_tuner_trainer_fail(self):
    class DummyTrainer(Trainer):
        def training_loop(self) -> None:
            raise RuntimeError("There is an error in trainer!")

    trainer = DummyTrainer()
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        }
    }
    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(name="test_tuner_trainer_fail"),
        param_space=param_space,
        tune_config=TuneConfig(mode="max", metric="iteration"),
    )
    results = tuner.fit()
    assert len(results) == 2
    for i in range(2):
        assert results[i].error
def test_tuner_with_torch_trainer(self):
    """Test a successful run using torch trainer."""
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_torch"), ignore_errors=True
    )
    # The following two should be tunable.
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 10}
    scaling_config = {"num_workers": 1, "use_gpu": False}
    trainer = TorchTrainer(
        train_loop_per_worker=linear_train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
    )
    # prep_v1 = StandardScaler(["worst radius", "worst area"])
    # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        },
        # TODO(xwjiang): Add when https://github.com/ray-project/ray/issues/23363
        # is resolved.
        # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
        # "datasets": {
        #     "train": tune.choice(
        #         [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
        #     ),
        # },
        "train_loop_config": {
            "batch_size": tune.grid_search([4, 8]),
            "epochs": tune.grid_search([5, 10]),
        },
    }
    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(name="test_tuner"),
        param_space=param_space,
        tune_config=TuneConfig(mode="min", metric="loss"),
    )
    results = tuner.fit()
    assert len(results) == 8
def tune_tensorflow_mnist(
    num_workers: int = 2, num_samples: int = 2, use_gpu: bool = False
):
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    tuner = Tuner(
        trainer,
        tune_config=TuneConfig(
            num_samples=num_samples, metric="accuracy", mode="max"
        ),
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([32, 64, 128]),
                "epochs": 3,
            }
        },
    )
    best_accuracy = tuner.fit().get_best_result().metrics["accuracy"]
    print(f"Best accuracy: {best_accuracy}")
def torch_fashion_mnist(num_workers, use_gpu, num_samples):
    trainer = TorchTrainer(
        fashion_mnist_train_func,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([32, 64, 128]),
                "epochs": 2,
            }
        },
        tune_config=TuneConfig(
            num_samples=num_samples,
        ),
    )
    analysis = tuner.fit()._experiment_analysis

    # Check that loss decreases in each trial.
    for path, df in analysis.trial_dataframes.items():
        assert df.loc[1, "loss"] < df.loc[0, "loss"]
def test_tune_torch_get_device_gpu(ray_2_node_4_gpu, num_gpus_per_worker):
    from ray import tune
    from ray.tune.tuner import Tuner, TuneConfig

    num_samples = 2

    @patch("torch.cuda.is_available", lambda: True)
    def train_func():
        train.report(device_id=train.torch.get_device().index)

    trainer = TorchTrainer(
        train_func,
        torch_config=TorchConfig(backend="gloo"),
        scaling_config=ScalingConfig(
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": num_gpus_per_worker},
        ),
    )

    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "dummy": tune.choice([32, 64, 128]),
            }
        },
        tune_config=TuneConfig(
            num_samples=num_samples,
        ),
    )
    analysis = tuner.fit()._experiment_analysis
    trial_dfs = list(analysis.trial_dataframes.values())
    device_ids = [trial_df["device_id"].tolist() for trial_df in trial_dfs]

    assert len(device_ids) == num_samples
    for i in range(num_samples):
        assert device_ids[i][0] == 0
def test_gbdt_trainer(ray_start_8_cpus):
    data_raw = load_breast_cancer()
    dataset_df = pd.DataFrame(data_raw["data"], columns=data_raw["feature_names"])
    dataset_df["target"] = data_raw["target"]
    train_ds = ray.data.from_pandas(dataset_df).repartition(16)
    trainer = AssertingXGBoostTrainer(
        datasets={TRAIN_DATASET_KEY: train_ds},
        label_column="target",
        scaling_config=ScalingConfig(num_workers=2),
        params={
            "objective": "binary:logistic",
            "eval_metric": ["logloss"],
        },
    )
    tuner = Tuner(
        trainer,
        param_space={
            "num_boost_round": 100,
            "params": {
                "eta": tune.grid_search([0.28, 0.29, 0.3, 0.31, 0.32]),
            },
        },
        tune_config=TuneConfig(
            mode="min",
            metric="train-logloss",
            scheduler=ResourceChangingScheduler(
                ASHAScheduler(),
                resources_allocation_function=DistributeResources(
                    add_bundles=True, reserve_resources={"CPU": 1}
                ),
            ),
        ),
        run_config=RunConfig(failure_config=FailureConfig(fail_fast=True)),
    )
    result_grid = tuner.fit()
    assert not any(x.error for x in result_grid)
def test_tuner_with_xgboost_trainer_driver_fail_and_resume(self):
    # So that we have some global checkpointing happening.
    os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "1"
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_driver_fail"),
        ignore_errors=True,
    )
    trainer = XGBoostTrainer(
        label_column="target",
        params={},
        # TODO(xwjiang): change when dataset out-of-band ser/des is landed.
        datasets={"train": gen_dataset_func_eager()},
    )
    # prep_v1 = StandardScaler(["worst radius", "worst area"])
    # prep_v2 = StandardScaler(["worst concavity", "worst smoothness"])
    param_space = {
        "scaling_config": {
            "num_workers": tune.grid_search([1, 2]),
        },
        # TODO(xwjiang): Add when https://github.com/ray-project/ray/issues/23363
        # is resolved.
        # "preprocessor": tune.grid_search([prep_v1, prep_v2]),
        # "datasets": {
        #     "train": tune.choice(
        #         [gen_dataset_func(), gen_dataset_func(do_shuffle=True)]
        #     ),
        # },
        "params": {
            "objective": "binary:logistic",
            "tree_method": "approx",
            "eval_metric": ["logloss", "error"],
            "eta": tune.loguniform(1e-4, 1e-1),
            "subsample": tune.uniform(0.5, 1.0),
            "max_depth": tune.randint(1, 9),
        },
    }

    class FailureInjectionCallback(Callback):
        """Inject failure at the configured iteration number."""

        def __init__(self, num_iters=10):
            self.num_iters = num_iters

        def on_step_end(self, iteration, trials, **kwargs):
            if iteration == self.num_iters:
                print(f"Failing after {self.num_iters} iters.")
                raise RuntimeError

    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(
            name="test_tuner_driver_fail", callbacks=[FailureInjectionCallback()]
        ),
        param_space=param_space,
        tune_config=TuneConfig(mode="min", metric="train-error"),
    )
    with self.assertRaises(TuneError):
        tuner.fit()

    # Test resume
    restore_path = os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_driver_fail")
    tuner = Tuner.restore(restore_path)
    # A hack before we figure out RunConfig semantics across resumes.
    tuner._local_tuner._run_config.callbacks = None
    results = tuner.fit()
    assert len(results) == 2
    trainer,
    param_space={
        "train_loop_config": {
            "lr": tune.choice([0.001, 0.01, 0.1]),
            "momentum": 0.8,
            "head_location": None,
            "worker_locations": None,
            "test_mode": args.smoke_test,
            "batch_size": 128 * num_training_workers,
            # For the long running test, we want the training to run forever,
            # and it will be terminated by the release test infra.
            "epochs": 1 if args.smoke_test else sys.maxsize,
        }
    },
    tune_config=TuneConfig(
        num_samples=4, metric="loss", mode="min", scheduler=pbt_scheduler
    ),
    run_config=RunConfig(
        stop={"training_iteration": 1} if args.smoke_test else None,
        callbacks=[
            FailureInjectorCallback(time_between_checks=90),
            ProgressCallback(),
        ],
    ),
)

results = tuner.fit()
print(results.get_best_result(metric="loss", mode="min"))
        ),
        (
            {"run_config": RunConfig(reuse_actors=True)},
            lambda kw: kw["reuse_actors"] is True,
        ),
        (
            {"run_config": RunConfig(log_to_file="some_file")},
            lambda kw: kw["log_to_file"] == "some_file",
        ),
        (
            {"tune_config": TuneConfig(max_concurrent_trials=3)},
            lambda kw: kw["max_concurrent_trials"] == 3,
        ),
        (
            {"tune_config": TuneConfig(time_budget_s=60)},
            lambda kw: kw["time_budget_s"] == 60,
        ),
    ],
)
def test_tuner_api_kwargs(params_expected):
    tuner_params, assertion = params_expected

    tuner = Tuner(lambda config: 1, **tuner_params)
        },
    },
)

tuner = Tuner(
    horovod_trainer,
    param_space={
        "train_loop_config": {
            "lr": 0.1
            if args.smoke_test
            else tune.grid_search([0.1 * i for i in range(1, 10)])
        }
    },
    tune_config=TuneConfig(
        num_samples=2 if args.smoke_test else 1,
        metric="loss",
        mode="min",
        scheduler=pbt,
    ),
    run_config=RunConfig(
        stop={"training_iteration": 1} if args.smoke_test else None,
        callbacks=[ProgressCallback()],
    ),
    _tuner_kwargs={"fail_fast": False, "keep_checkpoints_num": 1},
)
result_grid = tuner.fit()

# Make sure trials do not fail.
    # Manually iterate over the data 10 times (10 epochs).
    for _ in range(10):
        for batch in data_shard.iter_batches():
            print("Do some training on batch", batch)


trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=1),
    datasets={"train": ray.data.range_tensor(1000)},
)
param_space = {
    "scaling_config": ScalingConfig(num_workers=tune.grid_search([1, 2])),
    "params": {
        "objective": "binary:logistic",
        "tree_method": "approx",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9),
    },
}
tuner = Tuner(
    trainable=trainer,
    param_space=param_space,
    tune_config=TuneConfig(mode="min", metric="train-error"),
)
results = tuner.fit()
# __config_scaling_2_end__