def testPlacementGroupFactoryEquality(self):
    """
    Test that two different placement group factory objects are
    considered equal and evaluate to the same hash.
    """
    from collections import Counter

    pgf_1 = PlacementGroupFactory([{
        "CPU": 2,
        "GPU": 4,
        "custom": 7
    }, {
        "GPU": 2,
        "custom": 1,
        "CPU": 3
    }], "PACK", "no_name", None)

    pgf_2 = PlacementGroupFactory(
        [{
            "custom": 7,
            "GPU": 4,
            "CPU": 2,
        }, {
            "custom": 1,
            "GPU": 2,
            "CPU": 3
        }],
        strategy="PACK",
        name="no_name",
        lifetime=None)

    pgf_3 = PlacementGroupFactory(
        [{
            "custom": 7,
            "GPU": 4,
            "CPU": 2.0,
            "custom2": 0
        }, {
            "custom": 1.0,
            "GPU": 2,
            "CPU": 3,
            "custom2": 0
        }],
        strategy="PACK",
        name="no_name",
        lifetime=None)

    self.assertEqual(pgf_1, pgf_2)
    self.assertEqual(pgf_2, pgf_3)

    # Hash testing
    counter = Counter()
    counter[pgf_1] += 1
    counter[pgf_2] += 1
    counter[pgf_3] += 1

    self.assertEqual(counter[pgf_1], 3)
    self.assertEqual(counter[pgf_2], 3)
    self.assertEqual(counter[pgf_3], 3)
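# Illustrative sketch only (not Tune's actual implementation): one way to get
# the equality semantics the test above exercises is to normalize each bundle
# before hashing or comparing: drop zero-valued resources, coerce counts to
# float, and serialize with sorted keys so that key order and int/float
# spelling do not matter.
import json

def _normalize_bundles(bundles):
    # {"CPU": 2, "GPU": 4} and {"GPU": 4, "CPU": 2.0, "x": 0} normalize
    # to the same string under this scheme.
    return json.dumps(
        [{k: float(v) for k, v in b.items() if v} for b in bundles],
        sort_keys=True)

assert _normalize_bundles([{"CPU": 2, "GPU": 4, "custom": 7}]) == \
    _normalize_bundles([{"custom": 7, "GPU": 4, "CPU": 2.0, "custom2": 0}])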
def testHasResourcesForTrialWithCaching(self):
    pgm = _PlacementGroupManager()
    pgf1 = PlacementGroupFactory([{"CPU": self.head_cpus}])
    pgf2 = PlacementGroupFactory([{"CPU": self.head_cpus - 1}])

    executor = RayTrialExecutor(reuse_actors=True)
    executor._pg_manager = pgm
    executor.set_max_pending_trials(1)

    def train(config):
        yield 1
        yield 2
        yield 3
        yield 4

    register_trainable("resettable", train)

    trial1 = Trial("resettable", placement_group_factory=pgf1)
    trial2 = Trial("resettable", placement_group_factory=pgf1)
    trial3 = Trial("resettable", placement_group_factory=pgf2)

    assert executor.has_resources_for_trial(trial1)
    assert executor.has_resources_for_trial(trial2)
    assert executor.has_resources_for_trial(trial3)

    executor._stage_and_update_status([trial1, trial2, trial3])

    while not pgm.has_ready(trial1):
        time.sleep(1)
        executor._stage_and_update_status([trial1, trial2, trial3])

    # Fill staging
    executor._stage_and_update_status([trial1, trial2, trial3])

    assert executor.has_resources_for_trial(trial1)
    assert executor.has_resources_for_trial(trial2)
    assert not executor.has_resources_for_trial(trial3)

    executor._start_trial(trial1)
    executor._stage_and_update_status([trial1, trial2, trial3])
    executor.pause_trial(trial1)  # Caches the PG and removes a PG from staging

    assert len(pgm._staging_futures) == 0

    # This will re-schedule a placement group
    pgm.reconcile_placement_groups([trial1, trial2])

    assert len(pgm._staging_futures) == 1
    assert not pgm.can_stage()

    # We should still have resources for this trial as it has a cached PG
    assert executor.has_resources_for_trial(trial1)
    assert executor.has_resources_for_trial(trial2)
    assert not executor.has_resources_for_trial(trial3)
def example_resources_allocation_function(
        trial_runner: "trial_runner.TrialRunner",
        trial: Trial,
        result: Dict[str, Any],
        scheduler: "ResourceChangingScheduler",
) -> Union[None, PlacementGroupFactory, Resources]:
    """This is a basic example of a resource allocating function.

    The function naively balances available CPUs over live trials.

    This function returns a new ``PlacementGroupFactory`` with updated
    resource requirements, or None. If the returned
    ``PlacementGroupFactory`` is equal by value to the one the
    trial has currently, the scheduler will skip the update process
    internally (same with None).

    See :class:`DistributeResources` for a more complex,
    robust approach.

    Args:
        trial_runner (TrialRunner): Trial runner for this Tune run.
            Can be used to obtain information about other trials.
        trial (Trial): The trial to allocate new resources to.
        result (Dict[str, Any]): The latest results of trial.
        scheduler (ResourceChangingScheduler): The scheduler calling
            the function.
    """
    # Get base trial resources as defined in
    # ``tune.run(resources_per_trial)``
    base_trial_resource = scheduler._base_trial_resources

    # Don't bother if this is just the first iteration
    if result["training_iteration"] < 1:
        return None

    # default values if resources_per_trial is unspecified
    if base_trial_resource is None:
        base_trial_resource = PlacementGroupFactory([{"CPU": 1, "GPU": 0}])

    # Assume that the number of CPUs cannot go below what was
    # specified in tune.run
    min_cpu = base_trial_resource.required_resources.get("CPU", 0)

    # Get the number of CPUs available in total (not just free)
    total_available_cpus = (
        trial_runner.trial_executor._resource_updater.get_num_cpus())

    # Divide the free CPUs among all live trials
    cpu_to_use = max(
        min_cpu,
        total_available_cpus // len(trial_runner.get_live_trials()))

    # Assign new CPUs to the trial in a PlacementGroupFactory
    return PlacementGroupFactory([{"CPU": cpu_to_use, "GPU": 0}])
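# Sketch of how such a function is typically wired in (the same pattern
# appears in full in tune_xgboost further below). ``train_fn`` is a
# hypothetical trainable added here for illustration; the scheduler and
# tune.run arguments follow the patterns already used in this file.
from ray import tune
from ray.tune.schedulers import ResourceChangingScheduler
from ray.tune.utils.placement_groups import PlacementGroupFactory

def train_fn(config):
    tune.report(metric=1, resources=tune.get_trial_resources())

tune.run(
    train_fn,
    scheduler=ResourceChangingScheduler(
        resources_allocation_function=example_resources_allocation_function),
    resources_per_trial=PlacementGroupFactory([{"CPU": 1, "GPU": 0}]),
    num_samples=1)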
def testResourceDeadlock(self):
    """Tests that resource deadlock is avoided for heterogeneous PGFs.

    We start 4 trials in a cluster with 2 CPUs. The first two trials
    require 1 CPU each, the third trial 2 CPUs, the fourth trial 1 CPU.

    The second trial needs a bit more time to finish. This means that the
    resources from the first trial will be freed, and the PG of the
    _fourth_ trial becomes ready (not that of the third trial, because
    that requires 2 CPUs - however, one is still occupied by trial 2).

    After the first two trials have finished, the FIFOScheduler tries to
    start the third trial. However, it can't be started because its
    placement group is not ready. Instead, the placement group of the
    fourth trial is ready. Thus, we opt to run the fourth trial instead.
    """

    def train(config):
        time.sleep(config["sleep"])
        return 4

    ray.init(num_cpus=2)

    tune.register_trainable("het", train)
    pgf1 = PlacementGroupFactory([{"CPU": 1}])
    pgf2 = PlacementGroupFactory([{"CPU": 2}])

    trial1 = Trial("het", config={"sleep": 0}, placement_group_factory=pgf1)
    trial2 = Trial("het", config={"sleep": 2}, placement_group_factory=pgf1)
    trial3 = Trial("het", config={"sleep": 0}, placement_group_factory=pgf2)
    trial4 = Trial("het", config={"sleep": 0}, placement_group_factory=pgf1)

    runner = TrialRunner(fail_fast=True)
    runner.add_trial(trial1)
    runner.add_trial(trial2)
    runner.add_trial(trial3)
    runner.add_trial(trial4)

    timeout = time.monotonic() + 30
    while not runner.is_finished():
        # We enforce a timeout here
        self.assertLess(
            time.monotonic(), timeout, msg="Ran into a resource deadlock")
        runner.step()
def testPlacementGroupDistributedTraining(self, reuse_actors=False):
    """Run distributed training using placement groups.

    Each trial requests 4 CPUs and starts 4 remote training workers.
    """
    head_bundle = {"CPU": 1, "GPU": 0, "custom": 0}
    child_bundle = {"CPU": 1}

    placement_group_factory = PlacementGroupFactory(
        [head_bundle, child_bundle, child_bundle, child_bundle])

    @ray.remote
    class TrainingActor:
        def train(self, val):
            time.sleep(1)
            return val

    def train(config):
        base = config["base"]
        actors = [TrainingActor.remote() for _ in range(4)]
        futures = [
            actor.train.remote(base + 2 * i)
            for i, actor in enumerate(actors)
        ]
        results = ray.get(futures)

        end = time.time() - config["start_time"]
        tune.report(avg=np.mean(results), end=end)

    trial_executor = RayTrialExecutor(reuse_actors=reuse_actors)

    start = time.time()
    out = tune.run(
        train,
        config={
            "start_time": start,
            "base": tune.grid_search(list(range(0, 100, 10)))
        },
        resources_per_trial=placement_group_factory,
        num_samples=1,
        trial_executor=trial_executor,
        reuse_actors=reuse_actors,
        verbose=2)

    avgs = sorted(t.last_result["avg"] for t in out.trials)
    self.assertSequenceEqual(avgs, list(range(3, 103, 10)))

    trial_end_times = sorted(t.last_result["end"] for t in out.trials)
    print("Trial end times:", trial_end_times)
    max_diff = trial_end_times[-1] - trial_end_times[0]

    # Not all trials have been run in parallel
    self.assertGreater(max_diff, 3)

    # Some trials should have run in parallel
    # Todo: Re-enable when using buildkite
    # self.assertLess(max_diff, 10)

    self._assertCleanup(trial_executor)
def default_resource_request(cls, config):
    head_bundle = {"CPU": 1, "GPU": 0}
    child_bundle = {"CPU": 1}

    return PlacementGroupFactory(
        [head_bundle, child_bundle, child_bundle],
        strategy=config["placement_strategy"],
    )
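# Context sketch for the classmethod above, assuming the Ray 1.x API: Tune
# calls ``default_resource_request(config)`` to build the trial's bundles
# when no ``resources_per_trial`` is passed to ``tune.run``. The class name
# and the trivial ``step`` are illustrative assumptions.
from ray import tune
from ray.tune.utils.placement_groups import PlacementGroupFactory

class MyTrainable(tune.Trainable):
    @classmethod
    def default_resource_request(cls, config):
        head_bundle = {"CPU": 1, "GPU": 0}
        child_bundle = {"CPU": 1}
        return PlacementGroupFactory(
            [head_bundle, child_bundle, child_bundle],
            strategy=config["placement_strategy"])

    def step(self):
        # Trivial training step so the sketch runs end to end.
        return {"metric": 1}

tune.run(
    MyTrainable,
    config={"placement_strategy": "PACK"},
    stop={"training_iteration": 1})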
def _get_resources_from_bundles(
        self, bundles: List[Dict[str, float]]) -> Dict[str, float]:
    """Get total sums of resources in bundles"""
    if not bundles:
        return {"CPU": 0, "GPU": 0}
    pgf = PlacementGroupFactory(bundles)
    return pgf.required_resources
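# Minimal pure-Python sketch of the summation semantics the helper above
# relies on: ``PlacementGroupFactory.required_resources`` totals each
# resource key across all bundles. ``sum_bundles`` is an illustrative
# stand-in, not Tune's implementation.
from collections import defaultdict

def sum_bundles(bundles):
    # Sum each resource key across all bundles.
    totals = defaultdict(float)
    for bundle in bundles:
        for key, amount in bundle.items():
            totals[key] += amount
    return dict(totals)

assert sum_bundles([{"CPU": 1}, {"CPU": 2, "GPU": 1}]) == \
    {"CPU": 3.0, "GPU": 1.0}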
def default_resource_request(cls, config):
    cf = dict(cls.get_default_config(), **config)

    eval_config = cf["evaluation_config"]

    # Return PlacementGroupFactory containing all needed resources
    # (already properly defined as device bundles).
    return PlacementGroupFactory(
        bundles=[{
            # Local worker + replay buffer actors.
            # Force replay buffers to be on same node to maximize
            # data bandwidth between buffers and the learner (driver).
            # Replay buffer actors each contain one shard of the total
            # replay buffer and use 1 CPU each.
            "CPU": cf["num_cpus_for_driver"] +
            cf["optimizer"]["num_replay_buffer_shards"],
            "GPU": 0 if cf["_fake_gpus"] else cf["num_gpus"],
        }] + [
            {
                # RolloutWorkers.
                "CPU": cf["num_cpus_per_worker"],
                "GPU": cf["num_gpus_per_worker"],
            } for _ in range(cf["num_workers"])
        ] + ([
            {
                # Evaluation workers.
                # Note: The local eval worker is located on the driver
                # CPU.
                "CPU": eval_config.get("num_cpus_per_worker",
                                       cf["num_cpus_per_worker"]),
                "GPU": eval_config.get("num_gpus_per_worker",
                                       cf["num_gpus_per_worker"]),
            } for _ in range(cf["evaluation_num_workers"])
        ] if cf["evaluation_interval"] else []),
        strategy=config.get("placement_strategy", "PACK"))
def testResourcesAvailableWithPlacementGroup(self):
    def train(config):
        tune.report(metric=0, resources=ray.available_resources())

    head_bundle = {"CPU": 1, "GPU": 0, "custom": 4}
    child_bundle = {"CPU": 2, "GPU": 1, "custom": 3}

    placement_group_factory = PlacementGroupFactory(
        [head_bundle, child_bundle, child_bundle])

    out = tune.run(train, resources_per_trial=placement_group_factory)

    available = {
        key: val
        for key, val in out.trials[0].last_result["resources"].items()
        if key in ["CPU", "GPU", "custom"]
    }

    if not available:
        self.skipTest("Warning: Ray reported no available resources, "
                      "but this is an error on the Ray core side. "
                      "Skipping this test for now.")

    self.assertDictEqual(
        available, {
            "CPU": self.head_cpus - 5.0,
            "GPU": self.head_gpus - 2.0,
            "custom": self.head_custom - 10.0,
        })
def default_resource_request(cls, config: Dict) -> PlacementGroupFactory:
    return PlacementGroupFactory([{}] + [{
        "CPU": cls._num_cpus_per_worker,
        "GPU": cls._num_gpus_per_worker
    }] * num_workers)
def testWandbMixinConfig(self):
    config = {"par1": 4, "par2": 9.12345678}
    trial = Trial(
        config,
        0,
        "trial_0",
        "trainable",
        PlacementGroupFactory([{
            "CPU": 1
        }]))
    trial_info = TrialInfo(trial)

    config[TRIAL_INFO] = trial_info

    if WANDB_ENV_VAR in os.environ:
        del os.environ[WANDB_ENV_VAR]

    # Needs at least a project
    with self.assertRaises(ValueError):
        trainable = WandbTestTrainable(config)

    # No API key
    config["wandb"] = {"project": "test_project"}
    with self.assertRaises(ValueError):
        trainable = WandbTestTrainable(config)

    # API Key in config
    config["wandb"] = {"project": "test_project", "api_key": "1234"}
    trainable = WandbTestTrainable(config)
    self.assertEqual(os.environ[WANDB_ENV_VAR], "1234")

    del os.environ[WANDB_ENV_VAR]

    # API Key file
    with tempfile.NamedTemporaryFile("wt") as fp:
        fp.write("5678")
        fp.flush()

        config["wandb"] = {
            "project": "test_project",
            "api_key_file": fp.name
        }

        trainable = WandbTestTrainable(config)
        self.assertEqual(os.environ[WANDB_ENV_VAR], "5678")

    del os.environ[WANDB_ENV_VAR]

    # API Key in env
    os.environ[WANDB_ENV_VAR] = "9012"
    config["wandb"] = {"project": "test_project"}
    trainable = WandbTestTrainable(config)

    # From now on, the API key is in the env variable.

    # Default configuration
    config["wandb"] = {"project": "test_project"}
    config[TRIAL_INFO] = trial_info

    trainable = WandbTestTrainable(config)
    self.assertEqual(trainable.wandb.kwargs["project"], "test_project")
    self.assertEqual(trainable.wandb.kwargs["id"], trial.trial_id)
    self.assertEqual(trainable.wandb.kwargs["name"], trial.trial_name)
    self.assertEqual(trainable.wandb.kwargs["group"], "WandbTestTrainable")
def testWandbLegacyLoggerReporting(self):
    trial_config = {"par1": 4, "par2": 9.12345678}
    trial = Trial(
        trial_config,
        0,
        "trial_0",
        "trainable",
        PlacementGroupFactory([{
            "CPU": 1
        }]))

    trial_config["wandb"] = {
        "project": "test_project",
        "api_key": "1234",
        "excludes": ["metric2"],
    }
    logger = WandbTestLogger(trial_config, "/tmp", trial)

    r1 = {
        "metric1": 0.8,
        "metric2": 1.4,
        "metric3": np.asarray(32.0),
        "metric4": np.float32(32.0),
        "const": "text",
        "config": trial_config,
    }

    logger.on_result(r1)

    logged = logger.trial_process.logs.get(timeout=10)
    self.assertIn("metric1", logged)
    self.assertNotIn("metric2", logged)
    self.assertIn("metric3", logged)
    self.assertIn("metric4", logged)
    self.assertNotIn("const", logged)
    self.assertNotIn("config", logged)

    logger.close()
def testExtraResources(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 1
        },
        "placement_group_factory": PlacementGroupFactory([{
            "CPU": 1
        }, {
            "CPU": 3,
            "GPU": 1
        }]),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.PENDING)
def testWandbLoggerReporting(self):
    trial_config = {"par1": 4, "par2": 9.12345678}
    trial = Trial(
        trial_config,
        0,
        "trial_0",
        "trainable",
        PlacementGroupFactory([{
            "CPU": 1
        }]))

    logger = WandbTestExperimentLogger(
        project="test_project", api_key="1234", excludes=["metric2"])
    logger.on_trial_start(0, [], trial)

    r1 = {
        "metric1": 0.8,
        "metric2": 1.4,
        "metric3": np.asarray(32.0),
        "metric4": np.float32(32.0),
        "const": "text",
        "config": trial_config,
    }

    logger.on_trial_result(0, [], trial, r1)

    logged = logger.trial_processes[trial].logs.get(timeout=10)
    self.assertIn("metric1", logged)
    self.assertNotIn("metric2", logged)
    self.assertIn("metric3", logged)
    self.assertIn("metric4", logged)
    self.assertNotIn("const", logged)
    self.assertNotIn("config", logged)

    del logger
def testNewResources(self):
    sched = ResourceChangingScheduler(
        resources_allocation_function=(
            lambda a, b, c, d: PlacementGroupFactory([{
                "CPU": 2
            }])))

    def train(config, checkpoint_dir=None):
        tune.report(metric=1, resources=tune.get_trial_resources())

    analysis = tune.run(
        train,
        scheduler=sched,
        stop={"training_iteration": 2},
        resources_per_trial=PlacementGroupFactory([{
            "CPU": 1
        }]),
        num_samples=1)

    results_list = list(analysis.results.values())
    assert results_list[0]["resources"].head_cpus == 2.0
def test_placement_group_no_cpu_trainer():
    """Bundles with only GPU:1 but no CPU should work"""
    ray.init(num_gpus=1, num_cpus=1)
    pgf = PlacementGroupFactory([{"GPU": 1, "CPU": 0}, {"CPU": 1}])

    def train(config):
        time.sleep(1)
        return 5

    tune.run(train, resources_per_trial=pgf)
def default_resource_request(cls, config):
    cf = dict(cls.get_default_config(), **config)
    # Construct a dummy LeagueBuilder, such that it gets the opportunity
    # to adjust the multiagent config, according to its setup, and we can
    # then properly infer the resources to allocate.
    from_config(cf["league_builder_config"], trainer=None, trainer_config=cf)

    max_num_policies_to_train = cf["max_num_policies_to_train"] or len(
        cf["multiagent"].get("policies_to_train")
        or cf["multiagent"]["policies"])
    num_learner_shards = min(
        cf["num_gpus"] or max_num_policies_to_train,
        max_num_policies_to_train)
    num_gpus_per_shard = cf["num_gpus"] / num_learner_shards
    num_policies_per_shard = max_num_policies_to_train / num_learner_shards

    fake_gpus = cf["_fake_gpus"]

    eval_config = cf["evaluation_config"]

    # Return PlacementGroupFactory containing all needed resources
    # (already properly defined as device bundles).
    return PlacementGroupFactory(
        bundles=[{
            # Driver (no GPUs).
            "CPU": cf["num_cpus_for_driver"],
        }] + [
            {
                # RolloutWorkers (no GPUs).
                "CPU": cf["num_cpus_per_worker"],
            } for _ in range(cf["num_workers"])
        ] + [
            {
                # Policy learners (and replay buffer shards).
                # 1 CPU for the replay buffer.
                # 1 CPU (or a fractional GPU) for each learning policy.
                "CPU": 1 + (num_policies_per_shard if fake_gpus else 0),
                "GPU": 0 if fake_gpus else num_gpus_per_shard,
            } for _ in range(num_learner_shards)
        ] + ([
            {
                # Evaluation (remote) workers.
                # Note: The local eval worker is located on the driver
                # CPU, or not even created if there are >0 eval workers.
                "CPU": eval_config.get("num_cpus_per_worker",
                                       cf["num_cpus_per_worker"]),
            } for _ in range(cf["evaluation_num_workers"])
        ] if cf["evaluation_interval"] else []),
        strategy=config.get("placement_strategy", "PACK"),
    )
def main():
    # __tune_begin__
    from ray import tune

    # Set config
    config = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),
        "subsample": tune.uniform(0.5, 1.0),
        "max_depth": tune.randint(1, 9),
    }
    # __tune_end__

    # __tune_run_begin__
    analysis = tune.run(
        train_model,
        config=config,
        metric="eval-error",
        mode="min",
        num_samples=4,
        resources_per_trial=PlacementGroupFactory([{
            "CPU": 1.0
        }] + [{
            "CPU": float(num_cpus_per_actor)
        }] * num_actors),
    )

    # Load in the best performing model.
    best_bst = load_best_model(analysis.best_logdir)

    # Use the following code block instead if using Ray Client.
    # import ray
    # if ray.util.client.ray.is_connected():
    #     # If using Ray Client best_logdir is a directory on the server.
    #     # So we want to make sure we wrap model loading in a task.
    #     remote_load_fn = ray.remote(load_best_model)
    #     best_bst = ray.get(remote_load_fn.remote(analysis.best_logdir))

    # Do something with the best model.
    _ = best_bst

    accuracy = 1.0 - analysis.best_result["eval-error"]
    print(f"Best model parameters: {analysis.best_config}")
    print(f"Best model total accuracy: {accuracy:.4f}")
def default_resource_request(cls, config):
    cf = dict(cls.get_default_config(), **config)

    eval_config = cf["evaluation_config"]

    # Return PlacementGroupFactory containing all needed resources
    # (already properly defined as device bundles).
    return PlacementGroupFactory(
        bundles=[{
            # Driver + Aggregation Workers:
            # Force to be on same node to maximize data bandwidth
            # between aggregation workers and the learner (driver).
            # Aggregation workers tree-aggregate experiences collected
            # from RolloutWorkers (n rollout workers map to m
            # aggregation workers, where m < n) and always use 1 CPU
            # each.
            "CPU": cf["num_cpus_for_driver"] +
            cf["num_aggregation_workers"],
            "GPU": 0 if cf["_fake_gpus"] else cf["num_gpus"],
        }] + [
            {
                # RolloutWorkers.
                "CPU": cf["num_cpus_per_worker"],
                "GPU": cf["num_gpus_per_worker"],
                **cf["custom_resources_per_worker"],
            } for _ in range(cf["num_workers"])
        ] + ([
            {
                # Evaluation (remote) workers.
                # Note: The local eval worker is located on the driver
                # CPU, or not even created if there are >0 eval workers.
                "CPU": eval_config.get("num_cpus_per_worker",
                                       cf["num_cpus_per_worker"]),
                "GPU": eval_config.get("num_gpus_per_worker",
                                       cf["num_gpus_per_worker"]),
                **eval_config.get(
                    "custom_resources_per_worker",
                    cf["custom_resources_per_worker"],
                ),
            } for _ in range(cf["evaluation_num_workers"])
        ] if cf["evaluation_interval"] else []),
        strategy=config.get("placement_strategy", "PACK"),
    )
def test_default_resource_request_plus_manual_leads_to_error(self):
    config = DEFAULT_CONFIG.copy()
    config["model"]["fcnet_hiddens"] = [10]
    config["num_workers"] = 0
    config["env"] = "CartPole-v0"

    # The trainer defines its own default resource request, so passing
    # `resources_per_trial` on top of that must raise a ValueError.
    # (The original try/except would silently pass if no error was
    # raised; assertRaisesRegex makes the expectation explicit.)
    with self.assertRaisesRegex(ValueError,
                                "have been automatically set to"):
        tune.run(
            "PG",
            config=config,
            stop={"training_iteration": 2},
            resources_per_trial=PlacementGroupFactory([{
                "CPU": 1
            }]),
            verbose=2,
        )
def testExtraCustomResources(self):
    ray.init(num_cpus=4, num_gpus=2, resources={"a": 2})

    # Since each trial will occupy the full custom resources,
    # there is at most 1 trial running at any given moment.
    snapshot = TrialStatusSnapshot()
    runner = TrialRunner(callbacks=[TrialStatusSnapshotTaker(snapshot)])
    kwargs = {
        "stopping_criterion": {"training_iteration": 1},
        "placement_group_factory": PlacementGroupFactory(
            [{"CPU": 1}, {"a": 2}]),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    self.assertLess(snapshot.max_running_trials(), 2)
    self.assertTrue(snapshot.all_trials_are_terminated())
def testExtraResources(self):
    ray.init(num_cpus=4, num_gpus=2)

    snapshot = TrialStatusSnapshot()
    runner = TrialRunner(callbacks=[TrialStatusSnapshotTaker(snapshot)])
    kwargs = {
        "stopping_criterion": {"training_iteration": 1},
        "placement_group_factory": PlacementGroupFactory(
            [{"CPU": 1}, {"CPU": 3, "GPU": 1}]),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    self.assertLess(snapshot.max_running_trials(), 2)
    self.assertTrue(snapshot.all_trials_are_terminated())
def ray_resource_allocation_function(
        trial_runner: "trial_runner.TrialRunner",  # noqa
        trial: "Trial",  # noqa
        result: Dict[str, Any],
        scheduler: "ResourceChangingScheduler",
):
    """Determine resources to allocate to running trials."""
    pgf = DistributeResources(trial_runner, trial, result, scheduler)
    # restore original base trial resources
    # create bundles
    if scheduler.base_trial_resources.required_resources.get("GPU", 0):
        bundles = [{"CPU": 1, "GPU": 1}] * int(pgf.required_resources["GPU"])
    else:
        bundles = [{"CPU": 1}] * int(pgf.required_resources["CPU"] - 0.001)
    # we can't set Trial actor's CPUs to 0 so we just go very low
    bundles = [{"CPU": 0.001}] + bundles
    pgf = PlacementGroupFactory(bundles)
    return pgf
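# Worked example of the bundle construction above, with a hypothetical
# CPU-only allocation of 4 CPUs: the head actor gets a token 0.001 CPU and
# the remaining whole CPUs become worker bundles (the -0.001 keeps the
# total within budget when the integer part is taken).
cpu_budget = 4
bundles = [{"CPU": 0.001}] + [{"CPU": 1}] * int(cpu_budget - 0.001)
assert bundles == [{"CPU": 0.001}, {"CPU": 1}, {"CPU": 1}, {"CPU": 1}]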
def default_resource_request(cls, config):
    cf = dict(cls._default_config, **config)

    Trainer._validate_config(cf)

    eval_config = cf["evaluation_config"]

    # Return PlacementGroupFactory containing all needed resources
    # (already properly defined as device bundles).
    return PlacementGroupFactory(
        bundles=[{
            # Driver + Aggregation Workers:
            # Force to be on same node to maximize data bandwidth
            # between aggregation workers and the learner (driver).
            # Aggregation workers tree-aggregate experiences collected
            # from RolloutWorkers (n rollout workers map to m
            # aggregation workers, where m < n) and always use 1 CPU
            # each.
            "CPU": cf["num_cpus_for_driver"] +
            cf["num_aggregation_workers"],
            "GPU": cf["num_gpus"]
        }] + [
            {
                # RolloutWorkers.
                "CPU": cf["num_cpus_per_worker"],
                "GPU": cf["num_gpus_per_worker"],
            } for _ in range(cf["num_workers"])
        ] + ([
            {
                # Evaluation workers (+1 b/c of the additional local
                # worker)
                "CPU": eval_config.get("num_cpus_per_worker",
                                       cf["num_cpus_per_worker"]),
                "GPU": eval_config.get("num_gpus_per_worker",
                                       cf["num_gpus_per_worker"]),
            } for _ in range(cf["evaluation_num_workers"] + 1)
        ] if cf["evaluation_interval"] else []),
        strategy=config.get("placement_strategy", "PACK"))
def tune_xgboost(use_class_trainable=True):
    search_space = {
        # You can mix constants with search space objects.
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "max_depth": 9,
        "learning_rate": 1,
        "min_child_weight": tune.grid_search([2, 3]),
        "subsample": tune.grid_search([0.8, 0.9]),
        "colsample_bynode": tune.grid_search([0.8, 0.9]),
        "random_state": 1,
        "num_parallel_tree": 2000,
    }
    # This will enable aggressive early stopping of bad trials.
    base_scheduler = ASHAScheduler(
        max_t=16,  # 16 training iterations
        grace_period=1,
        reduction_factor=2)

    def example_resources_allocation_function(
            trial_runner: "trial_runner.TrialRunner",
            trial: Trial,
            result: Dict[str, Any],
            scheduler: "ResourceChangingScheduler",
    ) -> Union[None, PlacementGroupFactory, Resources]:
        """This is a basic example of a resource allocating function.

        The function naively balances available CPUs over live trials.

        This function returns a new ``PlacementGroupFactory`` with updated
        resource requirements, or None. If the returned
        ``PlacementGroupFactory`` is equal by value to the one the
        trial has currently, the scheduler will skip the update process
        internally (same with None).

        See :class:`DistributeResources` for a more complex,
        robust approach.

        Args:
            trial_runner (TrialRunner): Trial runner for this Tune run.
                Can be used to obtain information about other trials.
            trial (Trial): The trial to allocate new resources to.
            result (Dict[str, Any]): The latest results of trial.
            scheduler (ResourceChangingScheduler): The scheduler calling
                the function.
        """
        # Get base trial resources as defined in
        # ``tune.run(resources_per_trial)``
        base_trial_resource = scheduler._base_trial_resources

        # Don't bother if this is just the first iteration
        if result["training_iteration"] < 1:
            return None

        # default values if resources_per_trial is unspecified
        if base_trial_resource is None:
            base_trial_resource = PlacementGroupFactory([{
                "CPU": 1,
                "GPU": 0
            }])

        # Assume that the number of CPUs cannot go below what was
        # specified in tune.run
        min_cpu = base_trial_resource.required_resources.get("CPU", 0)

        # Get the number of CPUs available in total (not just free)
        total_available_cpus = trial_runner.trial_executor._avail_resources.cpu

        # Divide the free CPUs among all live trials
        cpu_to_use = max(
            min_cpu,
            total_available_cpus // len(trial_runner.get_live_trials()))

        # Assign new CPUs to the trial in a PlacementGroupFactory
        return PlacementGroupFactory([{"CPU": cpu_to_use, "GPU": 0}])

    # You can either define your own resources_allocation_function, or
    # use the default one - DistributeResources
    # from ray.tune.schedulers.resource_changing_scheduler import \
    #     DistributeResources

    scheduler = ResourceChangingScheduler(
        base_scheduler=base_scheduler,
        resources_allocation_function=example_resources_allocation_function
        # resources_allocation_function=DistributeResources()  # default
    )

    if use_class_trainable:
        fn = BreastCancerTrainable
    else:
        fn = train_breast_cancer

    analysis = tune.run(
        fn,
        metric="eval-logloss",
        mode="min",
        resources_per_trial=PlacementGroupFactory([{
            "CPU": 1,
            "GPU": 0
        }]),
        config=search_space,
        num_samples=1,
        scheduler=scheduler,
        checkpoint_at_end=use_class_trainable,
    )

    if use_class_trainable:
        assert analysis.results_df["nthread"].max() > 1

    return analysis
def execute(
    self,
    config,
    dataset=None,
    training_set=None,
    validation_set=None,
    test_set=None,
    training_set_metadata=None,
    data_format=None,
    experiment_name="hyperopt",
    model_name="run",
    # model_load_path=None,
    # model_resume_path=None,
    skip_save_training_description=False,
    skip_save_training_statistics=False,
    skip_save_model=False,
    skip_save_progress=False,
    skip_save_log=False,
    skip_save_processed_input=True,
    skip_save_unprocessed_output=False,
    skip_save_predictions=False,
    skip_save_eval_stats=False,
    output_directory="results",
    gpus=None,
    gpu_memory_limit=None,
    allow_parallel_threads=True,
    callbacks=None,
    backend=None,
    random_seed=default_random_seed,
    debug=False,
    **kwargs,
) -> RayTuneResults:
    if isinstance(dataset, str) and not has_remote_protocol(dataset) and not os.path.isabs(dataset):
        dataset = os.path.abspath(dataset)

    if isinstance(backend, str):
        backend = initialize_backend(backend)

    if gpus is not None:
        raise ValueError(
            "Parameter `gpus` is not supported when using Ray Tune. "
            "Configure GPU resources with Ray and set `gpu_resources_per_trial` in your "
            "hyperopt config."
        )

    if gpu_memory_limit is None and 0 < self._gpu_resources_per_trial_non_none < 1:
        # Enforce fractional GPU utilization
        gpu_memory_limit = self.gpu_resources_per_trial

    hyperopt_dict = dict(
        config=config,
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        # model_load_path=model_load_path,
        # model_resume_path=model_resume_path,
        eval_split=self.split,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        callbacks=callbacks,
        backend=backend,
        random_seed=random_seed,
        debug=debug,
    )

    mode = "min" if self.goal != MAXIMIZE else "max"
    metric = "metric_score"
    if self.search_alg_dict is not None:
        if TYPE not in self.search_alg_dict:
            logger.warning("WARNING: Kindly set type param for search_alg "
                           "to utilize Tune's Search Algorithms.")
            search_alg = None
        else:
            search_alg_type = self.search_alg_dict[TYPE]
            search_alg = tune.create_searcher(search_alg_type, metric=metric, mode=mode, **self.search_alg_dict)
    else:
        search_alg = None

    if self.max_concurrent_trials:
        assert (
            self.max_concurrent_trials > 0
        ), f"`max_concurrent_trials` must be greater than 0, got {self.max_concurrent_trials}"
        if isinstance(search_alg, BasicVariantGenerator) or search_alg is None:
            search_alg = BasicVariantGenerator(max_concurrent=self.max_concurrent_trials)
        elif isinstance(search_alg, ConcurrencyLimiter):
            raise ValueError(
                "You have specified `max_concurrent_trials`, but the search "
                "algorithm is already a `ConcurrencyLimiter`. FIX THIS "
                "by setting `max_concurrent_trials=None`."
            )
        else:
            search_alg = ConcurrencyLimiter(search_alg, max_concurrent=self.max_concurrent_trials)

    resources_per_trial = {
        "cpu": self._cpu_resources_per_trial_non_none,
        "gpu": self._gpu_resources_per_trial_non_none,
    }

    def run_experiment_trial(config, local_hyperopt_dict, checkpoint_dir=None):
        return self._run_experiment(
            config, checkpoint_dir, local_hyperopt_dict, self.decode_ctx, _is_ray_backend(backend)
        )

    tune_config = {}
    tune_callbacks = []
    for callback in callbacks or []:
        run_experiment_trial, tune_config = callback.prepare_ray_tune(
            run_experiment_trial,
            tune_config,
            tune_callbacks,
        )

    if _is_ray_backend(backend):
        # we can't set Trial actor's CPUs to 0 so we just go very low
        resources_per_trial = PlacementGroupFactory(
            [{"CPU": 0.001}] + ([{"CPU": 1, "GPU": 1}] * self._gpu_resources_per_trial_non_none)
            if self._gpu_resources_per_trial_non_none
            else [{"CPU": 0.001}] + [{"CPU": 1}] * self._cpu_resources_per_trial_non_none
        )

    if has_remote_protocol(output_directory):
        run_experiment_trial = tune.durable(run_experiment_trial)
        self.sync_config = tune.SyncConfig(sync_to_driver=False, upload_dir=output_directory)
        output_directory = None
    elif self.kubernetes_namespace:
        from ray.tune.integration.kubernetes import NamespacedKubernetesSyncer

        self.sync_config = tune.SyncConfig(sync_to_driver=NamespacedKubernetesSyncer(self.kubernetes_namespace))

    run_experiment_trial_params = tune.with_parameters(run_experiment_trial, local_hyperopt_dict=hyperopt_dict)
    register_trainable(f"trainable_func_f{hash_dict(config).decode('ascii')}", run_experiment_trial_params)

    analysis = tune.run(
        f"trainable_func_f{hash_dict(config).decode('ascii')}",
        config={
            **self.search_space,
            **tune_config,
        },
        scheduler=self.scheduler,
        search_alg=search_alg,
        num_samples=self.num_samples,
        keep_checkpoints_num=1,
        max_failures=1,  # retry a trial failure once
        resources_per_trial=resources_per_trial,
        time_budget_s=self.time_budget_s,
        sync_config=self.sync_config,
        local_dir=output_directory,
        metric=metric,
        mode=mode,
        trial_name_creator=lambda trial: f"trial_{trial.trial_id}",
        trial_dirname_creator=lambda trial: f"trial_{trial.trial_id}",
        callbacks=tune_callbacks,
    )

    if "metric_score" in analysis.results_df.columns:
        ordered_trials = analysis.results_df.sort_values("metric_score", ascending=self.goal != MAXIMIZE)

        # Catch nans in edge case where the trial doesn't complete
        temp_ordered_trials = []
        for kwargs in ordered_trials.to_dict(orient="records"):
            for key in ["parameters", "training_stats", "eval_stats"]:
                if isinstance(kwargs[key], float):
                    kwargs[key] = {}
            temp_ordered_trials.append(kwargs)

        # Trials w/empty eval_stats fields & non-empty training_stats fields ran intermediate
        # tune.report call(s) but were terminated before reporting eval_stats from post-train
        # evaluation (e.g., trial stopped due to time budget or relatively poor performance.)
        # For any such trials, run model evaluation for the best model in that trial & record
        # results in ordered_trials which is returned & is persisted in hyperopt_statistics.json.
        for trial in temp_ordered_trials:
            if trial["eval_stats"] == "{}" and trial["training_stats"] != "{}":
                # Evaluate the best model on the eval_split, which is validation_set
                if validation_set is not None and validation_set.size > 0:
                    trial_path = trial["trial_dir"]
                    best_model_path = self._get_best_model_path(trial_path, analysis)
                    if best_model_path is not None:
                        self._evaluate_best_model(
                            trial,
                            trial_path,
                            best_model_path,
                            validation_set,
                            data_format,
                            skip_save_unprocessed_output,
                            skip_save_predictions,
                            skip_save_eval_stats,
                            gpus,
                            gpu_memory_limit,
                            allow_parallel_threads,
                            backend,
                            debug,
                        )
                    else:
                        logger.warning("Skipping evaluation as no model checkpoints were available")
                else:
                    logger.warning("Skipping evaluation as no validation set was provided")

        ordered_trials = [TrialResults.from_dict(load_json_values(kwargs)) for kwargs in temp_ordered_trials]
    else:
        logger.warning("No trials reported results; check if time budget lower than epoch latency")
        ordered_trials = []

    return RayTuneResults(ordered_trials=ordered_trials, experiment_analysis=analysis)
def __call__(
        self,
        trial_runner: "trial_runner.TrialRunner",
        trial: Trial,
        result: Dict[str, Any],
        scheduler: "ResourceChangingScheduler",
) -> Union[None, PlacementGroupFactory]:
    # Get base trial resources as defined in
    # ``tune.run(resources_per_trial)``
    base_trial_resource = scheduler.base_trial_resources

    if not isinstance(base_trial_resource, PlacementGroupFactory):
        raise ValueError("evenly_distribute_cpus_gpus only supports"
                         " PlacementGroupFactories.")

    # Don't bother if this is just the first iteration
    if result["training_iteration"] < 1:
        return None

    # default values if resources_per_trial is unspecified
    if base_trial_resource is None:
        base_trial_resource = PlacementGroupFactory([{"CPU": 1, "GPU": 0}])

    # Assume that the number of CPUs and GPUs can't go below
    # what was specified in tune.run
    min_cpu = base_trial_resource.required_resources.get("CPU", 0)
    min_gpu = base_trial_resource.required_resources.get("GPU", 0)

    min_cpu_bundle = base_trial_resource.bundles[0].get("CPU", 0)
    min_gpu_bundle = base_trial_resource.bundles[0].get("GPU", 0)

    # Get the number of CPUs and GPUs available in total (not just free)
    total_available_cpus = (
        trial_runner.trial_executor._avail_resources.cpu)
    total_available_gpus = (
        trial_runner.trial_executor._avail_resources.gpu)

    # Set upper limits for resources based on the number of live trials
    # to ensure that a trial cannot get more resources than it is
    # possible to run with
    num_running_trials = len(trial_runner.get_live_trials())
    if min_cpu == 0:
        upper_cpu_limit = 0
    else:
        upper_cpu_limit = math.ceil(total_available_cpus / num_running_trials)
        # Round to nearest bundle minimum
        # eg. 8 CPUs between 3 trials with min 2 CPUs per bundle
        # -> 4, 2, 2
        if self.add_bundles:
            upper_cpu_limit = math.ceil(
                upper_cpu_limit / min_cpu_bundle) * min_cpu_bundle
        upper_cpu_limit = max(min_cpu, upper_cpu_limit)

    if min_gpu == 0:
        upper_gpu_limit = 0
    else:
        upper_gpu_limit = math.ceil(total_available_gpus / num_running_trials)
        # Ensure we don't go below the per-bundle minimum. Note: the
        # rounding uses the GPU bundle size (the original code divided
        # by the CPU bundle size here, which was a bug).
        if self.add_bundles:
            upper_gpu_limit = math.ceil(
                upper_gpu_limit / min_gpu_bundle) * min_gpu_bundle
        upper_gpu_limit = max(min_gpu, upper_gpu_limit)

    # Function to check how many CPUs and GPUs a trial is using currently
    def get_used_cpus_and_gpus(t: Trial):
        return (t.placement_group_factory.required_resources.get("CPU", 0),
                t.placement_group_factory.required_resources.get("GPU", 0))

    # Check how many CPUs and GPUs are currently being used by this trial
    trial_used_cpus, trial_used_gpus = get_used_cpus_and_gpus(trial)

    # Check how many CPUs and GPUs are currently being used by live trials
    used_cpus_and_gpus = [
        get_used_cpus_and_gpus(t) for t in trial_runner.get_live_trials()
    ]
    used_cpus, used_gpus = zip(*used_cpus_and_gpus)
    used_cpus = sum(used_cpus)
    used_gpus = sum(used_gpus)

    # Calculate how many free CPUs and GPUs there are
    free_cpus = total_available_cpus - used_cpus
    free_gpus = total_available_gpus - used_gpus

    # Add free CPUs and GPUs enforcing upper and lower limits
    new_cpu = min(upper_cpu_limit, max(trial_used_cpus + free_cpus, min_cpu))
    new_gpu = min(upper_gpu_limit, max(trial_used_gpus + free_gpus, min_gpu))

    # Assign new CPUs and GPUs to the trial in a PlacementGroupFactory
    # If self.add_bundles, make new bundles out of the resources
    if self.add_bundles:
        # GPU counts are divided by the GPU bundle size (the original
        # code divided by the CPU bundle size, which was a bug).
        if min_cpu_bundle and min_gpu_bundle:
            multiplier = min(new_cpu // min_cpu_bundle,
                             new_gpu // min_gpu_bundle)
        elif min_gpu_bundle:
            multiplier = new_gpu // min_gpu_bundle
        else:
            multiplier = new_cpu // min_cpu_bundle
        new_bundles = [{
            "CPU": min_cpu_bundle,
            "GPU": min_gpu_bundle
        }] * int(multiplier)
    # Otherwise, just put them all in one bundle
    else:
        new_bundles = [{"CPU": new_cpu, "GPU": new_gpu}]
    return PlacementGroupFactory(new_bundles)
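# Worked example of the upper-limit rounding used above: 8 CPUs across 3
# live trials with a 2-CPU bundle minimum. ceil(8 / 3) = 3, rounded up to
# the next multiple of 2 gives an upper limit of 4 CPUs per trial, hence
# the 4 / 2 / 2 split mentioned in the comment.
import math

total_available_cpus, num_running_trials, min_cpu_bundle = 8, 3, 2
upper = math.ceil(total_available_cpus / num_running_trials)  # -> 3
upper = math.ceil(upper / min_cpu_bundle) * min_cpu_bundle    # -> 4
assert upper == 4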
def testPlacementGroupRequests(self, reuse_actors=False, scheduled=10):
    """In this test we try to start 10 trials but only have resources
    for 2. Placement groups should still be created and PENDING.

    Eventually they should be scheduled sequentially (i.e. in pairs
    of two)."""

    def train(config):
        time.sleep(1)
        now = time.time()
        tune.report(end=now - config["start_time"])

    head_bundle = {"CPU": 4, "GPU": 0, "custom": 0}
    child_bundle = {"custom": 1}

    placement_group_factory = PlacementGroupFactory(
        [head_bundle, child_bundle, child_bundle])

    trial_executor = RayTrialExecutor(reuse_actors=reuse_actors)

    this = self

    class _TestCallback(Callback):
        def on_step_end(self, iteration, trials, **info):
            num_finished = len([
                t for t in trials if t.status == Trial.TERMINATED
                or t.status == Trial.ERROR
            ])

            num_staging = sum(
                len(s)
                for s in trial_executor._pg_manager._staging.values())
            num_ready = sum(
                len(s) for s in trial_executor._pg_manager._ready.values())
            num_in_use = len(trial_executor._pg_manager._in_use_pgs)
            num_cached = len(trial_executor._pg_manager._cached_pgs)

            total_num_tracked = num_staging + num_ready + \
                num_in_use + num_cached

            num_non_removed_pgs = len([
                p for pid, p in placement_group_table().items()
                if p["state"] != "REMOVED"
            ])
            num_removal_scheduled_pgs = len(
                trial_executor._pg_manager._pgs_for_removal)

            # All trials should be scheduled
            this.assertEqual(
                scheduled,
                min(scheduled, len(trials)),
                msg=f"Num trials iter {iteration}")

            # The number of PGs should decrease when trials finish
            this.assertEqual(
                max(scheduled, len(trials)) - num_finished,
                total_num_tracked,
                msg=f"Num tracked iter {iteration}")

            # The number of actual placement groups should match this
            this.assertEqual(
                max(scheduled, len(trials)) - num_finished,
                num_non_removed_pgs - num_removal_scheduled_pgs,
                msg=f"Num actual iter {iteration}")

    start = time.time()
    out = tune.run(
        train,
        config={"start_time": start},
        resources_per_trial=placement_group_factory,
        num_samples=10,
        trial_executor=trial_executor,
        callbacks=[_TestCallback()],
        reuse_actors=reuse_actors,
        verbose=2)

    trial_end_times = sorted(t.last_result["end"] for t in out.trials)
    print("Trial end times:", trial_end_times)
    max_diff = trial_end_times[-1] - trial_end_times[0]

    # Not all trials have been run in parallel
    self.assertGreater(max_diff, 3)

    # Some trials should have run in parallel
    # Todo: Re-enable when using buildkite
    # self.assertLess(max_diff, 10)

    self._assertCleanup(trial_executor)
        T2 = SampleBatch.concat_samples(
            ray.get([w.sample.remote() for w in workers]))

        # Improve the policy using the T1 batch
        policy.learn_on_batch(T1)

        # Do some arbitrary updates based on the T2 batch
        policy.update_some_value(sum(T2["rewards"]))

        reporter(**collect_metrics(remote_workers=workers))


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None)

    tune.run(
        training_workflow,
        resources_per_trial=PlacementGroupFactory(([{
            "CPU": 1,
            "GPU": 1 if args.gpu else 0
        }] + [{
            "CPU": 1
        }] * args.num_workers)),
        config={
            "num_workers": args.num_workers,
            "num_iters": args.num_iters,
        },
        verbose=1,
    )
def testWandbLoggerConfig(self):
    trial_config = {"par1": 4, "par2": 9.12345678}
    trial = Trial(
        trial_config,
        0,
        "trial_0",
        "trainable",
        PlacementGroupFactory([{
            "CPU": 1
        }]))

    if WANDB_ENV_VAR in os.environ:
        del os.environ[WANDB_ENV_VAR]

    # No API key
    with self.assertRaises(ValueError):
        logger = WandbTestExperimentLogger(project="test_project")
        logger.setup()

    # API Key in config
    logger = WandbTestExperimentLogger(
        project="test_project", api_key="1234")
    logger.setup()
    self.assertEqual(os.environ[WANDB_ENV_VAR], "1234")

    del logger
    del os.environ[WANDB_ENV_VAR]

    # API Key file
    with tempfile.NamedTemporaryFile("wt") as fp:
        fp.write("5678")
        fp.flush()

        logger = WandbTestExperimentLogger(
            project="test_project", api_key_file=fp.name)
        logger.setup()
        self.assertEqual(os.environ[WANDB_ENV_VAR], "5678")
        del logger

    del os.environ[WANDB_ENV_VAR]

    # API Key in env
    os.environ[WANDB_ENV_VAR] = "9012"
    logger = WandbTestExperimentLogger(project="test_project")
    logger.setup()
    del logger

    # From now on, the API key is in the env variable.
    logger = WandbTestExperimentLogger(project="test_project")
    logger.log_trial_start(trial)

    self.assertEqual(logger.trial_processes[trial].kwargs["project"],
                     "test_project")
    self.assertEqual(logger.trial_processes[trial].kwargs["id"],
                     trial.trial_id)
    self.assertEqual(logger.trial_processes[trial].kwargs["name"],
                     trial.trial_name)
    self.assertEqual(logger.trial_processes[trial].kwargs["group"],
                     trial.trainable_name)
    self.assertIn("config", logger.trial_processes[trial]._exclude)

    del logger

    # log config.
    logger = WandbTestExperimentLogger(
        project="test_project", log_config=True)
    logger.log_trial_start(trial)
    self.assertNotIn("config", logger.trial_processes[trial]._exclude)
    self.assertNotIn("metric", logger.trial_processes[trial]._exclude)

    del logger

    # Exclude metric.
    logger = WandbTestExperimentLogger(
        project="test_project", excludes=["metric"])
    logger.log_trial_start(trial)
    self.assertIn("config", logger.trial_processes[trial]._exclude)
    self.assertIn("metric", logger.trial_processes[trial]._exclude)

    del logger