def testCheckpointing(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 1},
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    runner.step()  # Start trial
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)

    runner.step()  # Process result, dispatch save
    runner.step()  # Process save, stop trial
    kwargs["restore_path"] = trials[0].checkpoint.dir_or_data
    self.assertEqual(trials[0].status, Trial.TERMINATED)

    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()  # Start trial, dispatch restore
    self.assertEqual(trials[1].status, Trial.RUNNING)

    runner.step()  # Process restore
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[1].runner.get_info.remote()), 1)
    self.addCleanup(shutil.rmtree, trials[0].checkpoint.dir_or_data)

def testTrialErrorResumeFalse(self):
    ray.init(num_cpus=3, local_mode=True, include_dashboard=False)
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir)
    kwargs = {
        "stopping_criterion": {"training_iteration": 4},
        "resources": Resources(cpu=1, gpu=0),
    }
    trials = [
        Trial("__fake", config={"mock_error": True}, **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    runner.checkpoint(force=True)

    assert trials[0].status == Trial.ERROR
    del runner

    new_runner = TrialRunner(resume=True, local_checkpoint_dir=self.tmpdir)
    assert len(new_runner.get_trials()) == 3
    assert Trial.ERROR in (t.status for t in new_runner.get_trials())

def testHasResourcesForTrialWithCaching(self):
    pgm = _PlacementGroupManager()
    pgf1 = PlacementGroupFactory([{"CPU": self.head_cpus}])
    pgf2 = PlacementGroupFactory([{"CPU": self.head_cpus - 1}])

    executor = RayTrialExecutor(reuse_actors=True)
    executor._pg_manager = pgm
    executor.set_max_pending_trials(1)

    def train(config):
        yield 1
        yield 2
        yield 3
        yield 4

    register_trainable("resettable", train)

    trial1 = Trial("resettable", placement_group_factory=pgf1)
    trial2 = Trial("resettable", placement_group_factory=pgf1)
    trial3 = Trial("resettable", placement_group_factory=pgf2)

    assert executor.has_resources_for_trial(trial1)
    assert executor.has_resources_for_trial(trial2)
    assert executor.has_resources_for_trial(trial3)

    executor._stage_and_update_status([trial1, trial2, trial3])

    while not pgm.has_ready(trial1):
        time.sleep(1)
        executor._stage_and_update_status([trial1, trial2, trial3])

    # Fill staging
    executor._stage_and_update_status([trial1, trial2, trial3])

    assert executor.has_resources_for_trial(trial1)
    assert executor.has_resources_for_trial(trial2)
    assert not executor.has_resources_for_trial(trial3)

    executor._start_trial(trial1)
    executor._stage_and_update_status([trial1, trial2, trial3])
    executor.pause_trial(trial1)  # Caches the PG and removes a PG from staging

    assert len(pgm._staging_futures) == 0

    # This will re-schedule a placement group
    pgm.reconcile_placement_groups([trial1, trial2])

    assert len(pgm._staging_futures) == 1
    assert not pgm.can_stage()

    # We should still have resources for this trial as it has a cached PG
    assert executor.has_resources_for_trial(trial1)
    assert executor.has_resources_for_trial(trial2)
    assert not executor.has_resources_for_trial(trial3)

def testResourceDeadlock(self):
    """Tests that resource deadlock is avoided for heterogeneous PGFs.

    We start 4 trials in a cluster with 2 CPUs. The first two trials
    require 1 CPU each, the third trial 2 CPUs, the fourth trial 1 CPU.

    The second trial needs a bit more time to finish. This means that the
    resources from the first trial will be freed, and the PG of the
    _fourth_ trial becomes ready (not that of the third trial, because
    that requires 2 CPUs - however, one is still occupied by trial 2).

    After the first two trials finished, the FIFOScheduler tries to start
    the third trial. However, it can't be started because its placement
    group is not ready. Instead, the placement group of the fourth trial
    is ready. Thus, we opt to run the fourth trial instead.
    """

    def train(config):
        time.sleep(config["sleep"])
        return 4

    ray.init(num_cpus=2)

    tune.register_trainable("het", train)
    pgf1 = PlacementGroupFactory([{"CPU": 1}])
    pgf2 = PlacementGroupFactory([{"CPU": 2}])

    trial1 = Trial("het", config={"sleep": 0}, placement_group_factory=pgf1)
    trial2 = Trial("het", config={"sleep": 2}, placement_group_factory=pgf1)
    trial3 = Trial("het", config={"sleep": 0}, placement_group_factory=pgf2)
    trial4 = Trial("het", config={"sleep": 0}, placement_group_factory=pgf1)

    runner = TrialRunner(fail_fast=True)
    runner.add_trial(trial1)
    runner.add_trial(trial2)
    runner.add_trial(trial3)
    runner.add_trial(trial4)

    timeout = time.monotonic() + 30
    while not runner.is_finished():
        # We enforce a timeout here
        self.assertLess(
            time.monotonic(), timeout, msg="Ran into a resource deadlock"
        )
        runner.step()

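# A minimal, Ray-free sketch of the scheduling reasoning the docstring above
# describes. The bundle sizes come from the test; the first-fit rule below is
# an illustrative assumption, not Tune's actual placement logic: with 2 CPUs
# total and 1 CPU still held by trial 2, a 1-CPU bundle fits but a 2-CPU
# bundle does not, so the fourth trial's placement group becomes ready first.
def first_fitting(bundles_by_trial, free_cpus):
    """Return the first trial whose CPU bundle fits into free_cpus."""
    for name, cpus in bundles_by_trial:
        if cpus <= free_cpus:
            return name
    return None

# Trial 2 occupies 1 of 2 CPUs; trial 3 wants 2 CPUs, trial 4 wants 1 CPU.
assert first_fitting([("trial3", 2), ("trial4", 1)], free_cpus=1) == "trial4"
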
def testTrialNoCheckpointSave(self):
    """Check that non-checkpointing trials *are* saved."""
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    ray.init(num_cpus=3)

    runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0)
    runner.add_trial(
        Trial(
            "__fake",
            trial_id="non_checkpoint",
            stopping_criterion={"training_iteration": 2},
        )
    )

    while not all(t.status == Trial.TERMINATED for t in runner.get_trials()):
        runner.step()

    runner.add_trial(
        Trial(
            "__fake",
            trial_id="checkpoint",
            checkpoint_at_end=True,
            stopping_criterion={"training_iteration": 2},
        )
    )

    while not all(t.status == Trial.TERMINATED for t in runner.get_trials()):
        runner.step()

    runner.add_trial(
        Trial(
            "__fake",
            trial_id="pending",
            stopping_criterion={"training_iteration": 2},
        )
    )

    old_trials = runner.get_trials()
    while not old_trials[2].has_reported_at_least_once:
        runner.step()

    runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir)
    new_trials = runner2.get_trials()
    self.assertEqual(len(new_trials), 3)
    self.assertTrue(runner2.get_trial("non_checkpoint").status == Trial.TERMINATED)
    self.assertTrue(runner2.get_trial("checkpoint").status == Trial.TERMINATED)
    self.assertTrue(runner2.get_trial("pending").status == Trial.PENDING)
    self.assertTrue(runner2.get_trial("pending").has_reported_at_least_once)
    runner2.step()

def testNoResetTrial(self):
    """Tests that reset handles NotImplemented properly."""
    trial = Trial("__fake")
    self._simulate_starting_trial(trial)
    exists = self.trial_executor.reset_trial(trial, {}, "modified_mock")
    self.assertEqual(exists, False)
    self.assertEqual(Trial.RUNNING, trial.status)

def testCheckpointFreqBuffered(self):
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "7"
    os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "1"

    def num_checkpoints(trial):
        return sum(
            item.startswith("checkpoint_") for item in os.listdir(trial.logdir)
        )

    ray.init(num_cpus=2)

    trial = Trial("__fake", checkpoint_freq=3)
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0)
    runner.add_trial(trial)

    runner.step()  # start trial
    runner.step()  # run iteration 1-3
    runner.step()  # process save
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 3)
    self.assertEqual(num_checkpoints(trial), 1)

    runner.step()  # run iteration 4-6
    runner.step()  # process save
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 6)
    self.assertEqual(num_checkpoints(trial), 2)

    runner.step()  # run iteration 7-9
    runner.step()  # process save
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 9)
    self.assertEqual(num_checkpoints(trial), 3)

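# A pure-Python sketch of the buffering arithmetic the assertions above rely
# on. The rule encoded here (a result buffer is cut short at the next
# checkpoint boundary) is an assumption inferred from this test's expected
# values, not a reference implementation of Tune's buffering.
def buffered_save_iterations(checkpoint_freq, buffer_length, total_iters):
    """Return the iteration numbers at which a save would be dispatched."""
    it, saves = 0, []
    while it < total_iters:
        # Train at most buffer_length iterations, but stop at the next
        # multiple of checkpoint_freq so the checkpoint is not skipped.
        to_boundary = checkpoint_freq - (it % checkpoint_freq)
        it += min(buffer_length, to_boundary)
        saves.append(it)
    return saves

# With checkpoint_freq=3 and TUNE_RESULT_BUFFER_LENGTH=7, saves land on 3, 6, 9.
assert buffered_save_iterations(3, 7, 9) == [3, 6, 9]
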
def testStepHook(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner()

    def on_step_begin(self, trialrunner):
        self._resource_updater.update_avail_resources()
        cnt = self.pre_step if hasattr(self, "pre_step") else 0
        self.pre_step = cnt + 1

    def on_step_end(self, trialrunner):
        # Read the previous post-step count (not pre_step) so repeated
        # steps keep incrementing correctly.
        cnt = self.post_step if hasattr(self, "post_step") else 0
        self.post_step = 1 + cnt

    import types

    runner.trial_executor.on_step_begin = types.MethodType(
        on_step_begin, runner.trial_executor
    )
    runner.trial_executor.on_step_end = types.MethodType(
        on_step_end, runner.trial_executor
    )

    kwargs = {
        "stopping_criterion": {"training_iteration": 5},
        "resources": Resources(cpu=1, gpu=1),
    }
    runner.add_trial(Trial("__fake", **kwargs))
    runner.step()
    self.assertEqual(runner.trial_executor.pre_step, 1)
    self.assertEqual(runner.trial_executor.post_step, 1)

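# A self-contained sketch of the types.MethodType pattern used above: it
# binds a plain function to one existing instance, so the function receives
# that instance as `self` when called. The Counter class here is purely
# illustrative and not part of the test suite.
import types


class Counter:
    pass


def increment(self):
    self.count = getattr(self, "count", 0) + 1
    return self.count


counter = Counter()
counter.increment = types.MethodType(increment, counter)  # bind to instance
assert counter.increment() == 1
assert counter.increment() == 2
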
def testUserCheckpoint(self):
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "1"  # Don't finish early
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    ray.init(num_cpus=3)
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0)
    runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 2}))
    trials = runner.get_trials()

    runner.step()  # Start trial
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)
    runner.step()  # Process result
    self.assertFalse(trials[0].has_checkpoint())
    runner.step()  # Process result
    self.assertFalse(trials[0].has_checkpoint())
    runner.step()  # Process result, dispatch save
    runner.step()  # Process save
    self.assertTrue(trials[0].has_checkpoint())

    runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir)
    runner2.step()  # 5: Start trial and dispatch restore
    trials2 = runner2.get_trials()
    self.assertEqual(ray.get(trials2[0].runner.get_info.remote()), 1)

def testMultiStepRun(self):
    ray.init(num_cpus=4, num_gpus=2)
    kwargs = {
        "stopping_criterion": {"training_iteration": 5},
        "resources": Resources(cpu=1, gpu=1),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    snapshot = TrialStatusSnapshot()
    runner = TrialRunner(callbacks=[TrialStatusSnapshotTaker(snapshot)])
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    self.assertTrue(snapshot.all_trials_are_terminated())

def testCheckpointOverwrite(self):
    def count_checkpoints(cdir):
        return sum(
            (fname.startswith("experiment_state") and fname.endswith(".json"))
            for fname in os.listdir(cdir)
        )

    ray.init(num_cpus=2)

    trial = Trial("__fake", checkpoint_freq=1)
    tmpdir = tempfile.mkdtemp()
    runner = TrialRunner(local_checkpoint_dir=tmpdir, checkpoint_period=0)
    runner.add_trial(trial)
    for _ in range(5):
        runner.step()
    # force checkpoint
    runner.checkpoint()
    self.assertEqual(count_checkpoints(tmpdir), 1)

    runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=tmpdir)
    for _ in range(5):
        runner2.step()
    self.assertEqual(count_checkpoints(tmpdir), 2)

    runner2.checkpoint()
    self.assertEqual(count_checkpoints(tmpdir), 2)
    shutil.rmtree(tmpdir)

def test_result_grid_future_checkpoint(ray_start_2_cpus, to_object):
    trainable_cls = get_trainable_cls("__fake")
    trial = Trial("__fake", stub=True)
    trial.config = {"some_config": 1}
    trial.last_result = {"some_result": 2, "config": trial.config}

    trainable = ray.remote(trainable_cls).remote()
    ray.get(trainable.set_info.remote({"info": 4}))

    if to_object:
        checkpoint_data = trainable.save_to_object.remote()
    else:
        checkpoint_data = trainable.save.remote()

    trial.on_checkpoint(
        _TrackedCheckpoint(checkpoint_data, storage_mode=CheckpointStorage.MEMORY)
    )
    trial.pickled_error_file = None
    trial.error_file = None
    result_grid = ResultGrid(None)

    # Internal result grid conversion
    result = result_grid._trial_to_result(trial)
    assert isinstance(result.checkpoint, Checkpoint)
    assert isinstance(result.metrics, dict)
    assert isinstance(result.config, dict)
    assert result.metrics_dataframe is None
    assert result.config == {"some_config": 1}
    assert result.metrics["config"] == result.config

    # Load checkpoint data (see ray.rllib.algorithms.mock.MockTrainer definition)
    with result.checkpoint.as_directory() as checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "mock_agent.pkl"), "rb") as f:
            info = pickle.load(f)
            assert info["info"] == 4

def _testPauseAndStart(self, result_buffer_length):
    """Tests that unpausing works for trials being processed."""
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = f"{result_buffer_length}"
    os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "1"

    # Need a new trial executor so the ENV vars are parsed again
    self.trial_executor = RayTrialExecutor()

    base = max(result_buffer_length, 1)

    trial = Trial("__fake")
    self._simulate_starting_trial(trial)

    self._simulate_getting_result(trial)
    self.assertEqual(trial.last_result.get(TRAINING_ITERATION), base)

    self.trial_executor.pause_trial(trial)
    self.assertEqual(Trial.PAUSED, trial.status)

    self._simulate_starting_trial(trial)

    self._simulate_getting_result(trial)
    self.assertEqual(trial.last_result.get(TRAINING_ITERATION), base * 2)

    self.trial_executor.stop_trial(trial)
    self.assertEqual(Trial.TERMINATED, trial.status)

def testSavePauseResumeErrorRestore(self):
    """Tests that pause checkpoint does not replace restore checkpoint."""
    trial = Trial("__fake")
    self._simulate_starting_trial(trial)
    self._simulate_getting_result(trial)

    # Save
    self._simulate_saving(trial)

    # Train
    self.trial_executor.continue_training(trial)
    self._simulate_getting_result(trial)

    # Pause
    self.trial_executor.pause_trial(trial)
    self.assertEqual(Trial.PAUSED, trial.status)
    self.assertEqual(trial.checkpoint.storage_mode, CheckpointStorage.MEMORY)

    # Resume
    self._simulate_starting_trial(trial)

    # Error
    trial.set_status(Trial.ERROR)

    # Restore
    self.trial_executor.restore(trial)

    self.trial_executor.stop_trial(trial)
    self.assertEqual(Trial.TERMINATED, trial.status)

def testFailureRecoveryEnabled(self):
    ray.init(num_cpus=1, num_gpus=1)
    searchalg, scheduler = create_mock_components()

    runner = TrialRunner(searchalg, scheduler=scheduler)
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 1,
        "config": {
            "mock_error": True,
        },
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()

    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[0].num_failures, 1)
    self.assertEqual(len(searchalg.errored_trials), 0)
    # Notice this is 1 since during recovery, the previously errored trial
    # is "requeued". This will call scheduler.on_trial_error.
    # Searcher.on_trial_error is, however, not called in this process.
    self.assertEqual(len(scheduler.errored_trials), 1)

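# A hedged sketch of the kind of mock scheduler create_mock_components()
# above plausibly returns: a stub that records every on_trial_error call,
# which is what the final assertion counts. The class name and attribute
# are illustrative assumptions, not the suite's actual helper.
class _CountingScheduler:
    def __init__(self):
        self.errored_trials = []

    def on_trial_error(self, trial_runner, trial):
        # Requeueing an errored trial routes through this hook once.
        self.errored_trials.append(trial)
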
def test_trial_requeue(start_connected_emptyhead_cluster, tmpdir, durable):
    """Removing a node in full cluster causes Trial to be requeued."""
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    if durable:
        upload_dir = "file://" + str(tmpdir)
        syncer_callback = SyncerCallback()
    else:
        upload_dir = None
        syncer_callback = custom_driver_logdir_callback(str(tmpdir))

    runner = TrialRunner(BasicVariantGenerator(), callbacks=[syncer_callback])  # noqa
    kwargs = {
        "stopping_criterion": {"training_iteration": 5},
        "checkpoint_freq": 1,
        "max_failures": 1,
        "remote_checkpoint_dir": upload_dir,
    }

    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()  # Start trial
    runner.step()  # Process result, dispatch save
    runner.step()  # Process save

    running_trials = _get_running_trials(runner)
    assert len(running_trials) == 1
    assert _check_trial_running(running_trials[0])

    cluster.remove_node(node)
    cluster.wait_for_nodes()
    time.sleep(0.1)  # Sleep so that next step() refreshes cluster resources

    runner.step()  # Process result, dispatch save
    runner.step()  # Process save (detect error), requeue trial
    assert all(t.status == Trial.PENDING for t in trials), runner.debug_string()

def testExtraResources(self):
    ray.init(num_cpus=4, num_gpus=2)
    snapshot = TrialStatusSnapshot()
    runner = TrialRunner(callbacks=[TrialStatusSnapshotTaker(snapshot)])
    kwargs = {
        "stopping_criterion": {"training_iteration": 1},
        "placement_group_factory": PlacementGroupFactory(
            [{"CPU": 1}, {"CPU": 3, "GPU": 1}]
        ),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    self.assertLess(snapshot.max_running_trials(), 2)
    self.assertTrue(snapshot.all_trials_are_terminated())

def testErrorHandling(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 1},
        "resources": Resources(cpu=1, gpu=1),
    }
    _global_registry.register(TRAINABLE_CLASS, "asdf", None)
    trials = [Trial("asdf", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[1].status, Trial.RUNNING)

def testExtraCustomResources(self):
    ray.init(num_cpus=4, num_gpus=2, resources={"a": 2})
    # Since each trial occupies all of the custom resource "a",
    # at most one trial can be running at any given moment.
    snapshot = TrialStatusSnapshot()
    runner = TrialRunner(callbacks=[TrialStatusSnapshotTaker(snapshot)])
    kwargs = {
        "stopping_criterion": {"training_iteration": 1},
        "placement_group_factory": PlacementGroupFactory([{"CPU": 1}, {"a": 2}]),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    self.assertLess(snapshot.max_running_trials(), 2)
    self.assertTrue(snapshot.all_trials_are_terminated())

def testStopTrial(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 5},
        "resources": Resources(cpu=1, gpu=1),
    }
    trials = [
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.PENDING)

    # Stop trial while running
    runner.stop_trial(trials[0])
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.PENDING)

    # Stop trial while pending
    runner.stop_trial(trials[-1])
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.TERMINATED)

    time.sleep(2)  # Wait for stopped placement group to free resources
    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[2].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.TERMINATED)

def testAsyncSave(self):
    """Tests that the saved checkpoint value is not immediately set."""
    trial = Trial("__fake")
    self._simulate_starting_trial(trial)

    self._simulate_getting_result(trial)

    self._simulate_saving(trial)

    self.trial_executor.stop_trial(trial)
    self.assertEqual(Trial.TERMINATED, trial.status)

def testSaveRestore(self):
    trial = Trial("__fake")
    self._simulate_starting_trial(trial)

    self._simulate_getting_result(trial)

    self._simulate_saving(trial)

    self.trial_executor.restore(trial)
    self.trial_executor.stop_trial(trial)
    self.assertEqual(Trial.TERMINATED, trial.status)

def next_trial(self):
    spec = self._experiment.spec
    trial = None
    if self._index < spec["num_samples"]:
        trial = Trial(spec.get("run"), stopping_criterion=spec.get("stop"))
    self._index += 1

    if self._index > 4:
        self.set_finished()
    return trial

def testPauseResume(self):
    """Tests that pausing works for trials in flight."""
    trial = Trial("__fake")
    self._simulate_starting_trial(trial)

    self.trial_executor.pause_trial(trial)
    self.assertEqual(Trial.PAUSED, trial.status)

    self._simulate_starting_trial(trial)

    self.trial_executor.stop_trial(trial)
    self.assertEqual(Trial.TERMINATED, trial.status)

def testCheckpointAtEndNotBuffered(self):
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "7"
    os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "0.5"

    def num_checkpoints(trial):
        return sum(
            item.startswith("checkpoint_") for item in os.listdir(trial.logdir)
        )

    ray.init(num_cpus=2)

    trial = Trial(
        "__fake",
        checkpoint_at_end=True,
        stopping_criterion={"training_iteration": 4},
    )
    observer = TrialResultObserver()
    runner = TrialRunner(
        local_checkpoint_dir=self.tmpdir,
        checkpoint_period=0,
        trial_executor=RayTrialExecutor(result_buffer_length=7),
        callbacks=[observer],
    )
    runner.add_trial(trial)

    while not observer.just_received_a_result():
        runner.step()
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 1)
    self.assertEqual(num_checkpoints(trial), 0)

    while True:
        runner.step()
        if observer.just_received_a_result():
            break
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 2)
    self.assertEqual(num_checkpoints(trial), 0)

    while True:
        runner.step()
        if observer.just_received_a_result():
            break
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 3)
    self.assertEqual(num_checkpoints(trial), 0)

    while True:
        runner.step()
        if observer.just_received_a_result():
            break
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 4)

    while not runner.is_finished():
        runner.step()
    self.assertEqual(num_checkpoints(trial), 1)

def testRestoreMetricsAfterCheckpointing(self):
    ray.init(num_cpus=1, num_gpus=1)

    observer = TrialResultObserver()
    runner = TrialRunner(callbacks=[observer])
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()

    self.assertEqual(trials[0].status, Trial.TERMINATED)

    kwargs["restore_path"] = trials[0].checkpoint.dir_or_data
    kwargs.pop("stopping_criterion")
    kwargs.pop("checkpoint_freq")  # No checkpointing for next trial
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    observer.reset()
    while not observer.just_received_a_result():
        runner.step()
    self.assertEqual(trials[1].last_result["timesteps_since_restore"], 10)
    self.assertEqual(trials[1].last_result["iterations_since_restore"], 1)
    self.assertGreater(trials[1].last_result["time_since_restore"], 0)

    while not observer.just_received_a_result():
        runner.step()
    self.assertEqual(trials[1].last_result["timesteps_since_restore"], 20)
    self.assertEqual(trials[1].last_result["iterations_since_restore"], 2)
    self.assertGreater(trials[1].last_result["time_since_restore"], 0)
    self.addCleanup(shutil.rmtree, trials[0].checkpoint.dir_or_data)

def testTrialErrorResumeTrue(self):
    ray.init(num_cpus=3, local_mode=True, include_dashboard=False)
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir)
    kwargs = {
        "stopping_criterion": {"training_iteration": 4},
        "resources": Resources(cpu=1, gpu=0),
    }
    trials = [
        Trial("__fake", config={"mock_error": True}, **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    runner.checkpoint(force=True)

    assert trials[0].status == Trial.ERROR
    del runner

    new_runner = TrialRunner(resume="ERRORED_ONLY", local_checkpoint_dir=self.tmpdir)
    assert len(new_runner.get_trials()) == 3
    assert Trial.ERROR not in (t.status for t in new_runner.get_trials())

    # The below is just a check for standard behavior.
    disable_error = False
    for t in new_runner.get_trials():
        if t.config.get("mock_error"):
            t.config["mock_error"] = False
            disable_error = True
    assert disable_error

    while not new_runner.is_finished():
        new_runner.step()
    assert Trial.ERROR not in (t.status for t in new_runner.get_trials())

def test_counting_resources(start_connected_cluster):
    """Tests that Tune accounting is consistent with actual cluster."""
    cluster = start_connected_cluster
    nodes = []
    assert ray.cluster_resources()["CPU"] == 1
    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {"stopping_criterion": {"training_iteration": 10}}

    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    running_trials = _get_running_trials(runner)
    assert len(running_trials) == 1
    assert _check_trial_running(running_trials[0])
    assert ray.available_resources().get("CPU", 0) == 0

    nodes += [cluster.add_node(num_cpus=1)]
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 2

    cluster.remove_node(nodes.pop())
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 1

    runner.step()
    # Only 1 trial can be running due to resource limitation.
    assert sum(t.status == Trial.RUNNING for t in runner.get_trials()) == 1

    for i in range(5):
        nodes += [cluster.add_node(num_cpus=1)]
    cluster.wait_for_nodes()
    assert ray.cluster_resources()["CPU"] == 6

    # This is to make sure that the pg is ready for the previously pending
    # trial, so that when runner.step() is called next, the trial can be
    # started in the same event loop.
    time.sleep(5)
    runner.step()
    assert sum(t.status == Trial.RUNNING for t in runner.get_trials()) == 2

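# A minimal sketch contrasting the two accounting views the test above
# asserts against: ray.cluster_resources() reports the cluster total, while
# ray.available_resources() reports what is currently unreserved. The
# 2-CPU local cluster here is an assumption for illustration only.
import ray

ray.init(num_cpus=2)
total_cpus = ray.cluster_resources().get("CPU", 0)  # 2.0 on this cluster
free_cpus = ray.available_resources().get("CPU", 0)  # shrinks as tasks reserve CPUs
assert free_cpus <= total_cpus
ray.shutdown()
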
def testPauseResume2(self):
    """Tests that pausing works for trials being processed."""
    trial = Trial("__fake")
    self._simulate_starting_trial(trial)

    self._simulate_getting_result(trial)

    self.trial_executor.pause_trial(trial)
    self.assertEqual(Trial.PAUSED, trial.status)

    self._simulate_starting_trial(trial)

    self.trial_executor.stop_trial(trial)
    self.assertEqual(Trial.TERMINATED, trial.status)

def testFractionalGpus(self):
    ray.init(num_cpus=4, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "resources": Resources(cpu=1, gpu=0.5),
    }
    trials = [
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)

    for _ in range(10):
        runner.step()

    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[2].status, Trial.PENDING)
    self.assertEqual(trials[3].status, Trial.PENDING)

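# The arithmetic behind the expected statuses above: each trial reserves
# 0.5 of the single GPU, so at most floor(1 / 0.5) = 2 trials can hold GPU
# resources at once, leaving the other two trials pending.
import math

max_concurrent = math.floor(1 / 0.5)
assert max_concurrent == 2
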