def run_experiments(experiments,
                    scheduler=None,
                    with_server=False,
                    server_port=TuneServer.DEFAULT_PORT,
                    verbose=True):
    """Drive all given experiments to completion.

    Args:
        experiments: Mapping of experiment name -> spec dict.
        scheduler: Trial scheduler; defaults to FIFO ordering.
        with_server: Whether to launch the Tune web server.
        server_port: Port for the web server.
        verbose: Forwarded to each generated trial.

    Returns:
        The list of finished trials.

    Raises:
        TuneError: If any trial ends in a non-terminated state.
    """
    # Importing rllib registers its agents as trainables.
    from ray import rllib  # noqa # pylint: disable=unused-import

    active_scheduler = FIFOScheduler() if scheduler is None else scheduler

    runner = TrialRunner(
        active_scheduler,
        launch_web_server=with_server,
        server_port=server_port)

    for exp_name, exp_spec in experiments.items():
        for new_trial in generate_trials(exp_spec, exp_name):
            new_trial.set_verbose(verbose)
            runner.add_trial(new_trial)
    print(runner.debug_string(max_debug=99999))

    last_print_time = 0
    while not runner.is_finished():
        runner.step()
        if time.time() - last_print_time > DEBUG_PRINT_INTERVAL:
            print(runner.debug_string())
            last_print_time = time.time()

    print(runner.debug_string(max_debug=99999))

    for finished in runner.get_trials():
        # TODO(rliaw): What about errored?
        if finished.status != Trial.TERMINATED:
            raise TuneError("Trial did not complete", finished)

    wait_for_log_sync()
    return runner.get_trials()
def test_migration_checkpoint_removal(start_connected_emptyhead_cluster):
    """Test checks that trial restarts if checkpoint is lost w/ node fail."""
    cluster = start_connected_emptyhead_cluster
    first_node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    trial_config = {
        "stopping_criterion": {"training_iteration": 3},
        "checkpoint_freq": 2,
        "max_failures": 2,
    }

    # Recovery of a trial that has already been checkpointed.
    trial = Trial("__fake", **trial_config)
    runner.add_trial(trial)

    # Start, collect result 1, collect result 2 + checkpoint.
    for _ in range(3):
        runner.step()
    assert trial.has_checkpoint()

    cluster.add_node(num_cpus=1)
    cluster.remove_node(first_node)
    cluster.wait_for_nodes()
    # Simulate the checkpoint vanishing along with the node.
    shutil.rmtree(os.path.dirname(trial._checkpoint.value))

    runner.step()  # Recovery step
    for _ in range(3):
        runner.step()

    assert trial.status == Trial.TERMINATED
def test_trial_requeue(start_connected_emptyhead_cluster):
    """Removing a node in full cluster causes Trial to be requeued."""
    cluster = start_connected_emptyhead_cluster
    only_node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    spec = {
        "stopping_criterion": {"training_iteration": 5},
        "checkpoint_freq": 1,
        "max_failures": 1,
    }
    trial_pair = [Trial("__fake", **spec) for _ in range(2)]
    for queued in trial_pair:
        runner.add_trial(queued)

    runner.step()  # start
    runner.step()  # 1 result

    cluster.remove_node(only_node)
    cluster.wait_for_nodes()
    runner.step()
    # With the sole worker node gone, both trials fall back to PENDING.
    assert all(each.status == Trial.PENDING for each in trial_pair)

    with pytest.raises(TuneError):
        runner.step()
def test_remove_node_before_result(start_connected_emptyhead_cluster):
    """Tune continues when node is removed before trial returns."""
    cluster = start_connected_emptyhead_cluster
    removable = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    settings = {
        "stopping_criterion": {"training_iteration": 3},
        "checkpoint_freq": 2,
        "max_failures": 2,
    }
    lone_trial = Trial("__fake", **settings)
    runner.add_trial(lone_trial)

    runner.step()  # run 1
    assert lone_trial.status == Trial.RUNNING

    # Swap the worker node out from under the running trial.
    cluster.remove_node(removable)
    cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()
    assert ray.global_state.cluster_resources()["CPU"] == 1

    for _ in range(3):
        runner.step()
    assert lone_trial.status == Trial.TERMINATED

    with pytest.raises(TuneError):
        runner.step()
def test_counting_resources(start_connected_cluster):
    """Tests that Tune accounting is consistent with actual cluster."""
    cluster = start_connected_cluster
    worker_nodes = []
    assert ray.global_state.cluster_resources()["CPU"] == 1

    runner = TrialRunner(BasicVariantGenerator())
    stop_spec = {"stopping_criterion": {"training_iteration": 10}}
    pair = [Trial("__fake", **stop_spec) for _ in range(2)]
    for pending in pair:
        runner.add_trial(pending)

    runner.step()  # run 1
    worker_nodes.append(cluster.add_node(num_cpus=1))
    cluster.wait_for_nodes()
    assert ray.global_state.cluster_resources()["CPU"] == 2

    cluster.remove_node(worker_nodes.pop())
    cluster.wait_for_nodes()
    assert ray.global_state.cluster_resources()["CPU"] == 1
    runner.step()  # run 2
    # Only one CPU is available, so only one trial may be running.
    assert sum(
        each.status == Trial.RUNNING for each in runner.get_trials()) == 1

    for _ in range(5):
        worker_nodes.append(cluster.add_node(num_cpus=1))
    cluster.wait_for_nodes()
    assert ray.global_state.cluster_resources()["CPU"] == 6

    runner.step()  # 1 result
    assert sum(
        each.status == Trial.RUNNING for each in runner.get_trials()) == 2
def test_cluster_down_simple(start_connected_cluster, tmpdir):
    """Tests that TrialRunner save/restore works on cluster shutdown."""
    cluster = start_connected_cluster
    cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    checkpoint_dir = str(tmpdir)
    runner = TrialRunner(
        BasicVariantGenerator(), metadata_checkpoint_dir=checkpoint_dir)
    base_spec = {
        "stopping_criterion": {"training_iteration": 2},
        "checkpoint_freq": 1,
        "max_failures": 1,
    }
    pair = [Trial("__fake", **base_spec) for _ in range(2)]
    for queued in pair:
        runner.add_trial(queued)

    runner.step()  # start
    runner.step()  # start2
    runner.step()  # step
    assert all(each.status == Trial.RUNNING for each in runner.get_trials())
    # Persist runner metadata before tearing the cluster down.
    runner.checkpoint()

    cluster.shutdown()
    ray.shutdown()

    cluster = _start_new_cluster()
    runner = TrialRunner.restore(checkpoint_dir)
    runner.step()  # start
    runner.step()  # start2

    for _ in range(3):
        runner.step()

    with pytest.raises(TuneError):
        runner.step()

    assert all(
        each.status == Trial.TERMINATED for each in runner.get_trials())
    cluster.shutdown()
def testFailFastRaise(self):
    """fail_fast=RAISE should surface the trial error out of step()."""
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner(fail_fast=TrialRunner.RAISE)
    failing_spec = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 0,
        "config": {
            "mock_error": True,
            "persistent_error": True,
        },
    }
    for _ in range(2):
        runner.add_trial(Trial("__fake", **failing_spec))
    trials = runner.get_trials()

    runner.step()  # Start trial
    self.assertEqual(trials[0].status, Trial.RUNNING)
    runner.step()  # Process result, dispatch save
    self.assertEqual(trials[0].status, Trial.RUNNING)
    runner.step()  # Process save
    # The persistent mock error must now propagate as an exception.
    with self.assertRaises(Exception):
        runner.step()  # Error
def testUserCheckpoint(self):
    """User-triggered checkpoints should be saved and restorable."""
    ray.init(num_cpus=3)
    tmpdir = tempfile.mkdtemp()
    runner = TrialRunner(local_checkpoint_dir=tmpdir, checkpoint_period=0)
    runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 2}))
    trials = runner.get_trials()

    runner.step()  # Start trial
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)
    # The first two results are below user_checkpoint_freq=2.
    for _ in range(2):
        runner.step()  # Process result
        self.assertFalse(trials[0].has_checkpoint())
    runner.step()  # Process result, dispatch save
    runner.step()  # Process save
    self.assertTrue(trials[0].has_checkpoint())

    resumed = TrialRunner(resume="LOCAL", local_checkpoint_dir=tmpdir)
    resumed.step()  # 5: Start trial and dispatch restore
    restored_trials = resumed.get_trials()
    self.assertEqual(ray.get(restored_trials[0].runner.get_info.remote()), 1)
    shutil.rmtree(tmpdir)
def testMultiStepRun2(self):
    """Checks that runner.step throws when overstepping."""
    ray.init(num_cpus=1)
    runner = TrialRunner()
    spec = {
        "stopping_criterion": {"training_iteration": 2},
        "resources": Resources(cpu=1, gpu=0),
    }
    single = Trial("__fake", **spec)
    runner.add_trial(single)

    runner.step()
    self.assertEqual(single.status, Trial.RUNNING)
    runner.step()
    self.assertEqual(single.status, Trial.RUNNING)
    runner.step()
    self.assertEqual(single.status, Trial.TERMINATED)
    # One step past completion must raise.
    self.assertRaises(TuneError, runner.step)
def testExtraCustomResources(self):
    """Extra custom resource demands should gate trial scheduling."""
    ray.init(num_cpus=4, num_gpus=2, resources={"a": 2})
    runner = TrialRunner()
    demand = {
        "stopping_criterion": {"training_iteration": 1},
        "resources": Resources(
            cpu=1, gpu=0, extra_custom_resources={"a": 2}),
    }
    pair = [Trial("__fake", **demand) for _ in range(2)]
    for queued in pair:
        runner.add_trial(queued)

    runner.step()
    self.assertEqual(pair[0].status, Trial.RUNNING)
    self.assertEqual(pair[1].status, Trial.PENDING)

    runner.step()
    # Each trial wants all of custom resource "a", so they never overlap.
    self.assertTrue(sum(each.status == Trial.RUNNING for each in pair) < 2)
    self.assertEqual(pair[0].status, Trial.TERMINATED)
    self.assertEqual(pair[1].status, Trial.PENDING)
def testFailureRecoveryEnabled(self):
    """With max_failures=1 a failing trial is recovered and kept RUNNING."""
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner(BasicVariantGenerator())
    flaky_spec = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 1,
        "config": {"mock_error": True},
    }
    runner.add_trial(Trial("__fake", **flaky_spec))
    trials = runner.get_trials()

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    runner.step()
    # The mock error fired once, but recovery keeps the trial alive.
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[0].num_failures, 1)
    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
def testFailureRecoveryDisabled(self):
    """With max_failures=0 a failing trial goes straight to ERROR."""
    ray.init(num_cpus=1, num_gpus=1)
    searchalg, scheduler = create_mock_components()
    runner = TrialRunner(searchalg, scheduler=scheduler)
    doomed_spec = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 0,
        "config": {"mock_error": True},
    }
    runner.add_trial(Trial("__fake", **doomed_spec))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()

    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[0].num_failures, 1)
    # Both the search algorithm and the scheduler see the failure.
    self.assertEqual(len(searchalg.errored_trials), 1)
    self.assertEqual(len(scheduler.errored_trials), 1)
def testFailFast(self):
    """fail_fast=True should end the run after the first trial error."""
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner(fail_fast=True)
    failing_spec = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 0,
        "config": {
            "mock_error": True,
            "persistent_error": True,
        },
    }
    for _ in range(2):
        runner.add_trial(Trial("__fake", **failing_spec))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()
    self.assertEqual(trials[0].status, Trial.ERROR)
    # Somehow with `fail_fast=True`, if one errors out, the others are
    # then stopped with `TERMINATED` status.
    self.assertEqual(trials[1].status, Trial.TERMINATED)
    self.assertRaises(TuneError, runner.step)
def testChangeResources(self):
    """Checks that resource requirements can be changed on fly."""
    ray.init(num_cpus=2)

    class ResizeOnFirstResult(FIFOScheduler):
        """Restarts the trial with doubled CPUs after its first result."""

        def on_trial_result(self, trial_runner, trial, result):
            if result["training_iteration"] == 1:
                executor = trial_runner.trial_executor
                executor.stop_trial(trial)
                trial.update_resources(dict(cpu=2, gpu=0))
                executor.start_trial(trial)
            return TrialScheduler.CONTINUE

    runner = TrialRunner(scheduler=ResizeOnFirstResult())
    spec = {
        "stopping_criterion": {"training_iteration": 2},
        "resources": Resources(cpu=1, gpu=0),
    }
    resized = Trial("__fake", **spec)
    runner.add_trial(resized)

    runner.step()
    self.assertEqual(resized.status, Trial.RUNNING)
    self.assertEqual(
        runner.trial_executor._pg_manager.occupied_resources().get("CPU"),
        1)
    # Resources of a running trial cannot be changed from the outside.
    self.assertRaises(
        ValueError,
        lambda: resized.update_resources(dict(cpu=2, gpu=0)))

    runner.step()
    self.assertEqual(resized.status, Trial.RUNNING)
    self.assertEqual(
        runner.trial_executor._pg_manager.occupied_resources().get("CPU"),
        2)
def testCheckpointAtEndNotBuffered(self):
    """checkpoint_at_end must fire exactly once despite result buffering."""
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "7"
    os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "0.5"

    def count_checkpoints(t):
        # Checkpoint directories appear under the trial's logdir.
        return sum(
            entry.startswith("checkpoint_")
            for entry in os.listdir(t.logdir))

    ray.init(num_cpus=2)

    buffered_trial = Trial(
        "__fake",
        checkpoint_at_end=True,
        stopping_criterion={"training_iteration": 4})
    runner = TrialRunner(
        local_checkpoint_dir=self.tmpdir,
        checkpoint_period=0,
        trial_executor=RayTrialExecutor(result_buffer_length=7))
    runner.add_trial(buffered_trial)

    runner.step()  # start trial
    # Iterations 1-3: results arrive, but no checkpoint yet.
    for iteration in range(1, 4):
        runner.step()
        self.assertEqual(
            buffered_trial.last_result[TRAINING_ITERATION], iteration)
        self.assertEqual(count_checkpoints(buffered_trial), 0)
    runner.step()  # run iteration 4 (final, triggers checkpoint_at_end)
    self.assertEqual(buffered_trial.last_result[TRAINING_ITERATION], 4)
    self.assertEqual(count_checkpoints(buffered_trial), 1)
def test_trial_requeue(start_connected_emptyhead_cluster, trainable_id):
    """Removing a node in full cluster causes Trial to be requeued."""
    cluster = start_connected_emptyhead_cluster
    worker = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    spec = {
        "stopping_criterion": {"training_iteration": 5},
        "checkpoint_freq": 1,
        "max_failures": 1,
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
        "sync_to_driver_fn": trainable_id == "__fake",
    }
    pair = [Trial(trainable_id, **spec) for _ in range(2)]
    for queued in pair:
        runner.add_trial(queued)

    runner.step()  # Start trial
    runner.step()  # Process result, dispatch save
    runner.step()  # Process save
    active = _get_running_trials(runner)
    assert len(active) == 1
    assert _check_trial_running(active[0])

    cluster.remove_node(worker)
    cluster.wait_for_nodes()
    runner.step()  # Process result, dispatch save
    runner.step()  # Process save (detect error), requeue trial
    # The node loss should push both trials back to PENDING.
    assert all(
        each.status == Trial.PENDING for each in pair), runner.debug_string()

    with pytest.raises(TuneError):
        runner.step()
def testUserCheckpoint(self):
    """User checkpoints should persist and survive a LOCAL resume."""
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "1"  # Don't finish early
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    ray.init(num_cpus=3)
    runner = TrialRunner(
        local_checkpoint_dir=self.tmpdir, checkpoint_period=0)
    runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 2}))
    trials = runner.get_trials()

    runner.step()  # Start trial
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)
    # The first two results are below user_checkpoint_freq=2.
    for _ in range(2):
        runner.step()  # Process result
        self.assertFalse(trials[0].has_checkpoint())
    runner.step()  # Process result, dispatch save
    runner.step()  # Process save
    self.assertTrue(trials[0].has_checkpoint())

    resumed = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir)
    resumed.step()  # 5: Start trial and dispatch restore
    restored = resumed.get_trials()
    self.assertEqual(ray.get(restored[0].runner.get_info.remote()), 1)
def testFailFastRaise(self):
    """fail_fast=RAISE surfaces the first trial error from the step loop."""
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner(fail_fast=TrialRunner.RAISE)
    failing_spec = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 0,
        "config": {
            "mock_error": True,
            "persistent_error": True,
        },
    }
    for _ in range(2):
        runner.add_trial(Trial("__fake", **failing_spec))
    trials = runner.get_trials()

    with self.assertRaises(Exception):
        while not runner.is_finished():
            runner.step()

    # Not critical checks. Only to showcase the difference
    # with none raise type FailFast.
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.PENDING)
def testFailureRecoveryNodeRemoval(self):
    """A simulated node loss requeues the trial instead of erroring it."""
    # Node removal simulation only works with resource requests
    os.environ["TUNE_PLACEMENT_GROUP_AUTO_DISABLED"] = "1"

    ray.init(num_cpus=1, num_gpus=1)
    searchalg, scheduler = create_mock_components()
    runner = TrialRunner(searchalg, scheduler=scheduler)
    flaky_spec = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 1,
        "config": {"mock_error": True},
    }
    runner.add_trial(Trial("__fake", **flaky_spec))
    trials = runner.get_trials()

    with patch("ray.cluster_resources") as resource_mock:
        resource_mock.return_value = {"CPU": 1, "GPU": 1}
        runner.step()  # Start trial
        self.assertEqual(trials[0].status, Trial.RUNNING)

        runner.step()  # Process result, dispatch save
        runner.step()  # Process save
        self.assertEqual(trials[0].status, Trial.RUNNING)

        # Mimic a node failure
        resource_mock.return_value = {"CPU": 0, "GPU": 0}
        runner.step()  # Detect node failure
        self.assertEqual(trials[0].status, Trial.PENDING)
        self.assertEqual(trials[0].num_failures, 1)
        # Node failures reach the scheduler but not the search algorithm.
        self.assertEqual(len(searchalg.errored_trials), 0)
        self.assertEqual(len(scheduler.errored_trials), 1)
def testRestoreMetricsAfterCheckpointing(self):
    """*_since_restore metrics restart counting after a restore."""
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner()
    spec = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
    }
    runner.add_trial(Trial("__fake", **spec))
    trials = runner.get_trials()

    runner.step()  # Start trial
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)
    runner.step()  # Process result, dispatch save
    runner.step()  # Process save
    runner.trial_executor.stop_trial(trials[0])

    # Launch a second trial that restores from the first one's checkpoint.
    spec["restore_path"] = trials[0].checkpoint.value
    spec.pop("checkpoint_freq")  # No checkpointing for next trial
    runner.add_trial(Trial("__fake", **spec))
    trials = runner.get_trials()

    runner.step()  # Start trial, dispatch restore
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)

    runner.step()  # Process restore
    runner.step()  # Process result

    self.assertEqual(trials[1].last_result["timesteps_since_restore"], 10)
    self.assertEqual(trials[1].last_result["iterations_since_restore"], 1)
    self.assertGreater(trials[1].last_result["time_since_restore"], 0)
    runner.step()  # Process second result
    self.assertEqual(trials[1].last_result["timesteps_since_restore"], 20)
    self.assertEqual(trials[1].last_result["iterations_since_restore"], 2)
    self.assertGreater(trials[1].last_result["time_since_restore"], 0)
    self.addCleanup(os.remove, trials[0].checkpoint.value)
def testTrialSaveRestore(self):
    """Creates different trials to test runner.checkpoint/restore.

    Covers three trial outcomes (terminated, errored, still running) and
    verifies that a LOCAL resume reproduces the first two states and
    re-queues the running trial as PENDING.

    Fix: ``assertEquals`` is a deprecated alias (emits DeprecationWarning,
    removed in Python 3.12); replaced with ``assertEqual``.
    """
    ray.init(num_cpus=3)
    tmpdir = tempfile.mkdtemp()

    runner = TrialRunner(local_checkpoint_dir=tmpdir, checkpoint_period=0)
    trials = [
        Trial(
            "__fake",
            trial_id="trial_terminate",
            stopping_criterion={"training_iteration": 1},
            checkpoint_freq=1)
    ]
    runner.add_trial(trials[0])
    runner.step()  # start
    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)

    # Trial that errors out on its mocked result.
    trials += [
        Trial(
            "__fake",
            trial_id="trial_fail",
            stopping_criterion={"training_iteration": 3},
            checkpoint_freq=1,
            config={"mock_error": True})
    ]
    runner.add_trial(trials[1])
    runner.step()
    runner.step()
    runner.step()
    self.assertEqual(trials[1].status, Trial.ERROR)

    # Trial left mid-run so it should come back as PENDING after resume.
    trials += [
        Trial(
            "__fake",
            trial_id="trial_succ",
            stopping_criterion={"training_iteration": 2},
            checkpoint_freq=1)
    ]
    runner.add_trial(trials[2])
    runner.step()
    self.assertEqual(len(runner.trial_executor.get_checkpoints()), 3)
    self.assertEqual(trials[2].status, Trial.RUNNING)

    runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=tmpdir)
    for tid in ["trial_terminate", "trial_fail"]:
        original_trial = runner.get_trial(tid)
        restored_trial = runner2.get_trial(tid)
        self.assertEqual(original_trial.status, restored_trial.status)

    restored_trial = runner2.get_trial("trial_succ")
    self.assertEqual(Trial.PENDING, restored_trial.status)

    runner2.step()
    runner2.step()
    runner2.step()
    self.assertRaises(TuneError, runner2.step)
    shutil.rmtree(tmpdir)
def testTrialNoCheckpointSave(self):
    """Check that non-checkpointing trials *are* saved."""
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    ray.init(num_cpus=3)
    runner = TrialRunner(
        local_checkpoint_dir=self.tmpdir, checkpoint_period=0)

    def run_until_all_terminated():
        # Step the runner until every known trial has terminated.
        while not all(
                t.status == Trial.TERMINATED for t in runner.get_trials()):
            runner.step()

    runner.add_trial(
        Trial(
            "__fake",
            trial_id="non_checkpoint",
            stopping_criterion={"training_iteration": 2},
        ))
    run_until_all_terminated()

    runner.add_trial(
        Trial(
            "__fake",
            trial_id="checkpoint",
            checkpoint_at_end=True,
            stopping_criterion={"training_iteration": 2},
        ))
    run_until_all_terminated()

    runner.add_trial(
        Trial(
            "__fake",
            trial_id="pending",
            stopping_criterion={"training_iteration": 2},
        ))
    pending_trial = runner.get_trials()[2]
    while not pending_trial.has_reported_at_least_once:
        runner.step()

    restored = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir)
    restored_trials = restored.get_trials()
    self.assertEqual(len(restored_trials), 3)
    self.assertTrue(
        restored.get_trial("non_checkpoint").status == Trial.TERMINATED)
    self.assertTrue(
        restored.get_trial("checkpoint").status == Trial.TERMINATED)
    self.assertTrue(restored.get_trial("pending").status == Trial.PENDING)
    self.assertTrue(
        restored.get_trial("pending").has_reported_at_least_once)
    restored.step()
def test_queue_trials(start_connected_emptyhead_cluster):
    """Tests explicit oversubscription for autoscaling.

    Tune oversubscribes a trial when `queue_trials=True`, but does not block
    other trials from running.
    """
    os.environ["TUNE_PLACEMENT_GROUP_AUTO_DISABLED"] = "1"

    cluster = start_connected_emptyhead_cluster
    runner = TrialRunner()

    def make_trial(cpu, gpu=0):
        # Three-iteration trial with the given resource demand.
        trial_kwargs = {
            "resources": Resources(cpu=cpu, gpu=gpu),
            "stopping_criterion": {"training_iteration": 3},
        }
        return Trial("__fake", **trial_kwargs)

    runner.add_trial(make_trial(cpu=1))
    # With no resources and no queueing, stepping must fail.
    with pytest.raises(TuneError):
        runner.step()  # run 1
    del runner

    executor = RayTrialExecutor(queue_trials=True)
    runner = TrialRunner(trial_executor=executor)
    cluster.add_node(num_cpus=2)
    cluster.wait_for_nodes()

    cpu_only = make_trial(cpu=1)
    runner.add_trial(cpu_only)
    runner.step()  # add cpu_only trial

    gpu_trial = make_trial(cpu=1, gpu=1)
    runner.add_trial(gpu_trial)
    runner.step()  # queue gpu_trial

    # This tests that the cpu_only trial should bypass the queued trial.
    for _ in range(3):
        runner.step()
    assert cpu_only.status == Trial.TERMINATED
    assert gpu_trial.status == Trial.RUNNING

    # Scale up
    cluster.add_node(num_cpus=1, num_gpus=1)
    cluster.wait_for_nodes()

    for _ in range(3):
        runner.step()
    assert gpu_trial.status == Trial.TERMINATED
def testTrialNoSave(self):
    """Check that non-checkpointing trials are not saved.

    Runs two trials to completion (one without checkpoints, one with
    checkpoint_at_end), starts a third, then resumes from LOCAL metadata
    and checks the restored statuses — the in-flight trial must come back
    PENDING with no last_result.

    Fix: ``assertEquals`` is a deprecated alias (emits DeprecationWarning,
    removed in Python 3.12); replaced with ``assertEqual``.
    """
    ray.init(num_cpus=3)
    tmpdir = tempfile.mkdtemp()
    runner = TrialRunner(local_checkpoint_dir=tmpdir, checkpoint_period=0)

    runner.add_trial(
        Trial(
            "__fake",
            trial_id="non_checkpoint",
            stopping_criterion={"training_iteration": 2}))

    while not all(t.status == Trial.TERMINATED
                  for t in runner.get_trials()):
        runner.step()

    runner.add_trial(
        Trial(
            "__fake",
            trial_id="checkpoint",
            checkpoint_at_end=True,
            stopping_criterion={"training_iteration": 2}))

    while not all(t.status == Trial.TERMINATED
                  for t in runner.get_trials()):
        runner.step()

    runner.add_trial(
        Trial(
            "__fake",
            trial_id="pending",
            stopping_criterion={"training_iteration": 2}))

    runner.step()
    runner.step()

    runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=tmpdir)
    new_trials = runner2.get_trials()
    self.assertEqual(len(new_trials), 3)
    self.assertTrue(
        runner2.get_trial("non_checkpoint").status == Trial.TERMINATED)
    self.assertTrue(
        runner2.get_trial("checkpoint").status == Trial.TERMINATED)
    self.assertTrue(runner2.get_trial("pending").status == Trial.PENDING)
    # The pending trial was not checkpointed, so no result is restored.
    self.assertTrue(not runner2.get_trial("pending").last_result)
    runner2.step()
    shutil.rmtree(tmpdir)
def test_migration_checkpoint_removal(start_connected_emptyhead_cluster,
                                      trainable_id):
    """Test checks that trial restarts if checkpoint is lost w/ node fail.

    Parametrized over `trainable_id` ("__fake" vs "__fake_remote" —
    presumably; confirm against the fixture's params). The mocks below
    only change behavior for the remote variant.
    """
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 4
        },
        "checkpoint_freq": 2,
        "max_failures": 2,
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
        # Only enable driver sync for the remote-checkpoint variant.
        "sync_to_driver_fn": trainable_id == "__fake_remote",
    }

    # The following patches only affect __fake_remote.
    find_checkpoint_dir = TrainableUtil.find_checkpoint_dir
    with patch("ray.tune.logger.get_node_syncer") as mock_get_node_syncer:
        trainable_util = "ray.tune.ray_trial_executor.TrainableUtil"
        with patch(trainable_util + ".find_checkpoint_dir") as mock_find_dir:

            def mock_get_syncer_fn(local_dir, remote_dir, sync_function):
                # Swap in a mock storage client so no real remote I/O runs.
                client = mock_storage_client()
                return MockNodeSyncer(local_dir, remote_dir, client)

            mock_get_node_syncer.side_effect = mock_get_syncer_fn

            def mock_find_dir_fn(checkpoint_path):
                """Converts back to local path first."""
                checkpoint_path = checkpoint_path[len(MOCK_REMOTE_DIR):]
                checkpoint_path = os.path.join("/", checkpoint_path)
                return find_checkpoint_dir(checkpoint_path)

            # __fake_remote trainables save to a separate "remote" directory.
            # TrainableUtil will not check this path unless we mock it.
            mock_find_dir.side_effect = mock_find_dir_fn

            # Test recovery of trial that has been checkpointed
            t1 = Trial(trainable_id, **kwargs)
            runner.add_trial(t1)

            # Start trial, process result (x2), process save
            for _ in range(4):
                runner.step()
            assert t1.has_checkpoint()

            # Replace the trial's node and delete its checkpoint on disk.
            cluster.add_node(num_cpus=1)
            cluster.remove_node(node)
            cluster.wait_for_nodes()
            shutil.rmtree(os.path.dirname(t1.checkpoint.value))
            runner.step()  # Collect result 3, kick off + fail result 4
            runner.step()  # Dispatch restore
            runner.step()  # Process restore + step 4
            # Allow up to three extra steps for the trial to finish.
            for _ in range(3):
                if t1.status != Trial.TERMINATED:
                    runner.step()
    assert t1.status == Trial.TERMINATED, runner.debug_string()
def test_trial_migration(start_connected_emptyhead_cluster):
    """Removing a node while cluster has space should migrate trial.

    The trial state should also be consistent with the checkpoint.

    Covers three scenarios in sequence on the same runner:
    1. recovery of a trial without a checkpoint,
    2. recovery of a trial with a checkpoint (state restored),
    3. a trial with no retries left, which must end in ERROR.
    """
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(resources=dict(CPU=1))
    assert cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    # max_failures=2 leaves room for the induced node-loss failure.
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 3
        },
        "checkpoint_freq": 2,
        "max_failures": 2
    }

    # Test recovery of trial that hasn't been checkpointed
    t = Trial("__fake", **kwargs)
    runner.add_trial(t)
    runner.step()  # start
    runner.step()  # 1 result
    assert t.last_result is not None
    # Add a replacement node before killing the trial's node.
    node2 = cluster.add_node(resources=dict(CPU=1))
    cluster.remove_node(node)
    assert cluster.wait_for_nodes()
    runner.step()  # Recovery step

    # TODO(rliaw): This assertion is not critical but will not pass
    # because checkpoint handling is messy and should be refactored
    # rather than hotfixed.
    # assert t.last_result is None, "Trial result not restored correctly."
    for i in range(3):
        runner.step()

    assert t.status == Trial.TERMINATED

    # Test recovery of trial that has been checkpointed
    t2 = Trial("__fake", **kwargs)
    runner.add_trial(t2)
    runner.step()  # start
    runner.step()  # 1 result
    runner.step()  # 2 result and checkpoint
    assert t2.has_checkpoint()
    node3 = cluster.add_node(resources=dict(CPU=1))
    cluster.remove_node(node2)
    assert cluster.wait_for_nodes()
    runner.step()  # Recovery step
    # Restored state must match the checkpoint taken at iteration 2.
    assert t2.last_result["training_iteration"] == 2
    for i in range(1):
        runner.step()

    assert t2.status == Trial.TERMINATED

    # Test recovery of trial that won't be checkpointed
    # (default max_failures, so the node loss is fatal).
    t3 = Trial("__fake", **{"stopping_criterion": {"training_iteration": 3}})
    runner.add_trial(t3)
    runner.step()  # start
    runner.step()  # 1 result
    cluster.add_node(resources=dict(CPU=1))
    cluster.remove_node(node3)
    assert cluster.wait_for_nodes()
    runner.step()  # Error handling step
    assert t3.status == Trial.ERROR

    with pytest.raises(TuneError):
        runner.step()
class TrialRunnerCallbacks(unittest.TestCase):
    """Tests that TrialRunner invokes user callbacks for each executor event.

    Events are injected through a mocked trial executor
    (``next_future_result``); a ``TestCallback`` records, per hook, the
    iteration count and arguments it was called with.
    """

    def setUp(self):
        ray.init()
        self.tmpdir = tempfile.mkdtemp()
        self.callback = TestCallback()
        self.executor = _MockTrialExecutor()
        self.trial_runner = TrialRunner(trial_executor=self.executor,
                                        callbacks=[self.callback])
        # experiment would never be None normally, but it's fine for testing
        self.trial_runner.setup_experiments(experiments=[None],
                                            total_num_samples=1)

    def tearDown(self):
        ray.shutdown()
        _register_all()  # re-register the evicted objects
        if "CUDA_VISIBLE_DEVICES" in os.environ:
            del os.environ["CUDA_VISIBLE_DEVICES"]
        shutil.rmtree(self.tmpdir)

    def testCallbackSteps(self):
        """Steps through start/save/result/restore/complete/fail events and
        checks each callback hook fired at the expected iteration."""
        trials = [
            Trial("__fake", trial_id="one"),
            Trial("__fake", trial_id="two")
        ]
        for t in trials:
            self.trial_runner.add_trial(t)

        # A PG_READY event lets the runner start the next pending trial.
        self.executor.next_future_result = _ExecutorEvent(
            event_type=_ExecutorEventType.PG_READY)
        self.trial_runner.step()

        # Trial 1 has been started
        self.assertEqual(self.callback.state["trial_start"]["iteration"], 0)
        self.assertEqual(self.callback.state["trial_start"]["trial"].trial_id,
                         "one")

        # All these events haven't happened, yet
        self.assertTrue(
            all(k not in self.callback.state for k in [
                "trial_restore",
                "trial_save",
                "trial_result",
                "trial_complete",
                "trial_fail",
                "experiment_end",
            ]))

        self.executor.next_future_result = _ExecutorEvent(
            event_type=_ExecutorEventType.PG_READY)
        self.trial_runner.step()

        # Iteration not increased yet
        self.assertEqual(self.callback.state["step_begin"]["iteration"], 1)
        # Iteration increased
        self.assertEqual(self.callback.state["step_end"]["iteration"], 2)
        # Second trial has been just started
        self.assertEqual(self.callback.state["trial_start"]["iteration"], 1)
        self.assertEqual(self.callback.state["trial_start"]["trial"].trial_id,
                         "two")

        # Just a placeholder object ref for cp.value.
        cp = _TuneCheckpoint(_TuneCheckpoint.PERSISTENT,
                             value=ray.put(1),
                             result={TRAINING_ITERATION: 0})
        trials[0].saving_to = cp

        # Let the first trial save a checkpoint
        self.executor.next_future_result = _ExecutorEvent(
            event_type=_ExecutorEventType.SAVING_RESULT,
            trial=trials[0],
            result={_ExecutorEvent.KEY_FUTURE_RESULT: "__checkpoint"},
        )
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_save"]["iteration"], 2)
        self.assertEqual(self.callback.state["trial_save"]["trial"].trial_id,
                         "one")

        # Let the second trial send a result
        result = {TRAINING_ITERATION: 1, "metric": 800, "done": False}
        self.executor.next_future_result = _ExecutorEvent(
            event_type=_ExecutorEventType.TRAINING_RESULT,
            trial=trials[1],
            result={"future_result": result},
        )
        self.assertTrue(not trials[1].has_reported_at_least_once)
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_result"]["iteration"], 3)
        self.assertEqual(self.callback.state["trial_result"]["trial"].trial_id,
                         "two")
        self.assertEqual(
            self.callback.state["trial_result"]["result"]["metric"], 800)
        self.assertEqual(trials[1].last_result["metric"], 800)

        # Let the second trial restore from a checkpoint
        trials[1].restoring_from = cp
        self.executor.next_future_result = _ExecutorEvent(
            event_type=_ExecutorEventType.RESTORING_RESULT, trial=trials[1])
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_restore"]["iteration"], 4)
        self.assertEqual(
            self.callback.state["trial_restore"]["trial"].trial_id, "two")

        # Let the second trial finish
        trials[1].restoring_from = None
        self.executor.next_future_result = _ExecutorEvent(
            event_type=_ExecutorEventType.TRAINING_RESULT,
            trial=trials[1],
            result={
                _ExecutorEvent.KEY_FUTURE_RESULT: {
                    TRAINING_ITERATION: 2,
                    "metric": 900,
                    "done": True,
                }
            },
        )
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_complete"]["iteration"], 5)
        self.assertEqual(
            self.callback.state["trial_complete"]["trial"].trial_id, "two")

        # Let the first trial error
        self.executor.next_future_result = _ExecutorEvent(
            event_type=_ExecutorEventType.ERROR,
            trial=trials[0],
            result={_ExecutorEvent.KEY_EXCEPTION: Exception()},
        )
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_fail"]["iteration"], 6)
        self.assertEqual(self.callback.state["trial_fail"]["trial"].trial_id,
                         "one")

    def testCallbacksEndToEnd(self):
        """End-to-end tune.run with save/fail/delay trials; checks the
        callback saw setup first, experiment_end last, and per-trial events
        attributed to the right trial config."""

        def train(config):
            if config["do"] == "save":
                with tune.checkpoint_dir(0):
                    pass
                tune.report(metric=1)
            elif config["do"] == "fail":
                raise RuntimeError("I am failing on purpose.")
            elif config["do"] == "delay":
                time.sleep(2)
                tune.report(metric=20)

        config = {"do": tune.grid_search(["save", "fail", "delay"])}

        tune.run(train,
                 config=config,
                 raise_on_failed_trial=False,
                 callbacks=[self.callback])

        self.assertIn("setup", self.callback.state)
        self.assertTrue(self.callback.state["setup"] is not None)
        keys = Experiment.PUBLIC_KEYS.copy()
        keys.add("total_num_samples")
        for key in keys:
            self.assertIn(key, self.callback.state["setup"])
        # check if it was added first
        self.assertTrue(list(self.callback.state)[0] == "setup")
        self.assertEqual(
            self.callback.state["trial_fail"]["trial"].config["do"], "fail")
        self.assertEqual(
            self.callback.state["trial_save"]["trial"].config["do"], "save")
        self.assertEqual(
            self.callback.state["trial_result"]["trial"].config["do"], "delay")
        self.assertEqual(
            self.callback.state["trial_complete"]["trial"].config["do"],
            "delay")
        self.assertIn("experiment_end", self.callback.state)
        # check if it was added last
        self.assertTrue(list(self.callback.state)[-1] == "experiment_end")

    def testCallbackReordering(self):
        """SyncerCallback should come after LoggerCallback callbacks"""

        def get_positions(callbacks):
            # Return indices of the first/last LoggerCallback and of the
            # SyncerCallback within the callback list (None if absent).
            first_logger_pos = None
            last_logger_pos = None
            syncer_pos = None
            for i, callback in enumerate(callbacks):
                if isinstance(callback, LoggerCallback):
                    if first_logger_pos is None:
                        first_logger_pos = i
                    last_logger_pos = i
                elif isinstance(callback, SyncerCallback):
                    syncer_pos = i
            return first_logger_pos, last_logger_pos, syncer_pos

        # Auto creation of loggers, no callbacks, no syncer
        callbacks = create_default_callbacks(None, SyncConfig(), None)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)

        # Auto creation of loggers with callbacks
        callbacks = create_default_callbacks([Callback()], SyncConfig(), None)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)

        # Auto creation of loggers with existing logger (but no CSV/JSON)
        callbacks = create_default_callbacks([LoggerCallback()], SyncConfig(),
                                             None)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)

        # This should throw an error as the syncer comes before the logger
        with self.assertRaises(ValueError):
            callbacks = create_default_callbacks(
                [SyncerCallback(None), LoggerCallback()], SyncConfig(), None)

        # This should be reordered but preserve the regular callback order
        [mc1, mc2, mc3] = [Callback(), Callback(), Callback()]
        # Has to be legacy logger to avoid logger callback creation
        lc = LegacyLoggerCallback(logger_classes=DEFAULT_LOGGERS)
        callbacks = create_default_callbacks([mc1, mc2, lc, mc3], SyncConfig(),
                                             None)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)
        self.assertLess(callbacks.index(mc1), callbacks.index(mc2))
        self.assertLess(callbacks.index(mc2), callbacks.index(mc3))
        self.assertLess(callbacks.index(lc), callbacks.index(mc3))
        # Syncer callback is appended
        self.assertLess(callbacks.index(mc3), syncer_pos)

    @patch.object(warnings, "warn")
    def testCallbackSetupBackwardsCompatible(self, mocked_warning_method):
        """An old-style Callback.setup() with no experiment kwargs is still
        accepted, but a 'Please update' warning is emitted once."""

        class NoExperimentInSetupCallback(Callback):
            # Old method definition didn't take in **experiment.public_spec
            def setup(self):
                return

        callback = NoExperimentInSetupCallback()
        trial_runner = TrialRunner(callbacks=[callback])
        trial_runner.setup_experiments(
            experiments=[Experiment("", lambda x: x)], total_num_samples=1)
        mocked_warning_method.assert_called_once()
        self.assertIn("Please update",
                      mocked_warning_method.call_args_list[0][0][0])
# NOTE(review): this chunk mixes two entry points — the argparse/tf.app.run
# lines run unconditionally (not under the __main__ guard) while the Tune
# example below is guarded. Confirm this ordering is intended.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--data_dir',
    type=str,
    default='/tmp/tensorflow/mnist/input_data',
    help='Directory for storing input data')
FLAGS, unparsed = parser.parse_known_args()
# Hand unparsed CLI args to TensorFlow's app runner, which invokes main().
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

# !!! Example of using the ray.tune Python API !!!
if __name__ == '__main__':
    runner = TrialRunner()
    # One trial per activation function; each runs this file as a script
    # trainable and stops at 99% accuracy or after 600s total.
    for act in ['relu', 'elu', 'tanh']:
        runner.add_trial(
            Trial(
                'mnist',
                'script',
                stopping_criterion={
                    'mean_accuracy': 0.99,
                    'time_total_s': 600
                },
                config={
                    'script_file_path': os.path.abspath(__file__),
                    'script_min_iter_time_s': 1,
                    'activation': act,
                },
                experiment_tag='act={}'.format(act)))
    ray.init()
    # Drive the runner to completion, printing status after each step.
    while not runner.is_finished():
        runner.step()
        print(runner.debug_string())
class TrialRunnerCallbacks(unittest.TestCase):
    """Tests that TrialRunner fires user callbacks at the right iterations.

    Uses a mocked trial executor whose next trial/result/failure is set
    directly by the test; a ``TestCallback`` records what each hook saw.
    """

    def setUp(self):
        os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "1"
        ray.init()
        self.tmpdir = tempfile.mkdtemp()
        self.callback = TestCallback()
        self.executor = _MockTrialExecutor()
        self.trial_runner = TrialRunner(trial_executor=self.executor,
                                        callbacks=[self.callback])

    def tearDown(self):
        ray.shutdown()
        _register_all()  # re-register the evicted objects
        if "CUDA_VISIBLE_DEVICES" in os.environ:
            del os.environ["CUDA_VISIBLE_DEVICES"]
        shutil.rmtree(self.tmpdir)

    def testCallbackSteps(self):
        """Steps through start/save/result/restore/complete/fail and checks
        each callback hook fired with the right trial and iteration."""
        trials = [
            Trial("__fake", trial_id="one"),
            Trial("__fake", trial_id="two")
        ]
        for t in trials:
            self.trial_runner.add_trial(t)

        self.executor.next_trial = trials[0]
        self.trial_runner.step()

        # Trial 1 has been started
        self.assertEqual(self.callback.state["trial_start"]["iteration"], 0)
        self.assertEqual(self.callback.state["trial_start"]["trial"].trial_id,
                         "one")

        # All these events haven't happened, yet
        self.assertTrue(
            all(k not in self.callback.state for k in [
                "trial_restore", "trial_save", "trial_result",
                "trial_complete", "trial_fail"
            ]))

        self.executor.next_trial = trials[1]
        self.trial_runner.step()

        # Iteration not increased yet
        self.assertEqual(self.callback.state["step_begin"]["iteration"], 1)
        # Iteration increased
        self.assertEqual(self.callback.state["step_end"]["iteration"], 2)
        # Second trial has been just started
        self.assertEqual(self.callback.state["trial_start"]["iteration"], 1)
        self.assertEqual(self.callback.state["trial_start"]["trial"].trial_id,
                         "two")

        cp = Checkpoint(Checkpoint.PERSISTENT, "__checkpoint",
                        {TRAINING_ITERATION: 0})

        # Let the first trial save a checkpoint
        self.executor.next_trial = trials[0]
        trials[0].saving_to = cp
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_save"]["iteration"], 2)
        self.assertEqual(self.callback.state["trial_save"]["trial"].trial_id,
                         "one")

        # Let the second trial send a result
        result = {TRAINING_ITERATION: 1, "metric": 800, "done": False}
        self.executor.results[trials[1]] = result
        self.executor.next_trial = trials[1]
        self.assertEqual(trials[1].last_result, {})
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_result"]["iteration"], 3)
        self.assertEqual(self.callback.state["trial_result"]["trial"].trial_id,
                         "two")
        self.assertEqual(
            self.callback.state["trial_result"]["result"]["metric"], 800)
        self.assertEqual(trials[1].last_result["metric"], 800)

        # Let the second trial restore from a checkpoint
        trials[1].restoring_from = cp
        self.executor.results[trials[1]] = trials[1].last_result
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_restore"]["iteration"], 4)
        self.assertEqual(
            self.callback.state["trial_restore"]["trial"].trial_id, "two")

        # Let the second trial finish
        trials[1].restoring_from = None
        self.executor.results[trials[1]] = {
            TRAINING_ITERATION: 2,
            "metric": 900,
            "done": True
        }
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_complete"]["iteration"], 5)
        self.assertEqual(
            self.callback.state["trial_complete"]["trial"].trial_id, "two")

        # Let the first trial error
        self.executor.failed_trial = trials[0]
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_fail"]["iteration"], 6)
        self.assertEqual(self.callback.state["trial_fail"]["trial"].trial_id,
                         "one")

    def testCallbacksEndToEnd(self):
        """End-to-end tune.run with save/fail/delay trials; checks per-trial
        callback events are attributed to the right trial config."""

        def train(config):
            if config["do"] == "save":
                with tune.checkpoint_dir(0):
                    pass
                tune.report(metric=1)
            elif config["do"] == "fail":
                raise RuntimeError("I am failing on purpose.")
            elif config["do"] == "delay":
                time.sleep(2)
                tune.report(metric=20)

        config = {"do": tune.grid_search(["save", "fail", "delay"])}

        tune.run(train,
                 config=config,
                 raise_on_failed_trial=False,
                 callbacks=[self.callback])

        self.assertEqual(
            self.callback.state["trial_fail"]["trial"].config["do"], "fail")
        self.assertEqual(
            self.callback.state["trial_save"]["trial"].config["do"], "save")
        self.assertEqual(
            self.callback.state["trial_result"]["trial"].config["do"], "delay")
        self.assertEqual(
            self.callback.state["trial_complete"]["trial"].config["do"],
            "delay")

    def testCallbackReordering(self):
        """SyncerCallback should come after LoggerCallback callbacks"""

        def get_positions(callbacks):
            # Return indices of the first/last LoggerCallback and of the
            # SyncerCallback within the callback list (None if absent).
            first_logger_pos = None
            last_logger_pos = None
            syncer_pos = None
            for i, callback in enumerate(callbacks):
                if isinstance(callback, LoggerCallback):
                    if first_logger_pos is None:
                        first_logger_pos = i
                    last_logger_pos = i
                elif isinstance(callback, SyncerCallback):
                    syncer_pos = i
            return first_logger_pos, last_logger_pos, syncer_pos

        # Auto creation of loggers, no callbacks, no syncer
        callbacks = create_default_callbacks(None, SyncConfig(), None)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)

        # Auto creation of loggers with callbacks
        callbacks = create_default_callbacks([Callback()], SyncConfig(), None)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)

        # Auto creation of loggers with existing logger (but no CSV/JSON)
        callbacks = create_default_callbacks([LoggerCallback()], SyncConfig(),
                                             None)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)

        # This should throw an error as the syncer comes before the logger
        with self.assertRaises(ValueError):
            callbacks = create_default_callbacks(
                [SyncerCallback(None), LoggerCallback()], SyncConfig(), None)

        # This should be reordered but preserve the regular callback order
        [mc1, mc2, mc3] = [Callback(), Callback(), Callback()]
        # Has to be legacy logger to avoid logger callback creation
        lc = LegacyLoggerCallback(logger_classes=DEFAULT_LOGGERS)
        callbacks = create_default_callbacks([mc1, mc2, lc, mc3], SyncConfig(),
                                             None)
        print(callbacks)
        first_logger_pos, last_logger_pos, syncer_pos = get_positions(
            callbacks)
        self.assertLess(last_logger_pos, syncer_pos)
        self.assertLess(callbacks.index(mc1), callbacks.index(mc2))
        self.assertLess(callbacks.index(mc2), callbacks.index(mc3))
        self.assertLess(callbacks.index(lc), callbacks.index(mc3))
        # Syncer callback is appended
        self.assertLess(callbacks.index(mc3), syncer_pos)
    def testBurnInPeriod(self):
        """PBT (synch=True) must not exploit before `burn_in_period` even if
        `perturbation_interval` has elapsed, and must exploit after it."""
        runner = TrialRunner(trial_executor=MagicMock())

        scheduler = PopulationBasedTraining(time_attr="training_iteration",
                                            metric="error",
                                            mode="min",
                                            perturbation_interval=5,
                                            burn_in_period=50,
                                            log_config=True,
                                            synch=True)

        class MockTrial(Trial):
            # Minimal trial stub: memory-only checkpoint, always PAUSED,
            # status writes are ignored.
            @property
            def checkpoint(self):
                return Checkpoint(Checkpoint.MEMORY, "None", {})

            @property
            def status(self):
                return Trial.PAUSED

            @status.setter
            def status(self, status):
                pass

        trial1 = MockTrial("PPO", config=dict(num=1))
        trial2 = MockTrial("PPO", config=dict(num=2))
        trial3 = MockTrial("PPO", config=dict(num=3))
        trial4 = MockTrial("PPO", config=dict(num=4))

        runner.add_trial(trial1)
        runner.add_trial(trial2)
        runner.add_trial(trial3)
        runner.add_trial(trial4)

        scheduler.on_trial_add(runner, trial1)
        scheduler.on_trial_add(runner, trial2)
        scheduler.on_trial_add(runner, trial3)
        scheduler.on_trial_add(runner, trial4)

        # Add initial results.
        scheduler.on_trial_result(runner, trial1,
                                  result=dict(training_iteration=1, error=50))
        scheduler.on_trial_result(runner, trial2,
                                  result=dict(training_iteration=1, error=50))
        scheduler.on_trial_result(runner, trial3,
                                  result=dict(training_iteration=1, error=10))
        scheduler.on_trial_result(runner, trial4,
                                  result=dict(training_iteration=1, error=100))

        # Add more results. Without burn-in, this would now exploit
        scheduler.on_trial_result(runner, trial1,
                                  result=dict(training_iteration=30, error=50))
        scheduler.on_trial_result(runner, trial2,
                                  result=dict(training_iteration=30, error=50))
        scheduler.on_trial_result(runner, trial3,
                                  result=dict(training_iteration=30, error=10))
        scheduler.on_trial_result(runner, trial4,
                                  result=dict(training_iteration=30,
                                              error=100))

        # Config unchanged: no exploitation happened before the burn-in.
        self.assertEqual(trial4.config["num"], 4)

        # Add more results. Since this is after burn-in, it should now exploit
        scheduler.on_trial_result(runner, trial1,
                                  result=dict(training_iteration=50, error=50))
        scheduler.on_trial_result(runner, trial2,
                                  result=dict(training_iteration=50, error=50))
        scheduler.on_trial_result(runner, trial3,
                                  result=dict(training_iteration=50, error=10))
        scheduler.on_trial_result(runner, trial4,
                                  result=dict(training_iteration=50,
                                              error=100))

        # Worst trial (trial4) adopted the best trial's (trial3) config.
        self.assertEqual(trial4.config["num"], 3)
# NOTE(review): this chunk is truncated at BOTH ends. The first line is the
# tail of a parser.add_argument(...) call whose opening is not visible, and
# the final `if` statement has no body. Tokens reproduced as-is; restore the
# missing pieces from the original file.
    help="The Redis address of the cluster.")
parser.add_argument("--restore", default=None, type=str,
                    help="If specified, restore from this checkpoint.")
parser.add_argument("-f", "--config-file", default=None, type=str,
                    help="If specified, use config options from this file.")

if __name__ == "__main__":
    args = parser.parse_args()
    runner = TrialRunner()

    if args.config_file:
        # Build trials from a YAML experiment spec file.
        with open(args.config_file) as f:
            # NOTE(review): yaml.load without a Loader is unsafe for
            # untrusted input — confirm the file source is trusted.
            config = yaml.load(f)
        for trial in parse_to_trials(config):
            runner.add_trial(trial)
    else:
        # Build a single trial directly from the individual CLI flags.
        runner.add_trial(
            Trial(args.env, args.alg, args.config, args.local_dir, None,
                  args.resources, args.stop, args.checkpoint_freq,
                  args.restore, args.upload_dir))

    ray.init(redis_address=args.redis_address)

    # Drive the runner to completion, printing status after each step.
    while not runner.is_finished():
        runner.step()
        print(runner.debug_string())

    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            # (truncated: the body of this check is missing from the source)
    def testBurnInPeriod(self):
        """PBT (synch=True) must not exploit before `burn_in_period` even if
        `perturbation_interval` has elapsed, must exploit after it, and must
        not hang when choosing trials afterwards."""
        runner = TrialRunner(trial_executor=MagicMock())

        scheduler = PopulationBasedTraining(
            time_attr="training_iteration",
            metric="error",
            mode="min",
            perturbation_interval=5,
            burn_in_period=50,
            log_config=True,
            synch=True,
        )

        class MockTrial(Trial):
            # Minimal trial stub: memory-only checkpoint, always PAUSED,
            # status writes are ignored.
            @property
            def checkpoint(self):
                return _TrackedCheckpoint(
                    dir_or_data={"data": "None"},
                    storage_mode=CheckpointStorage.MEMORY,
                    metrics={},
                )

            @property
            def status(self):
                return Trial.PAUSED

            @status.setter
            def status(self, status):
                pass

        trial1 = MockTrial("PPO", config=dict(num=1))
        trial2 = MockTrial("PPO", config=dict(num=2))
        trial3 = MockTrial("PPO", config=dict(num=3))
        trial4 = MockTrial("PPO", config=dict(num=4))

        runner.add_trial(trial1)
        runner.add_trial(trial2)
        runner.add_trial(trial3)
        runner.add_trial(trial4)

        scheduler.on_trial_add(runner, trial1)
        scheduler.on_trial_add(runner, trial2)
        scheduler.on_trial_add(runner, trial3)
        scheduler.on_trial_add(runner, trial4)

        # Add initial results.
        scheduler.on_trial_result(
            runner, trial1, result=dict(training_iteration=1, error=50)
        )
        scheduler.on_trial_result(
            runner, trial2, result=dict(training_iteration=1, error=50)
        )
        scheduler.on_trial_result(
            runner, trial3, result=dict(training_iteration=1, error=10)
        )
        scheduler.on_trial_result(
            runner, trial4, result=dict(training_iteration=1, error=100)
        )

        # Add more results. Without burn-in, this would now exploit
        scheduler.on_trial_result(
            runner, trial1, result=dict(training_iteration=30, error=50)
        )
        scheduler.on_trial_result(
            runner, trial2, result=dict(training_iteration=30, error=50)
        )
        scheduler.on_trial_result(
            runner, trial3, result=dict(training_iteration=30, error=10)
        )
        scheduler.on_trial_result(
            runner, trial4, result=dict(training_iteration=30, error=100)
        )

        # Config unchanged: no exploitation happened before the burn-in.
        self.assertEqual(trial4.config["num"], 4)

        # Add more results. Since this is after burn-in, it should now exploit
        scheduler.on_trial_result(
            runner, trial1, result=dict(training_iteration=50, error=50)
        )
        scheduler.on_trial_result(
            runner, trial2, result=dict(training_iteration=50, error=50)
        )
        scheduler.on_trial_result(
            runner, trial3, result=dict(training_iteration=50, error=10)
        )
        scheduler.on_trial_result(
            runner, trial4, result=dict(training_iteration=50, error=100)
        )

        # Worst trial (trial4) adopted the best trial's (trial3) config.
        self.assertEqual(trial4.config["num"], 3)

        # Assert that trials do not hang after `burn_in_period`
        self.assertTrue(all(t.status == "PAUSED" for t in runner.get_trials()))
        self.assertTrue(scheduler.choose_trial_to_run(runner))

        # Assert that trials do not hang when a terminated trial is added
        trial5 = Trial("PPO", config=dict(num=5))
        runner.add_trial(trial5)
        scheduler.on_trial_add(runner, trial5)
        trial5.set_status(Trial.TERMINATED)
        self.assertTrue(scheduler.choose_trial_to_run(runner))
def test_migration_checkpoint_removal(start_connected_emptyhead_cluster,
                                      trainable_id):
    """Test checks that trial restarts if checkpoint is lost w/ node fail."""
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    class _SyncerCallback(SyncerCallback):
        # Syncs trial logdirs through a mocked storage client instead of
        # real remote storage.
        def _create_trial_syncer(self, trial: "Trial"):
            client = mock_storage_client()
            return MockNodeSyncer(trial.logdir, trial.logdir, client)

    syncer_callback = _SyncerCallback(None)
    runner = TrialRunner(BasicVariantGenerator(), callbacks=[syncer_callback])
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 4
        },
        "checkpoint_freq": 2,
        "max_failures": 2,
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
    }

    # The following patches only affect __fake_remote.
    def hide_remote_path(path_function):
        # Wrap a TrainableUtil path function so it operates on the local
        # equivalent of a MOCK_REMOTE_DIR-prefixed path.
        def hidden_path_func(checkpoint_path):
            """Converts back to local path first."""
            if MOCK_REMOTE_DIR in checkpoint_path:
                checkpoint_path = checkpoint_path[len(MOCK_REMOTE_DIR):]
                checkpoint_path = os.path.join("/", checkpoint_path)
            return path_function(checkpoint_path)

        return hidden_path_func

    trainable_util = "ray.tune.ray_trial_executor.TrainableUtil"
    _find_ckpt = trainable_util + ".find_checkpoint_dir"
    find_func = TrainableUtil.find_checkpoint_dir
    _pickle_ckpt = trainable_util + ".pickle_checkpoint"
    pickle_func = TrainableUtil.pickle_checkpoint

    with patch(_find_ckpt) as mock_find, patch(_pickle_ckpt) as mock_pkl_ckpt:
        # __fake_remote trainables save to a separate "remote" directory.
        # TrainableUtil will not check this path unless we mock it.
        mock_find.side_effect = hide_remote_path(find_func)
        mock_pkl_ckpt.side_effect = hide_remote_path(pickle_func)

        # Test recovery of trial that has been checkpointed
        t1 = Trial(trainable_id, **kwargs)
        runner.add_trial(t1)

        # Start trial, process result (x2), process save
        for _ in range(4):
            runner.step()
        assert t1.has_checkpoint()

        cluster.add_node(num_cpus=1)
        cluster.remove_node(node)
        cluster.wait_for_nodes()
        # Delete the local checkpoint dir to simulate checkpoint loss.
        shutil.rmtree(os.path.dirname(t1.checkpoint.value))

        runner.step()  # Collect result 3, kick off + fail result 4
        runner.step()  # Dispatch restore
        runner.step()  # Process restore + step 4
        for _ in range(3):
            if t1.status != Trial.TERMINATED:
                runner.step()
        assert t1.status == Trial.TERMINATED, runner.debug_string()
def test_trial_migration(start_connected_emptyhead_cluster, trainable_id):
    """Removing a node while cluster has space should migrate trial.

    The trial state should also be consistent with the checkpoint.
    """
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    # Only sync "__fake" trainables through the per-trial syncer.
    syncer_callback = _PerTrialSyncerCallback(
        lambda trial: trial.trainable_name == "__fake")
    runner = TrialRunner(BasicVariantGenerator(), callbacks=[syncer_callback])
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 4
        },
        "checkpoint_freq": 2,
        "max_failures": 2,
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
    }

    # Test recovery of trial that hasn't been checkpointed
    t = Trial(trainable_id, **kwargs)
    runner.add_trial(t)
    runner.step()  # Start trial
    runner.step()  # Process result
    assert t.last_result
    node2 = cluster.add_node(num_cpus=1)
    cluster.remove_node(node)
    cluster.wait_for_nodes()
    # TODO(ujvl): Node failure does not propagate until a step after it
    # actually should. This is possibly a problem with `Cluster`.
    runner.step()
    runner.step()  # Recovery step

    # TODO(rliaw): This assertion is not critical but will not pass
    # because checkpoint handling is messy and should be refactored
    # rather than hotfixed.
    # assert t.last_result is None, "Trial result not restored correctly."

    # Process result (x2), process save, process result (x2), process save
    for _ in range(6):
        runner.step()

    assert t.status == Trial.TERMINATED, runner.debug_string()

    # Test recovery of trial that has been checkpointed
    t2 = Trial(trainable_id, **kwargs)
    runner.add_trial(t2)
    # Start trial, process result (x2), process save
    for _ in range(4):
        runner.step()
    assert t2.has_checkpoint()
    node3 = cluster.add_node(num_cpus=1)
    cluster.remove_node(node2)
    cluster.wait_for_nodes()
    runner.step()  # Process result 3 + start and fail 4 result
    runner.step()  # Dispatch restore
    runner.step()  # Process restore
    runner.step()  # Process result 5
    if t2.status != Trial.TERMINATED:
        runner.step()  # Process result 6, dispatch save
        runner.step()  # Process save
    assert t2.status == Trial.TERMINATED, runner.debug_string()

    # Test recovery of trial that won't be checkpointed
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 3
        },
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
    }
    t3 = Trial(trainable_id, **kwargs)
    runner.add_trial(t3)
    runner.step()  # Start trial
    runner.step()  # Process result 1
    cluster.add_node(num_cpus=1)
    cluster.remove_node(node3)
    cluster.wait_for_nodes()
    runner.step()  # Error handling step
    if t3.status != Trial.ERROR:
        runner.step()
    assert t3.status == Trial.ERROR, runner.debug_string()

    # All trials are done; any further step must raise.
    with pytest.raises(TuneError):
        runner.step()
def test_trial_migration(start_connected_emptyhead_cluster, trainable_id):
    """Removing a node while cluster has space should migrate trial.

    The trial state should also be consistent with the checkpoint.
    """
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 4
        },
        "checkpoint_freq": 2,
        "max_failures": 2,
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
        # Only sync to driver for the plain "__fake" trainable.
        "sync_to_driver_fn": trainable_id == "__fake",
    }

    # Test recovery of trial that hasn't been checkpointed
    t = Trial(trainable_id, **kwargs)
    runner.add_trial(t)
    runner.step()  # start
    runner.step()  # 1 result
    assert t.last_result
    node2 = cluster.add_node(num_cpus=1)
    cluster.remove_node(node)
    cluster.wait_for_nodes()
    runner.step()  # Recovery step

    # TODO(rliaw): This assertion is not critical but will not pass
    # because checkpoint handling is messy and should be refactored
    # rather than hotfixed.
    # assert t.last_result is None, "Trial result not restored correctly."
    for i in range(4):
        runner.step()

    assert t.status == Trial.TERMINATED

    # Test recovery of trial that has been checkpointed
    t2 = Trial(trainable_id, **kwargs)
    runner.add_trial(t2)
    runner.step()  # start
    runner.step()  # 1 result
    runner.step()  # 2 result and checkpoint
    assert t2.has_checkpoint()
    node3 = cluster.add_node(num_cpus=1)
    cluster.remove_node(node2)
    cluster.wait_for_nodes()
    runner.step()  # 3 result + start and fail 4 result
    runner.step()  # Recovery step
    runner.step()  # Process recovery
    runner.step()  # result
    if t2.status != Trial.TERMINATED:
        runner.step()
    assert t2.status == Trial.TERMINATED, runner.debug_string()

    # Test recovery of trial that won't be checkpointed
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 3
        },
        "remote_checkpoint_dir": MOCK_REMOTE_DIR,
        "sync_to_driver_fn": trainable_id == "__fake",
    }
    t3 = Trial(trainable_id, **kwargs)
    runner.add_trial(t3)
    runner.step()  # start
    runner.step()  # 1 result
    cluster.add_node(num_cpus=1)
    cluster.remove_node(node3)
    cluster.wait_for_nodes()
    runner.step()  # Error handling step
    if t3.status != Trial.ERROR:
        runner.step()
    assert t3.status == Trial.ERROR, runner.debug_string()

    # All trials are done; any further step must raise.
    with pytest.raises(TuneError):
        runner.step()
class TrialRunnerCallbacks(unittest.TestCase):
    """Tests that TrialRunner fires user callbacks at the right iterations.

    Uses a mocked trial executor whose next trial/result/failure is set
    directly by the test; a ``TestCallback`` records what each hook saw.
    """

    def setUp(self):
        self.tmpdir = tempfile.mkdtemp()
        self.callback = TestCallback()
        self.executor = _MockTrialExecutor()
        self.trial_runner = TrialRunner(trial_executor=self.executor,
                                        callbacks=[self.callback])

    def tearDown(self):
        ray.shutdown()
        _register_all()  # re-register the evicted objects
        if "CUDA_VISIBLE_DEVICES" in os.environ:
            del os.environ["CUDA_VISIBLE_DEVICES"]
        shutil.rmtree(self.tmpdir)

    def testCallbackSteps(self):
        """Steps through start/save/result/restore/complete/fail and checks
        each callback hook fired with the right trial and iteration."""
        trials = [
            Trial("__fake", trial_id="one"),
            Trial("__fake", trial_id="two")
        ]
        for t in trials:
            self.trial_runner.add_trial(t)

        self.executor.next_trial = trials[0]
        self.trial_runner.step()

        # Trial 1 has been started
        self.assertEqual(self.callback.state["trial_start"]["iteration"], 0)
        self.assertEqual(self.callback.state["trial_start"]["trial"].trial_id,
                         "one")

        # All these events haven't happened, yet
        self.assertTrue(
            all(k not in self.callback.state for k in [
                "trial_restore", "trial_save", "trial_result",
                "trial_complete", "trial_fail"
            ]))

        self.executor.next_trial = trials[1]
        self.trial_runner.step()

        # Iteration not increased yet
        self.assertEqual(self.callback.state["step_begin"]["iteration"], 1)
        # Iteration increased
        self.assertEqual(self.callback.state["step_end"]["iteration"], 2)
        # Second trial has been just started
        self.assertEqual(self.callback.state["trial_start"]["iteration"], 1)
        self.assertEqual(self.callback.state["trial_start"]["trial"].trial_id,
                         "two")

        cp = Checkpoint(Checkpoint.PERSISTENT, "__checkpoint",
                        {TRAINING_ITERATION: 0})

        # Let the first trial save a checkpoint
        self.executor.next_trial = trials[0]
        trials[0].saving_to = cp
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_save"]["iteration"], 2)
        self.assertEqual(self.callback.state["trial_save"]["trial"].trial_id,
                         "one")

        # Let the second trial send a result
        result = {TRAINING_ITERATION: 1, "metric": 800, "done": False}
        self.executor.results[trials[1]] = result
        self.executor.next_trial = trials[1]
        self.assertEqual(trials[1].last_result, {})
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_result"]["iteration"], 3)
        self.assertEqual(self.callback.state["trial_result"]["trial"].trial_id,
                         "two")
        self.assertEqual(
            self.callback.state["trial_result"]["result"]["metric"], 800)
        self.assertEqual(trials[1].last_result["metric"], 800)

        # Let the second trial restore from a checkpoint
        trials[1].restoring_from = cp
        self.executor.results[trials[1]] = trials[1].last_result
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_restore"]["iteration"], 4)
        self.assertEqual(
            self.callback.state["trial_restore"]["trial"].trial_id, "two")

        # Let the second trial finish
        trials[1].restoring_from = None
        self.executor.results[trials[1]] = {
            TRAINING_ITERATION: 2,
            "metric": 900,
            "done": True
        }
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_complete"]["iteration"], 5)
        self.assertEqual(
            self.callback.state["trial_complete"]["trial"].trial_id, "two")

        # Let the first trial error
        self.executor.failed_trial = trials[0]
        self.trial_runner.step()
        self.assertEqual(self.callback.state["trial_fail"]["iteration"], 6)
        self.assertEqual(self.callback.state["trial_fail"]["trial"].trial_id,
                         "one")

    def testCallbacksEndToEnd(self):
        """End-to-end tune.run with save/fail/delay trials; checks per-trial
        callback events are attributed to the right trial config."""

        def train(config):
            if config["do"] == "save":
                with tune.checkpoint_dir(0):
                    pass
                tune.report(metric=1)
            elif config["do"] == "fail":
                raise RuntimeError("I am failing on purpose.")
            elif config["do"] == "delay":
                time.sleep(2)
                tune.report(metric=20)

        config = {"do": tune.grid_search(["save", "fail", "delay"])}

        tune.run(train,
                 config=config,
                 raise_on_failed_trial=False,
                 callbacks=[self.callback])

        self.assertEqual(
            self.callback.state["trial_fail"]["trial"].config["do"], "fail")
        self.assertEqual(
            self.callback.state["trial_save"]["trial"].config["do"], "save")
        self.assertEqual(
            self.callback.state["trial_result"]["trial"].config["do"], "delay")
        self.assertEqual(
            self.callback.state["trial_complete"]["trial"].config["do"],
            "delay")
def test_trial_migration(start_connected_emptyhead_cluster):
    """Removing a node while cluster has space should migrate trial.

    The trial state should also be consistent with the checkpoint.
    """
    cluster = start_connected_emptyhead_cluster
    first_node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    runner = TrialRunner(BasicVariantGenerator())
    trial_kwargs = {
        "stopping_criterion": {
            "training_iteration": 3
        },
        "checkpoint_freq": 2,
        "max_failures": 2
    }

    # Case 1: migrate a trial that has not checkpointed yet.
    unckpt_trial = Trial("__fake", **trial_kwargs)
    runner.add_trial(unckpt_trial)
    runner.step()  # start
    runner.step()  # 1 result
    assert unckpt_trial.last_result

    second_node = cluster.add_node(num_cpus=1)
    cluster.remove_node(first_node)
    cluster.wait_for_nodes()
    runner.step()  # Recovery step

    # TODO(rliaw): This assertion is not critical but will not pass
    # because checkpoint handling is messy and should be refactored
    # rather than hotfixed.
    # assert t.last_result is None, "Trial result not restored correctly."
    for _ in range(3):
        runner.step()
    assert unckpt_trial.status == Trial.TERMINATED

    # Case 2: migrate a trial that has a checkpoint to restore from.
    ckpt_trial = Trial("__fake", **trial_kwargs)
    runner.add_trial(ckpt_trial)
    runner.step()  # start
    runner.step()  # 1 result
    runner.step()  # 2 result and checkpoint
    assert ckpt_trial.has_checkpoint()

    third_node = cluster.add_node(num_cpus=1)
    cluster.remove_node(second_node)
    cluster.wait_for_nodes()
    runner.step()  # Recovery step
    assert ckpt_trial.last_result["training_iteration"] == 2
    for _ in range(1):
        runner.step()
    assert ckpt_trial.status == Trial.TERMINATED

    # Case 3: a trial that never checkpoints errors out on node loss.
    fragile_trial = Trial("__fake",
                          **{"stopping_criterion": {"training_iteration": 3}})
    runner.add_trial(fragile_trial)
    runner.step()  # start
    runner.step()  # 1 result

    cluster.add_node(num_cpus=1)
    cluster.remove_node(third_node)
    cluster.wait_for_nodes()
    runner.step()  # Error handling step
    assert fragile_trial.status == Trial.ERROR

    with pytest.raises(TuneError):
        runner.step()