def test_cluster_rllib_restore(start_connected_cluster, tmpdir):
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
    script = """
import time
import ray
from ray import tune

ray.init(address="{address}")

tune.run(
    "PG",
    name="experiment",
    config=dict(env="CartPole-v1", framework="tf"),
    stop=dict(training_iteration=10),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1,
    raise_on_failed_trial=False)
""".format(address=cluster.address, checkpoint_dir=dirpath)
    run_string_as_driver_nonblocking(script)

    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(100):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner.
            runner = TrialRunner(
                resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration"):
                break
        time.sleep(0.3)

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    cluster.wait_for_nodes()

    # Restore properly from checkpoint.
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": "PG",
                "checkpoint_freq": 1,
                "local_dir": dirpath,
            }
        },
        resume=True,
    )
    assert all(t.status == Trial.TERMINATED for t in trials2)
    ray.shutdown()
    cluster.shutdown()
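

# All three tests in this file poll for the experiment checkpoint with
# near-identical loops. A minimal sketch of a shared helper, assuming only
# the TrialRunner API the tests already use (the helper name and its
# parameters are hypothetical, not part of the original suite):
def _wait_for_checkpoint(local_checkpoint_dir, predicate, retries=100,
                         delay=0.3):
    """Polls until a TrialRunner checkpoint exists and predicate(trials) holds.

    Returns True on success, or False once the retries are exhausted.
    """
    for _ in range(retries):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner, as the tests do inline.
            runner = TrialRunner(
                resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
            if predicate(runner.get_trials()):
                return True
        time.sleep(delay)
    return False
# Example use, standing in for the polling loop above:
#     assert _wait_for_checkpoint(
#         local_checkpoint_dir,
#         lambda ts: bool(ts and ts[0].last_result and
#                         ts[0].last_result.get("training_iteration")))
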
def test_cluster_interrupt(start_connected_cluster, tmpdir):
    """Tests run_experiment on cluster shutdown with actual interrupt.

    This is an end-to-end test.
    """
    cluster = start_connected_cluster
    dirpath = str(tmpdir)

    # Needs to be in scope for pytest.
    class _Mock(tune.Trainable):
        """Finishes on the 4th iteration."""

        def setup(self, config):
            self.state = {"hi": 0}

        def step(self):
            self.state["hi"] += 1
            time.sleep(0.5)
            return {"done": self.state["hi"] >= 4}

        def save_checkpoint(self, path):
            return self.state

        def load_checkpoint(self, state):
            self.state = state

    # Removes indent from class.
    reformatted = "\n".join(
        line[4:] if len(line) else line
        for line in inspect.getsource(_Mock).split("\n"))

    script = """
import os
import time
import ray
from ray import tune

os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "0"

ray.init(address="{address}")

{fail_class_code}

tune.run(
    {fail_class},
    name="experiment",
    stop=dict(training_iteration=5),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1,
    raise_on_failed_trial=False)
""".format(
        address=cluster.address,
        checkpoint_dir=dirpath,
        fail_class_code=reformatted,
        fail_class=_Mock.__name__,
    )
    run_string_as_driver_nonblocking(script)

    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(50):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner.
            runner = TrialRunner(
                resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration") == 3:
                break
        time.sleep(0.2)

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    Experiment.register_if_needed(_Mock)

    # Inspect the internal trialrunner.
    runner = TrialRunner(
        resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
    trials = runner.get_trials()
    assert trials[0].last_result["training_iteration"] == 3
    assert trials[0].status == Trial.PENDING

    # Restore properly from checkpoint.
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": _Mock,
                "local_dir": dirpath,
                "checkpoint_freq": 1,
            }
        },
        resume=True,
        raise_on_failed_trial=False,
    )
    assert all(t.status == Trial.TERMINATED for t in trials2)
    assert {t.trial_id for t in trials2} == {t.trial_id for t in trials}
    ray.shutdown()
    cluster.shutdown()
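

# The `line[4:]` slicing above assumes every non-empty line of the class
# source is indented by exactly four spaces. A standard-library alternative
# that strips the longest common leading whitespace instead (a sketch;
# `_Mock` is local to the test above, so this cannot run at module level):
#
#     import textwrap
#     reformatted = textwrap.dedent(inspect.getsource(_Mock))
#
# Note that textwrap.dedent ignores all-whitespace lines when computing the
# common prefix, so blank lines inside the class body are preserved.
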
def test_cluster_interrupt_searcher(start_connected_cluster, tmpdir, searcher):
    """Tests restoration of HyperOptSearch experiment on cluster shutdown
    with actual interrupt.

    Restoration should restore both the state of the trials
    and the state of the previous search algorithm (HyperOptSearch).
    This is an end-to-end test.
    """
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    from ray.tune import register_trainable
    register_trainable("trainable", MyTrainableClass)

    def execute_script_with_args(*args):
        current_dir = os.path.dirname(__file__)
        script = os.path.join(current_dir,
                              "_test_cluster_interrupt_searcher.py")
        subprocess.Popen([sys.executable, script] + list(args))

    args = (
        "--ray-address",
        cluster.address,
        "--local-dir",
        dirpath,
        "--searcher",
        searcher,
    )
    execute_script_with_args(*args)

    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    trials = []
    for i in range(100):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner.
            runner = TrialRunner(
                resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            if trials and len(trials) >= 10:
                break
        time.sleep(0.5)
    else:
        raise ValueError(f"Didn't generate enough trials: {len(trials)}")

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError(
            f"Checkpoint file didn't appear in {local_checkpoint_dir}. "
            f"Current list: {os.listdir(local_checkpoint_dir)}.")

    ray.shutdown()
    cluster.shutdown()

    cluster = _start_new_cluster()
    execute_script_with_args(*(args + ("--resume",)))
    time.sleep(2)

    register_trainable("trainable", MyTrainableClass)

    reached = False
    for i in range(100):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner.
            runner = TrialRunner(
                resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            if len(trials) == 0:
                continue  # Nonblocking script hasn't resumed yet; wait.
            reached = True
            assert len(trials) >= 10
            assert len(trials) <= 20
            if len(trials) == 20:
                break
            else:
                stop_fn = runner.trial_executor.stop_trial
                for t in trials:
                    if t.status != Trial.ERROR:
                        stop_fn(t)
        time.sleep(0.5)
    assert reached is True

    ray.shutdown()
    cluster.shutdown()
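

# The `searcher` argument above is not a built-in pytest fixture; in a
# complete test module it would typically be supplied by a parametrize
# decorator along these lines (the searcher names here are illustrative
# assumptions, not necessarily the ones the suite actually exercises):
#
#     import pytest
#
#     @pytest.mark.parametrize("searcher", ["hyperopt", "bayesopt"])
#     def test_cluster_interrupt_searcher(start_connected_cluster, tmpdir,
#                                         searcher):
#         ...
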