def testSavePauseResumeErrorRestore(self):
    """Tests that pause checkpoint does not replace restore checkpoint."""
    trial = Trial("__fake")
    self._simulate_starting_trial(trial)

    self._simulate_getting_result(trial)

    # Save
    self._simulate_saving(trial)

    # Train
    self.trial_executor.continue_training(trial)
    self._simulate_getting_result(trial)

    # Pause
    self.trial_executor.pause_trial(trial)
    self.assertEqual(Trial.PAUSED, trial.status)
    self.assertEqual(trial.checkpoint.storage_mode, CheckpointStorage.MEMORY)

    # Resume
    self._simulate_starting_trial(trial)

    # Error
    trial.set_status(Trial.ERROR)

    # Restore
    self.trial_executor.restore(trial)

    self.trial_executor.stop_trial(trial)
    self.assertEqual(Trial.TERMINATED, trial.status)
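# The `_simulate_starting_trial`, `_simulate_getting_result`, and
# `_simulate_saving` calls above are fixture helpers defined elsewhere in this
# test class and not shown in this excerpt. Below is a minimal, illustrative
# sketch of two of them; it assumes the executor exposes `start_trial` and
# `save` and is NOT the actual fixture code (result delivery in particular
# depends on the executor's event loop and is omitted here).

def _simulate_starting_trial(self, trial):
    # Illustrative sketch: start the remote actor and expect RUNNING status.
    self.assertTrue(self.trial_executor.start_trial(trial))
    self.assertEqual(Trial.RUNNING, trial.status)

def _simulate_saving(self, trial):
    # Illustrative sketch: dispatch a persistent checkpoint for the trial.
    checkpoint = self.trial_executor.save(trial, CheckpointStorage.PERSISTENT)
    self.assertEqual(checkpoint, trial.saving_to)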
def testCheckpointing(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 1},
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    runner.step()  # Start trial
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)

    runner.step()  # Process result, dispatch save
    runner.step()  # Process save, stop trial
    kwargs["restore_path"] = trials[0].checkpoint.dir_or_data
    self.assertEqual(trials[0].status, Trial.TERMINATED)

    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()  # Start trial, dispatch restore
    self.assertEqual(trials[1].status, Trial.RUNNING)

    runner.step()  # Process restore
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[1].runner.get_info.remote()), 1)
    self.addCleanup(shutil.rmtree, trials[0].checkpoint.dir_or_data)
def testTrialErrorResumeFalse(self):
    ray.init(num_cpus=3, local_mode=True, include_dashboard=False)
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir)
    kwargs = {
        "stopping_criterion": {"training_iteration": 4},
        "resources": Resources(cpu=1, gpu=0),
    }
    trials = [
        Trial("__fake", config={"mock_error": True}, **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    runner.checkpoint(force=True)

    assert trials[0].status == Trial.ERROR
    del runner

    new_runner = TrialRunner(resume=True, local_checkpoint_dir=self.tmpdir)
    assert len(new_runner.get_trials()) == 3
    assert Trial.ERROR in (t.status for t in new_runner.get_trials())
def testHasResourcesForTrialWithCaching(self):
    pgm = _PlacementGroupManager()
    pgf1 = PlacementGroupFactory([{"CPU": self.head_cpus}])
    pgf2 = PlacementGroupFactory([{"CPU": self.head_cpus - 1}])

    executor = RayTrialExecutor(reuse_actors=True)
    executor._pg_manager = pgm
    executor.set_max_pending_trials(1)

    def train(config):
        yield 1
        yield 2
        yield 3
        yield 4

    register_trainable("resettable", train)

    trial1 = Trial("resettable", placement_group_factory=pgf1)
    trial2 = Trial("resettable", placement_group_factory=pgf1)
    trial3 = Trial("resettable", placement_group_factory=pgf2)

    assert executor.has_resources_for_trial(trial1)
    assert executor.has_resources_for_trial(trial2)
    assert executor.has_resources_for_trial(trial3)

    executor._stage_and_update_status([trial1, trial2, trial3])

    while not pgm.has_ready(trial1):
        time.sleep(1)
        executor._stage_and_update_status([trial1, trial2, trial3])

    # Fill staging
    executor._stage_and_update_status([trial1, trial2, trial3])

    assert executor.has_resources_for_trial(trial1)
    assert executor.has_resources_for_trial(trial2)
    assert not executor.has_resources_for_trial(trial3)

    executor._start_trial(trial1)
    executor._stage_and_update_status([trial1, trial2, trial3])
    executor.pause_trial(trial1)  # Caches the PG and removes a PG from staging

    assert len(pgm._staging_futures) == 0

    # This will re-schedule a placement group
    pgm.reconcile_placement_groups([trial1, trial2])

    assert len(pgm._staging_futures) == 1
    assert not pgm.can_stage()

    # We should still have resources for this trial as it has a cached PG
    assert executor.has_resources_for_trial(trial1)
    assert executor.has_resources_for_trial(trial2)
    assert not executor.has_resources_for_trial(trial3)
def testTrialNoCheckpointSave(self):
    """Check that non-checkpointing trials *are* saved."""
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    ray.init(num_cpus=3)

    runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0)
    runner.add_trial(
        Trial(
            "__fake",
            trial_id="non_checkpoint",
            stopping_criterion={"training_iteration": 2},
        )
    )

    while not all(t.status == Trial.TERMINATED for t in runner.get_trials()):
        runner.step()

    runner.add_trial(
        Trial(
            "__fake",
            trial_id="checkpoint",
            checkpoint_at_end=True,
            stopping_criterion={"training_iteration": 2},
        )
    )

    while not all(t.status == Trial.TERMINATED for t in runner.get_trials()):
        runner.step()

    runner.add_trial(
        Trial(
            "__fake",
            trial_id="pending",
            stopping_criterion={"training_iteration": 2},
        )
    )

    old_trials = runner.get_trials()
    while not old_trials[2].has_reported_at_least_once:
        runner.step()

    runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir)
    new_trials = runner2.get_trials()
    self.assertEqual(len(new_trials), 3)
    self.assertTrue(runner2.get_trial("non_checkpoint").status == Trial.TERMINATED)
    self.assertTrue(runner2.get_trial("checkpoint").status == Trial.TERMINATED)
    self.assertTrue(runner2.get_trial("pending").status == Trial.PENDING)
    self.assertTrue(runner2.get_trial("pending").has_reported_at_least_once)
    runner2.step()
def testResourceDeadlock(self):
    """Tests that resource deadlock is avoided for heterogeneous PGFs.

    We start 4 trials in a cluster with 2 CPUs. The first two trials
    require 1 CPU each, the third trial 2 CPUs, the fourth trial 1 CPU.

    The second trial needs a bit more time to finish. This means that the
    resources from the first trial will be freed, and the PG of the
    _fourth_ trial becomes ready (not that of the third trial, because that
    requires 2 CPUs - however, one is still occupied by trial 2).

    After the first two trials finished, the FIFOScheduler tries to start
    the third trial. However, it can't be started because its placement
    group is not ready. Instead, the placement group of the fourth trial
    is ready. Thus, we opt to run the fourth trial instead.
    """

    def train(config):
        time.sleep(config["sleep"])
        return 4

    ray.init(num_cpus=2)

    tune.register_trainable("het", train)
    pgf1 = PlacementGroupFactory([{"CPU": 1}])
    pgf2 = PlacementGroupFactory([{"CPU": 2}])

    trial1 = Trial("het", config={"sleep": 0}, placement_group_factory=pgf1)
    trial2 = Trial("het", config={"sleep": 2}, placement_group_factory=pgf1)
    trial3 = Trial("het", config={"sleep": 0}, placement_group_factory=pgf2)
    trial4 = Trial("het", config={"sleep": 0}, placement_group_factory=pgf1)

    runner = TrialRunner(fail_fast=True)
    runner.add_trial(trial1)
    runner.add_trial(trial2)
    runner.add_trial(trial3)
    runner.add_trial(trial4)

    timeout = time.monotonic() + 30
    while not runner.is_finished():
        # We enforce a timeout here
        self.assertLess(
            time.monotonic(), timeout, msg="Ran into a resource deadlock"
        )
        runner.step()
def test_result_grid_future_checkpoint(ray_start_2_cpus, to_object):
    trainable_cls = get_trainable_cls("__fake")
    trial = Trial("__fake", stub=True)
    trial.config = {"some_config": 1}
    trial.last_result = {"some_result": 2, "config": trial.config}

    trainable = ray.remote(trainable_cls).remote()
    ray.get(trainable.set_info.remote({"info": 4}))

    if to_object:
        checkpoint_data = trainable.save_to_object.remote()
    else:
        checkpoint_data = trainable.save.remote()

    trial.on_checkpoint(
        _TrackedCheckpoint(checkpoint_data, storage_mode=CheckpointStorage.MEMORY)
    )
    trial.pickled_error_file = None
    trial.error_file = None
    result_grid = ResultGrid(None)

    # Internal result grid conversion
    result = result_grid._trial_to_result(trial)
    assert isinstance(result.checkpoint, Checkpoint)
    assert isinstance(result.metrics, dict)
    assert isinstance(result.config, dict)
    assert result.metrics_dataframe is None
    assert result.config == {"some_config": 1}
    assert result.metrics["config"] == result.config

    # Load checkpoint data (see ray.rllib.algorithms.mock.MockTrainer definition)
    with result.checkpoint.as_directory() as checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "mock_agent.pkl"), "rb") as f:
            info = pickle.load(f)
            assert info["info"] == 4
def test_migration_checkpoint_removal(
    start_connected_emptyhead_cluster, tmpdir, durable
):
    """Test checks that trial restarts if checkpoint is lost w/ node fail."""
    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    if durable:
        upload_dir = "file://" + str(tmpdir)
        syncer_callback = SyncerCallback()
    else:
        upload_dir = None
        syncer_callback = custom_driver_logdir_callback(str(tmpdir))

    runner = TrialRunner(BasicVariantGenerator(), callbacks=[syncer_callback])
    kwargs = {
        "stopping_criterion": {"training_iteration": 4},
        "checkpoint_freq": 2,
        "max_failures": 2,
        "remote_checkpoint_dir": upload_dir,
    }

    # Test recovery of trial that has been checkpointed
    t1 = Trial("__fake", **kwargs)
    runner.add_trial(t1)

    # Start trial, process result (x2), process save
    while not t1.has_checkpoint():
        runner.step()

    cluster.add_node(num_cpus=1)
    cluster.remove_node(node)
    cluster.wait_for_nodes()

    # Remove checkpoint on "remote" node
    shutil.rmtree(os.path.dirname(t1.checkpoint.dir_or_data))

    if not durable:
        # Recover from driver file
        t1.checkpoint.dir_or_data = os.path.join(
            tmpdir,
            t1.relative_logdir,
            os.path.relpath(t1.checkpoint.dir_or_data, t1.logdir),
        )

    while not runner.is_finished():
        runner.step()

    assert t1.status == Trial.TERMINATED, runner.debug_string()
def set_trial_resources(
    self, trial: Trial, new_resources: Union[Dict, PlacementGroupFactory]
) -> bool:
    """Returns True if new_resources were set."""
    if new_resources:
        logger.info(
            f"Setting trial {trial} resource to {new_resources} "
            f"with {new_resources._bundles}"
        )
        trial.placement_group_factory = None
        trial.update_resources(new_resources)
        # keep track of all trials which had their resources changed
        self._reallocated_trial_ids.add(trial.trial_id)
        return True
    return False
def _testPauseAndStart(self, result_buffer_length):
    """Tests that unpausing works for trials being processed."""
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = f"{result_buffer_length}"
    os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "1"

    # Need a new trial executor so the ENV vars are parsed again
    self.trial_executor = RayTrialExecutor()

    base = max(result_buffer_length, 1)

    trial = Trial("__fake")
    self._simulate_starting_trial(trial)

    self._simulate_getting_result(trial)
    self.assertEqual(trial.last_result.get(TRAINING_ITERATION), base)

    self.trial_executor.pause_trial(trial)
    self.assertEqual(Trial.PAUSED, trial.status)

    self._simulate_starting_trial(trial)

    self._simulate_getting_result(trial)
    self.assertEqual(trial.last_result.get(TRAINING_ITERATION), base * 2)

    self.trial_executor.stop_trial(trial)
    self.assertEqual(Trial.TERMINATED, trial.status)
def create_trial_if_possible(
    self, experiment_spec: Dict, output_path: str
) -> Optional[Trial]:
    logger.debug("creating trial")
    trial_id = Trial.generate_id()
    suggested_config = self.searcher.suggest(trial_id)
    if suggested_config == Searcher.FINISHED:
        self._finished = True
        logger.debug("Searcher has finished.")
        return

    if suggested_config is None:
        return

    spec = copy.deepcopy(experiment_spec)
    spec["config"] = merge_dicts(spec["config"], copy.deepcopy(suggested_config))

    # Create a new trial_id if duplicate trial is created
    flattened_config = resolve_nested_dict(spec["config"])
    self._counter += 1
    tag = "{0}_{1}".format(str(self._counter), format_vars(flattened_config))
    trial = create_trial_from_spec(
        spec,
        output_path,
        self._parser,
        evaluated_params=flatten_dict(suggested_config),
        experiment_tag=tag,
        trial_id=trial_id,
    )
    return trial
def testNoResetTrial(self):
    """Tests that reset handles NotImplemented properly."""
    trial = Trial("__fake")
    self._simulate_starting_trial(trial)
    exists = self.trial_executor.reset_trial(trial, {}, "modified_mock")
    self.assertEqual(exists, False)
    self.assertEqual(Trial.RUNNING, trial.status)
def testMultiStepRun(self):
    ray.init(num_cpus=4, num_gpus=2)
    kwargs = {
        "stopping_criterion": {"training_iteration": 5},
        "resources": Resources(cpu=1, gpu=1),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    snapshot = TrialStatusSnapshot()
    runner = TrialRunner(callbacks=[TrialStatusSnapshotTaker(snapshot)])
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    self.assertTrue(snapshot.all_trials_are_terminated())
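# `TrialStatusSnapshot` and `TrialStatusSnapshotTaker` are small test utilities
# that are not part of this excerpt. A rough sketch of how they could be
# implemented with the public `tune.Callback` API is shown below; the class and
# method names mirror their usage in these tests but are assumptions, not the
# actual helper code.

from ray.tune import Callback


class TrialStatusSnapshot:
    """Records the statuses of all trials after every runner step."""

    def __init__(self):
        self._snapshots = []

    def record(self, trials):
        self._snapshots.append([t.status for t in trials])

    def max_running_trials(self) -> int:
        # Largest number of concurrently RUNNING trials seen in any step.
        return max(
            (statuses.count(Trial.RUNNING) for statuses in self._snapshots),
            default=0,
        )

    def all_trials_are_terminated(self) -> bool:
        if not self._snapshots:
            return False
        return all(status == Trial.TERMINATED for status in self._snapshots[-1])


class TrialStatusSnapshotTaker(Callback):
    """Callback that feeds the snapshot after each TrialRunner step."""

    def __init__(self, snapshot: TrialStatusSnapshot):
        self._snapshot = snapshot

    def on_step_end(self, iteration, trials, **info):
        self._snapshot.record(trials)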
def on_trial_add(self, trial_runner: "trial_runner.TrialRunner", trial: Trial):
    if trial_runner.search_alg is not None and isinstance(
        trial_runner.search_alg, SearchGenerator
    ):
        raise ValueError(
            "Search algorithms cannot be used with {} "
            "schedulers. Please remove {}.".format(
                self.__class__.__name__, trial_runner.search_alg
            )
        )

    if not self._metric or not self._metric_op:
        raise ValueError(
            "{} has been instantiated without a valid `metric` ({}) or "
            "`mode` ({}) parameter. Either pass these parameters when "
            "instantiating the scheduler, or pass them as parameters "
            "to `tune.run()`".format(
                self.__class__.__name__, self._metric, self._mode
            )
        )

    self._trial_state[trial] = _PBTTrialState(trial)

    for attr in self._hyperparam_mutations.keys():
        if attr not in trial.config:
            if log_once(attr + "-missing"):
                logger.debug(
                    "Cannot find {} in config. Using search "
                    "space provided by hyperparam_mutations.".format(attr)
                )
            # Add attr to trial's config by sampling search space from
            # hyperparam_mutations.
            _fill_config(trial.config, attr, self._hyperparam_mutations[attr])
            # Make sure this attribute is added to CLI output.
            trial.evaluated_params[attr] = trial.config[attr]
def testFailureRecoveryEnabled(self):
    ray.init(num_cpus=1, num_gpus=1)
    searchalg, scheduler = create_mock_components()

    runner = TrialRunner(searchalg, scheduler=scheduler)
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 1,
        "config": {
            "mock_error": True,
        },
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()

    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[0].num_failures, 1)
    self.assertEqual(len(searchalg.errored_trials), 0)
    # Notice this is 1 since during recovery, the previously errored trial
    # is "requeued". This will call scheduler.on_trial_error.
    # Searcher.on_trial_error is, however, not called in this process.
    self.assertEqual(len(scheduler.errored_trials), 1)
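# `create_mock_components` is defined elsewhere in the test module. A hedged
# sketch of what it might look like is below: a scheduler and a search
# algorithm that merely record errored trials so the assertions above can
# inspect them. The exact hooks and import paths used here are assumptions
# based on the public FIFOScheduler / BasicVariantGenerator interfaces.

from ray.tune.schedulers import FIFOScheduler
from ray.tune.search.basic_variant import BasicVariantGenerator


def create_mock_components():
    class _MockScheduler(FIFOScheduler):
        def __init__(self):
            super().__init__()
            self.errored_trials = []

        def on_trial_error(self, trial_runner, trial):
            # Record every trial the runner reports as errored.
            self.errored_trials.append(trial)

    class _MockSearchAlg(BasicVariantGenerator):
        def __init__(self):
            super().__init__()
            self.errored_trials = []

        def on_trial_complete(self, trial_id, result=None, error=False, **kwargs):
            if error:
                self.errored_trials.append(trial_id)

    return _MockSearchAlg(), _MockScheduler()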
def testUserCheckpoint(self):
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "1"  # Don't finish early
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    ray.init(num_cpus=3)
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0)
    runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 2}))
    trials = runner.get_trials()

    runner.step()  # Start trial
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)
    runner.step()  # Process result
    self.assertFalse(trials[0].has_checkpoint())
    runner.step()  # Process result
    self.assertFalse(trials[0].has_checkpoint())
    runner.step()  # Process result, dispatch save
    runner.step()  # Process save
    self.assertTrue(trials[0].has_checkpoint())

    runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=self.tmpdir)
    runner2.step()  # 5: Start trial and dispatch restore
    trials2 = runner2.get_trials()
    self.assertEqual(ray.get(trials2[0].runner.get_info.remote()), 1)
def testCheckpointFreqBuffered(self):
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "7"
    os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "1"

    def num_checkpoints(trial):
        return sum(
            item.startswith("checkpoint_") for item in os.listdir(trial.logdir)
        )

    ray.init(num_cpus=2)

    trial = Trial("__fake", checkpoint_freq=3)
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir, checkpoint_period=0)
    runner.add_trial(trial)

    runner.step()  # start trial
    runner.step()  # run iteration 1-3
    runner.step()  # process save
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 3)
    self.assertEqual(num_checkpoints(trial), 1)

    runner.step()  # run iteration 4-6
    runner.step()  # process save
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 6)
    self.assertEqual(num_checkpoints(trial), 2)

    runner.step()  # run iteration 7-9
    runner.step()  # process save
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 9)
    self.assertEqual(num_checkpoints(trial), 3)
def testStepHook(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner()

    def on_step_begin(self, trialrunner):
        self._resource_updater.update_avail_resources()
        cnt = self.pre_step if hasattr(self, "pre_step") else 0
        self.pre_step = cnt + 1

    def on_step_end(self, trialrunner):
        cnt = self.post_step if hasattr(self, "post_step") else 0
        self.post_step = 1 + cnt

    import types

    runner.trial_executor.on_step_begin = types.MethodType(
        on_step_begin, runner.trial_executor
    )
    runner.trial_executor.on_step_end = types.MethodType(
        on_step_end, runner.trial_executor
    )

    kwargs = {
        "stopping_criterion": {"training_iteration": 5},
        "resources": Resources(cpu=1, gpu=1),
    }
    runner.add_trial(Trial("__fake", **kwargs))
    runner.step()
    self.assertEqual(runner.trial_executor.pre_step, 1)
    self.assertEqual(runner.trial_executor.post_step, 1)
def testCheckpointOverwrite(self):
    def count_checkpoints(cdir):
        return sum(
            (fname.startswith("experiment_state") and fname.endswith(".json"))
            for fname in os.listdir(cdir)
        )

    ray.init(num_cpus=2)

    trial = Trial("__fake", checkpoint_freq=1)
    tmpdir = tempfile.mkdtemp()
    runner = TrialRunner(local_checkpoint_dir=tmpdir, checkpoint_period=0)
    runner.add_trial(trial)
    for _ in range(5):
        runner.step()
    # force checkpoint
    runner.checkpoint()
    self.assertEqual(count_checkpoints(tmpdir), 1)

    runner2 = TrialRunner(resume="LOCAL", local_checkpoint_dir=tmpdir)
    for _ in range(5):
        runner2.step()
    self.assertEqual(count_checkpoints(tmpdir), 2)

    runner2.checkpoint()
    self.assertEqual(count_checkpoints(tmpdir), 2)
    shutil.rmtree(tmpdir)
def test_trial_requeue(start_connected_emptyhead_cluster, tmpdir, durable):
    """Removing a node in full cluster causes Trial to be requeued."""
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    cluster = start_connected_emptyhead_cluster
    node = cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()

    if durable:
        upload_dir = "file://" + str(tmpdir)
        syncer_callback = SyncerCallback()
    else:
        upload_dir = None
        syncer_callback = custom_driver_logdir_callback(str(tmpdir))

    runner = TrialRunner(BasicVariantGenerator(), callbacks=[syncer_callback])  # noqa
    kwargs = {
        "stopping_criterion": {"training_iteration": 5},
        "checkpoint_freq": 1,
        "max_failures": 1,
        "remote_checkpoint_dir": upload_dir,
    }

    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()  # Start trial
    runner.step()  # Process result, dispatch save
    runner.step()  # Process save

    running_trials = _get_running_trials(runner)
    assert len(running_trials) == 1
    assert _check_trial_running(running_trials[0])

    cluster.remove_node(node)
    cluster.wait_for_nodes()
    time.sleep(0.1)  # Sleep so that next step() refreshes cluster resources

    runner.step()  # Process result, dispatch save
    runner.step()  # Process save (detect error), requeue trial

    assert all(t.status == Trial.PENDING for t in trials), runner.debug_string()
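# `_get_running_trials` and `_check_trial_running` are tiny helpers from the
# test module that are not shown in this excerpt. Plausible implementations
# (assumptions that simply match how they are used above) could be:


def _get_running_trials(runner):
    # All trials the runner currently reports as RUNNING.
    return [t for t in runner.get_trials() if t.status == Trial.RUNNING]


def _check_trial_running(trial):
    # The real helper may additionally ping the remote actor; this sketch only
    # checks the runner-side bookkeeping status.
    return trial.status == Trial.RUNNING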
def testBestTrialStr(self):
    """Assert that custom nested parameter columns are printed correctly"""
    config = {"nested": {"conf": "nested_value"}, "toplevel": "toplevel_value"}

    trial = Trial("", config=config, stub=True)
    trial.last_result = {"metric": 1, "config": config}

    result = best_trial_str(trial, "metric")
    self.assertIn("nested_value", result)

    result = best_trial_str(trial, "metric", parameter_columns=["nested/conf"])
    self.assertIn("nested_value", result)
def testStopTrial(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 5},
        "resources": Resources(cpu=1, gpu=1),
    }
    trials = [
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)
    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.PENDING)

    # Stop trial while running
    runner.stop_trial(trials[0])
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.PENDING)

    # Stop trial while pending
    runner.stop_trial(trials[-1])
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.TERMINATED)

    time.sleep(2)  # Wait for stopped placement group to free resources
    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[2].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.TERMINATED)
def testErrorHandling(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 1},
        "resources": Resources(cpu=1, gpu=1),
    }
    _global_registry.register(TRAINABLE_CLASS, "asdf", None)
    trials = [Trial("asdf", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[1].status, Trial.RUNNING)
def testExtraCustomResources(self):
    ray.init(num_cpus=4, num_gpus=2, resources={"a": 2})
    # Since each trial will occupy the full custom resources,
    # at most one trial can run at any given moment.
    snapshot = TrialStatusSnapshot()
    runner = TrialRunner(callbacks=[TrialStatusSnapshotTaker(snapshot)])
    kwargs = {
        "stopping_criterion": {"training_iteration": 1},
        "placement_group_factory": PlacementGroupFactory([{"CPU": 1}, {"a": 2}]),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    self.assertLess(snapshot.max_running_trials(), 2)
    self.assertTrue(snapshot.all_trials_are_terminated())
def testExtraResources(self):
    ray.init(num_cpus=4, num_gpus=2)
    snapshot = TrialStatusSnapshot()
    runner = TrialRunner(callbacks=[TrialStatusSnapshotTaker(snapshot)])
    kwargs = {
        "stopping_criterion": {"training_iteration": 1},
        "placement_group_factory": PlacementGroupFactory(
            [{"CPU": 1}, {"CPU": 3, "GPU": 1}]
        ),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    self.assertLess(snapshot.max_running_trials(), 2)
    self.assertTrue(snapshot.all_trials_are_terminated())
def next_trial(self):
    spec = self._experiment.spec
    trial = None
    if self._index < spec["num_samples"]:
        trial = Trial(spec.get("run"), stopping_criterion=spec.get("stop"))
    self._index += 1

    if self._index > 4:
        self.set_finished()
    return trial
def testAsyncSave(self):
    """Tests that the saved checkpoint value is not immediately set."""
    trial = Trial("__fake")
    self._simulate_starting_trial(trial)

    self._simulate_getting_result(trial)

    self._simulate_saving(trial)

    self.trial_executor.stop_trial(trial)
    self.assertEqual(Trial.TERMINATED, trial.status)
def testSaveRestore(self):
    trial = Trial("__fake")
    self._simulate_starting_trial(trial)

    self._simulate_getting_result(trial)

    self._simulate_saving(trial)

    self.trial_executor.restore(trial)
    self.trial_executor.stop_trial(trial)
    self.assertEqual(Trial.TERMINATED, trial.status)
def testCheckpointAtEndNotBuffered(self):
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "7"
    os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "0.5"

    def num_checkpoints(trial):
        return sum(
            item.startswith("checkpoint_") for item in os.listdir(trial.logdir)
        )

    ray.init(num_cpus=2)

    trial = Trial(
        "__fake",
        checkpoint_at_end=True,
        stopping_criterion={"training_iteration": 4},
    )
    observer = TrialResultObserver()
    runner = TrialRunner(
        local_checkpoint_dir=self.tmpdir,
        checkpoint_period=0,
        trial_executor=RayTrialExecutor(result_buffer_length=7),
        callbacks=[observer],
    )
    runner.add_trial(trial)

    while not observer.just_received_a_result():
        runner.step()
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 1)
    self.assertEqual(num_checkpoints(trial), 0)

    while True:
        runner.step()
        if observer.just_received_a_result():
            break
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 2)
    self.assertEqual(num_checkpoints(trial), 0)

    while True:
        runner.step()
        if observer.just_received_a_result():
            break
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 3)
    self.assertEqual(num_checkpoints(trial), 0)

    while True:
        runner.step()
        if observer.just_received_a_result():
            break
    self.assertEqual(trial.last_result[TRAINING_ITERATION], 4)

    while not runner.is_finished():
        runner.step()
    self.assertEqual(num_checkpoints(trial), 1)
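# `TrialResultObserver` is another small test utility that is not shown here.
# A hedged sketch using the public `tune.Callback` API: it flips a flag
# whenever a trial reports a result, and `just_received_a_result()` reads and
# clears that flag. Names mirror the usage above but are assumptions.

from ray.tune import Callback


class TrialResultObserver(Callback):
    def __init__(self):
        self._just_received_a_result = False

    def on_trial_result(self, iteration, trials, trial, result, **info):
        self._just_received_a_result = True

    def just_received_a_result(self) -> bool:
        # Read-and-reset, so each reported result is observed exactly once.
        received = self._just_received_a_result
        self._just_received_a_result = False
        return received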
def testPauseResume(self):
    """Tests that pausing works for trials in flight."""
    trial = Trial("__fake")

    self._simulate_starting_trial(trial)

    self.trial_executor.pause_trial(trial)
    self.assertEqual(Trial.PAUSED, trial.status)

    self._simulate_starting_trial(trial)

    self.trial_executor.stop_trial(trial)
    self.assertEqual(Trial.TERMINATED, trial.status)