def _add_trials(self, name, spec): """Add trial by invoking TrialRunner.""" resource = {} resource["trials"] = [] trial_generator = BasicVariantGenerator() trial_generator.add_configurations({name: spec}) for trial in trial_generator.next_trials(): runner.add_trial(trial) resource["trials"].append(self._trial_info(trial)) return resource
def execute_command(self, args): def get_trial(): trial = runner.get_trial(args["trial_id"]) if trial is None: error = "Trial ({}) not found.".format(args["trial_id"]) raise TuneManagerError(error) else: return trial command = args["command"] response = {} try: if command == TuneClient.GET_LIST: response["trials"] = [ self.trial_info(t) for t in runner.get_trials() ] elif command == TuneClient.GET_TRIAL: trial = get_trial() response["trial_info"] = self.trial_info(trial) elif command == TuneClient.STOP: trial = get_trial() runner.request_stop_trial(trial) elif command == TuneClient.ADD: name = args["name"] spec = args["spec"] trial_generator = BasicVariantGenerator() trial_generator.add_configurations({name: spec}) for trial in trial_generator.next_trials(): runner.add_trial(trial) else: raise TuneManagerError("Unknown command.") status = True except TuneError as e: status = False response["message"] = str(e) return status, response
def run_experiments(experiments, search_alg=None, scheduler=None, with_server=False, server_port=TuneServer.DEFAULT_PORT, verbose=True, resume=False, queue_trials=False, trial_executor=None, raise_on_failed_trial=True): """Runs and blocks until all trials finish. Args: experiments (Experiment | list | dict): Experiments to run. Will be passed to `search_alg` via `add_configurations`. search_alg (SearchAlgorithm): Search Algorithm. Defaults to BasicVariantGenerator. scheduler (TrialScheduler): Scheduler for executing the experiment. Choose among FIFO (default), MedianStopping, AsyncHyperBand, and HyperBand. with_server (bool): Starts a background Tune server. Needed for using the Client API. server_port (int): Port number for launching TuneServer. verbose (bool): How much output should be printed for each trial. resume (bool|"prompt"): If checkpoint exists, the experiment will resume from there. If resume is "prompt", Tune will prompt if checkpoint detected. queue_trials (bool): Whether to queue trials when the cluster does not currently have enough resources to launch one. This should be set to True when running on an autoscaling cluster to enable automatic scale-up. trial_executor (TrialExecutor): Manage the execution of trials. raise_on_failed_trial (bool): Raise TuneError if there exists failed trial (of ERROR state) when the experiments complete. Examples: >>> experiment_spec = Experiment("experiment", my_func) >>> run_experiments(experiments=experiment_spec) >>> experiment_spec = {"experiment": {"run": my_func}} >>> run_experiments(experiments=experiment_spec) >>> run_experiments( >>> experiments=experiment_spec, >>> scheduler=MedianStoppingRule(...)) >>> run_experiments( >>> experiments=experiment_spec, >>> search_alg=SearchAlgorithm(), >>> scheduler=MedianStoppingRule(...)) Returns: List of Trial objects, holding data for each executed trial. """ # This is important to do this here # because it schematize the experiments # and it conducts the implicit registration. experiments = convert_to_experiment_list(experiments) checkpoint_dir = _find_checkpoint_dir(experiments) runner = None restore = False if os.path.exists( os.path.join(checkpoint_dir, TrialRunner.CKPT_FILE_NAME)): if resume == "prompt": msg = ("Found incomplete experiment at {}. " "Would you like to resume it?".format(checkpoint_dir)) restore = click.confirm(msg, default=False) if restore: logger.info("Tip: to always resume, " "pass resume=True to run_experiments()") else: logger.info("Tip: to always start a new experiment, " "pass resume=False to run_experiments()") elif resume: restore = True else: logger.info( "Tip: to resume incomplete experiments, " "pass resume='prompt' or resume=True to run_experiments()") else: logger.info( "Did not find checkpoint file in {}.".format(checkpoint_dir)) if restore: runner = try_restore_runner(checkpoint_dir, search_alg, scheduler, trial_executor) else: logger.info("Starting a new experiment.") if not runner: if scheduler is None: scheduler = FIFOScheduler() if search_alg is None: search_alg = BasicVariantGenerator() search_alg.add_configurations(experiments) runner = TrialRunner( search_alg, scheduler=scheduler, metadata_checkpoint_dir=checkpoint_dir, launch_web_server=with_server, server_port=server_port, verbose=verbose, queue_trials=queue_trials, trial_executor=trial_executor) print(runner.debug_string(max_debug=99999)) last_debug = 0 while not runner.is_finished(): runner.step() if time.time() - last_debug > DEBUG_PRINT_INTERVAL: print(runner.debug_string()) last_debug = time.time() print(runner.debug_string(max_debug=99999)) wait_for_log_sync() errored_trials = [] for trial in runner.get_trials(): if trial.status != Trial.TERMINATED: errored_trials += [trial] if errored_trials: if raise_on_failed_trial: raise TuneError("Trials did not complete", errored_trials) else: logger.error("Trials did not complete: %s", errored_trials) return runner.get_trials()
def generate_trials(self, spec, name): suggester = BasicVariantGenerator() suggester.add_configurations({name: spec}) return suggester.next_trials()
def test_trial_migration(start_connected_emptyhead_cluster, trainable_id): """Removing a node while cluster has space should migrate trial. The trial state should also be consistent with the checkpoint. """ cluster = start_connected_emptyhead_cluster node = cluster.add_node(num_cpus=1) cluster.wait_for_nodes() syncer_callback = _PerTrialSyncerCallback( lambda trial: trial.trainable_name == "__fake") runner = TrialRunner(BasicVariantGenerator(), callbacks=[syncer_callback]) kwargs = { "stopping_criterion": { "training_iteration": 4 }, "checkpoint_freq": 2, "max_failures": 2, "remote_checkpoint_dir": MOCK_REMOTE_DIR, } # Test recovery of trial that hasn't been checkpointed t = Trial(trainable_id, **kwargs) runner.add_trial(t) runner.step() # Start trial runner.step() # Process result assert t.last_result node2 = cluster.add_node(num_cpus=1) cluster.remove_node(node) cluster.wait_for_nodes() # TODO(ujvl): Node failure does not propagate until a step after it # actually should. This is possibly a problem with `Cluster`. runner.step() runner.step() # Recovery step # TODO(rliaw): This assertion is not critical but will not pass # because checkpoint handling is messy and should be refactored # rather than hotfixed. # assert t.last_result is None, "Trial result not restored correctly." # Process result (x2), process save, process result (x2), process save for _ in range(6): runner.step() assert t.status == Trial.TERMINATED, runner.debug_string() # Test recovery of trial that has been checkpointed t2 = Trial(trainable_id, **kwargs) runner.add_trial(t2) # Start trial, process result (x2), process save for _ in range(4): runner.step() assert t2.has_checkpoint() node3 = cluster.add_node(num_cpus=1) cluster.remove_node(node2) cluster.wait_for_nodes() runner.step() # Process result 3 + start and fail 4 result runner.step() # Dispatch restore runner.step() # Process restore runner.step() # Process result 5 if t2.status != Trial.TERMINATED: runner.step() # Process result 6, dispatch save runner.step() # Process save assert t2.status == Trial.TERMINATED, runner.debug_string() # Test recovery of trial that won't be checkpointed kwargs = { "stopping_criterion": { "training_iteration": 3 }, "remote_checkpoint_dir": MOCK_REMOTE_DIR, } t3 = Trial(trainable_id, **kwargs) runner.add_trial(t3) runner.step() # Start trial runner.step() # Process result 1 cluster.add_node(num_cpus=1) cluster.remove_node(node3) cluster.wait_for_nodes() runner.step() # Error handling step if t3.status != Trial.ERROR: runner.step() assert t3.status == Trial.ERROR, runner.debug_string() with pytest.raises(TuneError): runner.step()
def test_migration_checkpoint_removal(start_connected_emptyhead_cluster, trainable_id): """Test checks that trial restarts if checkpoint is lost w/ node fail.""" cluster = start_connected_emptyhead_cluster node = cluster.add_node(num_cpus=1) cluster.wait_for_nodes() class _SyncerCallback(SyncerCallback): def _create_trial_syncer(self, trial: "Trial"): client = mock_storage_client() return MockNodeSyncer(trial.logdir, trial.logdir, client) syncer_callback = _SyncerCallback(None) runner = TrialRunner(BasicVariantGenerator(), callbacks=[syncer_callback]) kwargs = { "stopping_criterion": { "training_iteration": 4 }, "checkpoint_freq": 2, "max_failures": 2, "remote_checkpoint_dir": MOCK_REMOTE_DIR, } # The following patches only affect __fake_remote. def hide_remote_path(path_function): def hidden_path_func(checkpoint_path): """Converts back to local path first.""" if MOCK_REMOTE_DIR in checkpoint_path: checkpoint_path = checkpoint_path[len(MOCK_REMOTE_DIR):] checkpoint_path = os.path.join("/", checkpoint_path) return path_function(checkpoint_path) return hidden_path_func trainable_util = "ray.tune.ray_trial_executor.TrainableUtil" _find_ckpt = trainable_util + ".find_checkpoint_dir" find_func = TrainableUtil.find_checkpoint_dir _pickle_ckpt = trainable_util + ".pickle_checkpoint" pickle_func = TrainableUtil.pickle_checkpoint with patch(_find_ckpt) as mock_find, patch(_pickle_ckpt) as mock_pkl_ckpt: # __fake_remote trainables save to a separate "remote" directory. # TrainableUtil will not check this path unless we mock it. mock_find.side_effect = hide_remote_path(find_func) mock_pkl_ckpt.side_effect = hide_remote_path(pickle_func) # Test recovery of trial that has been checkpointed t1 = Trial(trainable_id, **kwargs) runner.add_trial(t1) # Start trial, process result (x2), process save for _ in range(4): runner.step() assert t1.has_checkpoint() cluster.add_node(num_cpus=1) cluster.remove_node(node) cluster.wait_for_nodes() shutil.rmtree(os.path.dirname(t1.checkpoint.value)) runner.step() # Collect result 3, kick off + fail result 4 runner.step() # Dispatch restore runner.step() # Process restore + step 4 for _ in range(3): if t1.status != Trial.TERMINATED: runner.step() assert t1.status == Trial.TERMINATED, runner.debug_string()
def get_run_params(tuning_config, param_config): params = {} if "benchmarking_params" in tuning_config: from benchmarking_trainable import BenchmarkingTrainable params["run_or_experiment"] = BenchmarkingTrainable else: from tuning_trainable import TuningTrainable from tuning_callbacks import TuningCallbacks params["run_or_experiment"] = TuningTrainable params["callbacks"] = [TuningCallbacks(tuning_config, param_config)] params["name"] = tuning_config["name"] params["num_samples"] = tuning_config["samples"] if 'tuning_timeout' in tuning_config: params["time_budget_s"] = tuning_config['tuning_timeout'] params["verbose"] = tuning_config["logging_level"] #params["stop"] = { "training_iteration": tuning_config["stages"] } params["resources_per_trial"] = { "cpu": multiprocessing.cpu_count() / tuning_config["max_concurrent_trials"], "gpu": 0 } params["mode"] = "max" params["metric"] = "score" finalize_param_config(tuning_config, param_config) params["config"] = param_config if tuning_config["use_wandb"]: from ray.tune.integration.wandb import WandbLoggerCallback callbacks = params["callbacks"] = params.get("callbacks", []) wandb_cfg_f = open(os.path.expanduser("~") + "/.netrc", "r") # last line, last word, without \n at the end wandb_api_key = wandb_cfg_f.readlines()[-1].split(' ')[-1][:-1] callbacks.append( WandbLoggerCallback(project="ralph-tuning" if "benchmarking_params" not in tuning_config else "ralph-benchmarking", api_key=wandb_api_key, group=tuning_config["name"], log_config=False)) os.environ["WANDB_START_METHOD"] = "thread" # silent=true is very important, do not delete; # otherwise, the whole tuning gets mysteriously stuck os.environ["WANDB_SILENT"] = "true" params["reuse_actors"] = True points_to_evaluate = tuning_config.get("initial_param_configs", None) max_concurrent = tuning_config["max_concurrent_trials"] if tuning_config["optimizer"] == "RANDOM": from ray.tune.suggest import BasicVariantGenerator params["search_alg"] = BasicVariantGenerator( points_to_evaluate=points_to_evaluate, max_concurrent=max_concurrent) elif tuning_config["optimizer"] == "SKOPT": from ray.tune.suggest.skopt import SkOptSearch params["search_alg"] = SkOptSearch( metric="score", mode="max", points_to_evaluate=points_to_evaluate, convert_to_python=True, max_concurrent=max_concurrent) elif tuning_config["optimizer"] == "HEBO": from ray.tune.suggest.hebo import HEBOSearch params["search_alg"] = HEBOSearch( metric="score", mode="max", points_to_evaluate=points_to_evaluate, max_concurrent=max_concurrent) return params
def run(run_or_experiment, name=None, stop=None, config=None, resources_per_trial=None, num_samples=1, local_dir=None, upload_dir=None, trial_name_creator=None, loggers=None, sync_to_cloud=None, sync_to_driver=None, checkpoint_freq=0, checkpoint_at_end=False, keep_checkpoints_num=None, checkpoint_score_attr=None, global_checkpoint_period=10, export_formats=None, max_failures=3, restore=None, search_alg=None, scheduler=None, with_server=False, server_port=TuneServer.DEFAULT_PORT, verbose=2, resume=False, queue_trials=False, reuse_actors=False, trial_executor=None, raise_on_failed_trial=True, return_trials=False, ray_auto_init=True, sync_function=None): """Executes training. Args: run_or_experiment (function|class|str|Experiment): If function|class|str, this is the algorithm or model to train. This may refer to the name of a built-on algorithm (e.g. RLLib's DQN or PPO), a user-defined trainable function or class, or the string identifier of a trainable function or class registered in the tune registry. If Experiment, then Tune will execute training based on Experiment.spec. name (str): Name of experiment. stop (dict): The stopping criteria. The keys may be any field in the return result of 'train()', whichever is reached first. Defaults to empty dict. config (dict): Algorithm-specific configuration for Tune variant generation (e.g. env, hyperparams). Defaults to empty dict. Custom search algorithms may ignore this. resources_per_trial (dict): Machine resources to allocate per trial, e.g. ``{"cpu": 64, "gpu": 8}``. Note that GPUs will not be assigned unless you specify them here. Defaults to 1 CPU and 0 GPUs in ``Trainable.default_resource_request()``. num_samples (int): Number of times to sample from the hyperparameter space. Defaults to 1. If `grid_search` is provided as an argument, the grid will be repeated `num_samples` of times. local_dir (str): Local dir to save training results to. Defaults to ``~/ray_results``. upload_dir (str): Optional URI to sync training results to (e.g. ``s3://bucket``). trial_name_creator (func): Optional function for generating the trial string representation. loggers (list): List of logger creators to be used with each Trial. If None, defaults to ray.tune.logger.DEFAULT_LOGGERS. See `ray/tune/logger.py`. sync_to_cloud (func|str): Function for syncing the local_dir to and from upload_dir. If string, then it must be a string template that includes `{source}` and `{target}` for the syncer to run. If not provided, the sync command defaults to standard S3 or gsutil sync comamnds. sync_to_driver (func|str): Function for syncing trial logdir from remote node to local. If string, then it must be a string template that includes `{source}` and `{target}` for the syncer to run. If not provided, defaults to using rsync. checkpoint_freq (int): How many training iterations between checkpoints. A value of 0 (default) disables checkpointing. checkpoint_at_end (bool): Whether to checkpoint at the end of the experiment regardless of the checkpoint_freq. Default is False. keep_checkpoints_num (int): Number of checkpoints to keep. A value of `None` keeps all checkpoints. Defaults to `None`. If set, need to provide `checkpoint_score_attr`. checkpoint_score_attr (str): Specifies by which attribute to rank the best checkpoint. Default is increasing order. If attribute starts with `min-` it will rank attribute in decreasing order, i.e. `min-validation_loss`. global_checkpoint_period (int): Seconds between global checkpointing. This does not affect `checkpoint_freq`, which specifies frequency for individual trials. export_formats (list): List of formats that exported at the end of the experiment. Default is None. max_failures (int): Try to recover a trial from its last checkpoint at least this many times. Only applies if checkpointing is enabled. Setting to -1 will lead to infinite recovery retries. Defaults to 3. restore (str): Path to checkpoint. Only makes sense to set if running 1 trial. Defaults to None. search_alg (SearchAlgorithm): Search Algorithm. Defaults to BasicVariantGenerator. scheduler (TrialScheduler): Scheduler for executing the experiment. Choose among FIFO (default), MedianStopping, AsyncHyperBand, and HyperBand. with_server (bool): Starts a background Tune server. Needed for using the Client API. server_port (int): Port number for launching TuneServer. verbose (int): 0, 1, or 2. Verbosity mode. 0 = silent, 1 = only status updates, 2 = status and trial results. resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", or bool. LOCAL/True restores the checkpoint from the local_checkpoint_dir. REMOTE restores the checkpoint from remote_checkpoint_dir. PROMPT provides CLI feedback. False forces a new experiment. If resume is set but checkpoint does not exist, ValueError will be thrown. queue_trials (bool): Whether to queue trials when the cluster does not currently have enough resources to launch one. This should be set to True when running on an autoscaling cluster to enable automatic scale-up. reuse_actors (bool): Whether to reuse actors between different trials when possible. This can drastically speed up experiments that start and stop actors often (e.g., PBT in time-multiplexing mode). This requires trials to have the same resource requirements. trial_executor (TrialExecutor): Manage the execution of trials. raise_on_failed_trial (bool): Raise TuneError if there exists failed trial (of ERROR state) when the experiments complete. ray_auto_init (bool): Automatically starts a local Ray cluster if using a RayTrialExecutor (which is the default) and if Ray is not initialized. Defaults to True. sync_function: Deprecated. See `sync_to_cloud` and `sync_to_driver`. Returns: List of Trial objects. Raises: TuneError if any trials failed and `raise_on_failed_trial` is True. Examples: >>> tune.run(mytrainable, scheduler=PopulationBasedTraining()) >>> tune.run(mytrainable, num_samples=5, reuse_actors=True) >>> tune.run( "PG", num_samples=5, config={ "env": "CartPole-v0", "lr": tune.sample_from(lambda _: np.random.rand()) } ) """ trial_executor = trial_executor or RayTrialExecutor( queue_trials=queue_trials, reuse_actors=reuse_actors, ray_auto_init=ray_auto_init) experiment = run_or_experiment if not isinstance(run_or_experiment, Experiment): run_identifier = Experiment._register_if_needed(run_or_experiment) experiment = Experiment(name=name, run=run_identifier, stop=stop, config=config, resources_per_trial=resources_per_trial, num_samples=num_samples, local_dir=local_dir, upload_dir=upload_dir, sync_to_driver=sync_to_driver, trial_name_creator=trial_name_creator, loggers=loggers, checkpoint_freq=checkpoint_freq, checkpoint_at_end=checkpoint_at_end, keep_checkpoints_num=keep_checkpoints_num, checkpoint_score_attr=checkpoint_score_attr, export_formats=export_formats, max_failures=max_failures, restore=restore, sync_function=sync_function) else: logger.debug("Ignoring some parameters passed into tune.run.") if sync_to_cloud: assert experiment.remote_checkpoint_dir, ( "Need `upload_dir` if `sync_to_cloud` given.") runner = TrialRunner( search_alg=search_alg or BasicVariantGenerator(), scheduler=scheduler or FIFOScheduler(), local_checkpoint_dir=experiment.checkpoint_dir, remote_checkpoint_dir=experiment.remote_checkpoint_dir, sync_to_cloud=sync_to_cloud, checkpoint_period=global_checkpoint_period, resume=resume, launch_web_server=with_server, server_port=server_port, verbose=bool(verbose > 1), trial_executor=trial_executor) runner.add_experiment(experiment) if verbose: print(runner.debug_string(max_debug=99999)) last_debug = 0 while not runner.is_finished(): runner.step() if time.time() - last_debug > DEBUG_PRINT_INTERVAL: if verbose: print(runner.debug_string()) last_debug = time.time() try: runner.checkpoint(force=True) except Exception: logger.exception("Trial Runner checkpointing failed.") if verbose: print(runner.debug_string(max_debug=99999)) wait_for_sync() errored_trials = [] for trial in runner.get_trials(): if trial.status != Trial.TERMINATED: errored_trials += [trial] if errored_trials: if raise_on_failed_trial: raise TuneError("Trials did not complete", errored_trials) else: logger.error("Trials did not complete: %s", errored_trials) trials = runner.get_trials() if return_trials: return trials logger.info("Returning an analysis object by default. You can call " "`analysis.trials` to retrieve a list of trials. " "This message will be removed in future versions of Tune.") return ExperimentAnalysis(runner.checkpoint_file, trials=trials)
def __init__(self, search_alg=None, scheduler=None, local_checkpoint_dir=None, remote_checkpoint_dir=None, sync_to_cloud=None, stopper=None, resume=False, server_port=None, fail_fast=False, verbose=True, checkpoint_period=None, trial_executor=None): self._search_alg = search_alg or BasicVariantGenerator() self._scheduler_alg = scheduler or FIFOScheduler() self.trial_executor = trial_executor or RayTrialExecutor() # For debugging, it may be useful to halt trials after some time has # elapsed. TODO(ekl) consider exposing this in the API. self._global_time_limit = float( os.environ.get("TRIALRUNNER_WALLTIME_LIMIT", float("inf"))) self._total_time = 0 self._iteration = 0 self._has_errored = False self._fail_fast = fail_fast if isinstance(self._fail_fast, str): self._fail_fast = self._fail_fast.upper() if self._fail_fast == TrialRunner.RAISE: logger.warning( "fail_fast='raise' detected. Be careful when using this " "mode as resources (such as Ray processes, " "file descriptors, and temporary files) may not be " "cleaned up properly. To use " "a safer mode, use fail_fast=True.") else: raise ValueError("fail_fast must be one of {bool, RAISE}. " f"Got {self._fail_fast}.") self._verbose = verbose self._server = None self._server_port = server_port if server_port is not None: self._server = TuneServer(self, self._server_port) self._trials = [] self._cached_trial_decisions = {} self._stop_queue = [] self._should_stop_experiment = False # used by TuneServer self._local_checkpoint_dir = local_checkpoint_dir if self._local_checkpoint_dir: os.makedirs(self._local_checkpoint_dir, exist_ok=True) self._remote_checkpoint_dir = remote_checkpoint_dir self._syncer = get_cloud_syncer(local_checkpoint_dir, remote_checkpoint_dir, sync_to_cloud) self._stopper = stopper or NoopStopper() self._resumed = False if self._validate_resume(resume_type=resume): errored_only = False if isinstance(resume, str): errored_only = resume.upper() == "ERRORED_ONLY" try: self.resume(run_errored_only=errored_only) self._resumed = True except Exception as e: if self._verbose: logger.error(str(e)) logger.exception("Runner restore failed.") if self._fail_fast: raise logger.info("Restarting experiment.") else: logger.debug("Starting a new experiment.") self._start_time = time.time() self._last_checkpoint_time = -float("inf") if checkpoint_period is None: checkpoint_period = env_integer("TUNE_GLOBAL_CHECKPOINT_S", 10) self._checkpoint_period = checkpoint_period self._session_str = datetime.fromtimestamp( self._start_time).strftime("%Y-%m-%d_%H-%M-%S") self.checkpoint_file = None if self._local_checkpoint_dir: self.checkpoint_file = os.path.join( self._local_checkpoint_dir, TrialRunner.CKPT_FILE_TMPL.format(self._session_str))
def test_trial_migration(start_connected_emptyhead_cluster): """Removing a node while cluster has space should migrate trial. The trial state should also be consistent with the checkpoint. """ cluster = start_connected_emptyhead_cluster node = cluster.add_node(num_cpus=1) cluster.wait_for_nodes() runner = TrialRunner(BasicVariantGenerator()) kwargs = { "stopping_criterion": { "training_iteration": 3 }, "checkpoint_freq": 2, "max_failures": 2 } # Test recovery of trial that hasn't been checkpointed t = Trial("__fake", **kwargs) runner.add_trial(t) runner.step() # start runner.step() # 1 result assert t.last_result node2 = cluster.add_node(num_cpus=1) cluster.remove_node(node) cluster.wait_for_nodes() runner.step() # Recovery step # TODO(rliaw): This assertion is not critical but will not pass # because checkpoint handling is messy and should be refactored # rather than hotfixed. # assert t.last_result is None, "Trial result not restored correctly." for i in range(3): runner.step() assert t.status == Trial.TERMINATED # Test recovery of trial that has been checkpointed t2 = Trial("__fake", **kwargs) runner.add_trial(t2) runner.step() # start runner.step() # 1 result runner.step() # 2 result and checkpoint assert t2.has_checkpoint() node3 = cluster.add_node(num_cpus=1) cluster.remove_node(node2) cluster.wait_for_nodes() runner.step() # Recovery step assert t2.last_result["training_iteration"] == 2 for i in range(1): runner.step() assert t2.status == Trial.TERMINATED # Test recovery of trial that won't be checkpointed t3 = Trial("__fake", **{"stopping_criterion": {"training_iteration": 3}}) runner.add_trial(t3) runner.step() # start runner.step() # 1 result cluster.add_node(num_cpus=1) cluster.remove_node(node3) cluster.wait_for_nodes() runner.step() # Error handling step assert t3.status == Trial.ERROR with pytest.raises(TuneError): runner.step()
def __init__(self, search_alg=None, scheduler=None, launch_web_server=False, local_checkpoint_dir=None, remote_checkpoint_dir=None, sync_to_cloud=None, resume=False, server_port=TuneServer.DEFAULT_PORT, verbose=True, checkpoint_period=10, trial_executor=None): """Initializes a new TrialRunner. Args: search_alg (SearchAlgorithm): SearchAlgorithm for generating Trial objects. scheduler (TrialScheduler): Defaults to FIFOScheduler. launch_web_server (bool): Flag for starting TuneServer local_checkpoint_dir (str): Path where global checkpoints are stored and restored from. remote_checkpoint_dir (str): Remote path where global checkpoints are stored and restored from. Used if `resume` == REMOTE. resume (str|False): see `tune.py:run`. sync_to_cloud (func|str): see `tune.py:run`. server_port (int): Port number for launching TuneServer. verbose (bool): Flag for verbosity. If False, trial results will not be output. trial_executor (TrialExecutor): Defaults to RayTrialExecutor. """ self._search_alg = search_alg or BasicVariantGenerator() self._scheduler_alg = scheduler or FIFOScheduler() self.trial_executor = trial_executor or RayTrialExecutor() # For debugging, it may be useful to halt trials after some time has # elapsed. TODO(ekl) consider exposing this in the API. self._global_time_limit = float( os.environ.get("TRIALRUNNER_WALLTIME_LIMIT", float("inf"))) self._total_time = 0 self._iteration = 0 self._verbose = verbose self._server = None self._server_port = server_port if launch_web_server: self._server = TuneServer(self, self._server_port) self._trials = [] self._stop_queue = [] self._local_checkpoint_dir = local_checkpoint_dir if self._local_checkpoint_dir and not os.path.exists( self._local_checkpoint_dir): os.makedirs(self._local_checkpoint_dir) self._remote_checkpoint_dir = remote_checkpoint_dir self._syncer = get_syncer(local_checkpoint_dir, remote_checkpoint_dir, sync_to_cloud) self._resumed = False if self._validate_resume(resume_type=resume): try: self.resume() logger.info("Resuming trial.") self._resumed = True except Exception: logger.exception( "Runner restore failed. Restarting experiment.") else: logger.debug("Starting a new experiment.") self._start_time = time.time() self._last_checkpoint_time = -float("inf") self._checkpoint_period = checkpoint_period self._session_str = datetime.fromtimestamp( self._start_time).strftime("%Y-%m-%d_%H-%M-%S") self.checkpoint_file = None if self._local_checkpoint_dir: self.checkpoint_file = os.path.join( self._local_checkpoint_dir, TrialRunner.CKPT_FILE_TMPL.format(self._session_str))
def __init__(self, search_alg=None, scheduler=None, local_checkpoint_dir=None, remote_checkpoint_dir=None, sync_to_cloud=None, stopper=None, resume=False, server_port=None, fail_fast=False, checkpoint_period=None, trial_executor=None, callbacks=None, metric=None): self._search_alg = search_alg or BasicVariantGenerator() self._scheduler_alg = scheduler or FIFOScheduler() self.trial_executor = trial_executor or RayTrialExecutor() self._pending_trial_queue_times = {} # Set the number of maximum pending trials max_pending_trials = os.getenv("TUNE_MAX_PENDING_TRIALS_PG", "auto") if max_pending_trials == "auto": # Auto detect if isinstance(self._search_alg, BasicVariantGenerator): self._max_pending_trials = 1000 else: self._max_pending_trials = 1 else: # Manual override self._max_pending_trials = int(max_pending_trials) self.trial_executor.set_max_pending_trials(self._max_pending_trials) self._metric = metric if "TRIALRUNNER_WALLTIME_LIMIT" in os.environ: raise ValueError( "The TRIALRUNNER_WALLTIME_LIMIT environment variable is " "deprecated. " "Use `tune.run(time_budget_s=limit)` instead.") self._total_time = 0 self._iteration = 0 self._has_errored = False self._fail_fast = fail_fast if isinstance(self._fail_fast, str): self._fail_fast = self._fail_fast.upper() if self._fail_fast == TrialRunner.RAISE: warnings.warn( "fail_fast='raise' detected. Be careful when using this " "mode as resources (such as Ray processes, " "file descriptors, and temporary files) may not be " "cleaned up properly. To use " "a safer mode, use fail_fast=True.") else: raise ValueError("fail_fast must be one of {bool, RAISE}. " f"Got {self._fail_fast}.") self._server = None self._server_port = server_port if server_port is not None: self._server = TuneServer(self, self._server_port) self._trials = [] self._cached_trial_decisions = {} self._queued_trial_decisions = {} self._updated_queue = False self._stop_queue = [] self._should_stop_experiment = False # used by TuneServer self._local_checkpoint_dir = local_checkpoint_dir if self._local_checkpoint_dir: os.makedirs(self._local_checkpoint_dir, exist_ok=True) self._remote_checkpoint_dir = remote_checkpoint_dir self._syncer = get_cloud_syncer(local_checkpoint_dir, remote_checkpoint_dir, sync_to_cloud) self._stopper = stopper or NoopStopper() self._resumed = False if self._validate_resume(resume_type=resume): errored_only = False if isinstance(resume, str): errored_only = resume.upper() == "ERRORED_ONLY" try: self.resume(run_errored_only=errored_only) self._resumed = True except Exception as e: if has_verbosity(Verbosity.V3_TRIAL_DETAILS): logger.error(str(e)) logger.exception("Runner restore failed.") if self._fail_fast: raise logger.info("Restarting experiment.") else: logger.debug("Starting a new experiment.") self._start_time = time.time() self._last_checkpoint_time = -float("inf") self._session_str = datetime.fromtimestamp( self._start_time).strftime("%Y-%m-%d_%H-%M-%S") self.checkpoint_file = None if self._local_checkpoint_dir: self.checkpoint_file = os.path.join( self._local_checkpoint_dir, TrialRunner.CKPT_FILE_TMPL.format(self._session_str)) self._callbacks = CallbackList(callbacks or []) self._callbacks.setup() if checkpoint_period is None: checkpoint_period = os.getenv("TUNE_GLOBAL_CHECKPOINT_S", "auto") self._checkpoint_period = checkpoint_period self._checkpoint_manager = self._create_checkpoint_manager()
def _tune_run(self, config, resources_per_trial): """Wrapper to call ``tune.run``. Multiple estimators are generated when early stopping is possible, whereas a single estimator is generated when early stopping is not possible. Args: config (dict): Configurations such as hyperparameters to run ``tune.run`` on. resources_per_trial (dict): Resources to use per trial within Ray. Accepted keys are `cpu`, `gpu` and custom resources, and values are integers specifying the number of each resource to use. Returns: analysis (`ExperimentAnalysis`): Object returned by `tune.run`. """ if self.seed is not None: random.seed(self.seed) np.random.seed(self.seed) trainable = _Trainable if self.pipeline_auto_early_stop and check_is_pipeline( self.estimator) and self.early_stopping: trainable = _PipelineTrainable max_iter = self.max_iters if self.early_stopping is not None: config["estimator_list"] = [ clone(self.estimator) for _ in range(self.n_splits) ] if hasattr(self.early_stopping, "_max_t_attr"): # we want to delegate stopping to schedulers which # support it, but we want it to stop eventually, just in case # the solution is to make the stop condition very big max_iter = self.max_iters * 10 else: config["estimator_list"] = [self.estimator] stopper = MaximumIterationStopper(max_iter=max_iter) if self.stopper: stopper = CombinedStopper(stopper, self.stopper) run_args = dict(scheduler=self.early_stopping, reuse_actors=True, verbose=self.verbose, stop=stopper, num_samples=self.n_trials, config=config, fail_fast="raise", resources_per_trial=resources_per_trial, local_dir=os.path.expanduser(self.local_dir), loggers=self.loggers, time_budget_s=self.time_budget_s) if self.search_optimization == "random": if isinstance(self.param_distributions, list): search_algo = RandomListSearcher(self.param_distributions) else: search_algo = BasicVariantGenerator() run_args["search_alg"] = search_algo else: search_space = None override_search_space = True if self._is_param_distributions_all_tune_domains(): run_args["config"].update(self.param_distributions) override_search_space = False search_kwargs = self.search_kwargs.copy() search_kwargs.update(metric=self._metric_name, mode="max") if self.search_optimization == "bayesian": from ray.tune.suggest.skopt import SkOptSearch if override_search_space: search_space = self.param_distributions search_algo = SkOptSearch(space=search_space, **search_kwargs) run_args["search_alg"] = search_algo elif self.search_optimization == "bohb": from ray.tune.suggest.bohb import TuneBOHB if override_search_space: search_space = self._get_bohb_config_space() if self.seed: warnings.warn("'seed' is not implemented for BOHB.") search_algo = TuneBOHB(space=search_space, **search_kwargs) # search_algo = TuneBOHB( # space=search_space, seed=self.seed, **search_kwargs) run_args["search_alg"] = search_algo elif self.search_optimization == "optuna": from ray.tune.suggest.optuna import OptunaSearch from optuna.samplers import TPESampler sampler = TPESampler(seed=self.seed) if override_search_space: search_space = self._get_optuna_params() search_algo = OptunaSearch(space=search_space, sampler=sampler, **search_kwargs) run_args["search_alg"] = search_algo elif self.search_optimization == "hyperopt": from ray.tune.suggest.hyperopt import HyperOptSearch if override_search_space: search_space = self._get_hyperopt_params() search_algo = HyperOptSearch(space=search_space, random_state_seed=self.seed, **search_kwargs) run_args["search_alg"] = search_algo else: # This should not happen as we validate the input before # this method. Still, just to be sure, raise an error here. raise ValueError( f"Invalid search optimizer: {self.search_optimization}") if isinstance(self.n_jobs, int) and self.n_jobs > 0 \ and not self.search_optimization == "random": search_algo = ConcurrencyLimiter(search_algo, max_concurrent=self.n_jobs) run_args["search_alg"] = search_algo with warnings.catch_warnings(): warnings.filterwarnings("ignore", message="fail_fast='raise' " "detected.") analysis = tune.run(trainable, **run_args) return analysis
def run( run_or_experiment, name=None, metric=None, mode=None, stop=None, time_budget_s=None, config=None, resources_per_trial=None, num_samples=1, local_dir=None, search_alg=None, scheduler=None, keep_checkpoints_num=None, checkpoint_score_attr=None, checkpoint_freq=0, checkpoint_at_end=False, verbose=Verbosity.V3_TRIAL_DETAILS, progress_reporter=None, log_to_file=False, trial_name_creator=None, trial_dirname_creator=None, sync_config=None, export_formats=None, max_failures=0, fail_fast=False, restore=None, server_port=None, resume=False, queue_trials=False, reuse_actors=False, trial_executor=None, raise_on_failed_trial=True, callbacks=None, # Deprecated args loggers=None, ray_auto_init=None, run_errored_only=None, global_checkpoint_period=None, with_server=None, upload_dir=None, sync_to_cloud=None, sync_to_driver=None, sync_on_checkpoint=None, ): """Executes training. Examples: .. code-block:: python # Run 10 trials (each trial is one instance of a Trainable). Tune runs # in parallel and automatically determines concurrency. tune.run(trainable, num_samples=10) # Run 1 trial, stop when trial has reached 10 iterations tune.run(my_trainable, stop={"training_iteration": 10}) # automatically retry failed trials up to 3 times tune.run(my_trainable, stop={"training_iteration": 10}, max_failures=3) # Run 1 trial, search over hyperparameters, stop after 10 iterations. space = {"lr": tune.uniform(0, 1), "momentum": tune.uniform(0, 1)} tune.run(my_trainable, config=space, stop={"training_iteration": 10}) # Resumes training if a previous machine crashed tune.run(my_trainable, config=space, local_dir=<path/to/dir>, resume=True) # Rerun ONLY failed trials after an experiment is finished. tune.run(my_trainable, config=space, local_dir=<path/to/dir>, resume="ERRORED_ONLY") Args: run_or_experiment (function | class | str | :class:`Experiment`): If function|class|str, this is the algorithm or model to train. This may refer to the name of a built-on algorithm (e.g. RLLib's DQN or PPO), a user-defined trainable function or class, or the string identifier of a trainable function or class registered in the tune registry. If Experiment, then Tune will execute training based on Experiment.spec. If you want to pass in a Python lambda, you will need to first register the function: ``tune.register_trainable("lambda_id", lambda x: ...)``. You can then use ``tune.run("lambda_id")``. metric (str): Metric to optimize. This metric should be reported with `tune.report()`. If set, will be passed to the search algorithm and scheduler. mode (str): Must be one of [min, max]. Determines whether objective is minimizing or maximizing the metric attribute. If set, will be passed to the search algorithm and scheduler. name (str): Name of experiment. stop (dict | callable | :class:`Stopper`): Stopping criteria. If dict, the keys may be any field in the return result of 'train()', whichever is reached first. If function, it must take (trial_id, result) as arguments and return a boolean (True if trial should be stopped, False otherwise). This can also be a subclass of ``ray.tune.Stopper``, which allows users to implement custom experiment-wide stopping (i.e., stopping an entire Tune run based on some time constraint). time_budget_s (int|float|datetime.timedelta): Global time budget in seconds after which all trials are stopped. Can also be a ``datetime.timedelta`` object. config (dict): Algorithm-specific configuration for Tune variant generation (e.g. env, hyperparams). Defaults to empty dict. Custom search algorithms may ignore this. resources_per_trial (dict): Machine resources to allocate per trial, e.g. ``{"cpu": 64, "gpu": 8}``. Note that GPUs will not be assigned unless you specify them here. Defaults to 1 CPU and 0 GPUs in ``Trainable.default_resource_request()``. num_samples (int): Number of times to sample from the hyperparameter space. Defaults to 1. If `grid_search` is provided as an argument, the grid will be repeated `num_samples` of times. If this is -1, (virtually) infinite samples are generated until a stopping condition is met. local_dir (str): Local dir to save training results to. Defaults to ``~/ray_results``. search_alg (Searcher|SearchAlgorithm): Search algorithm for optimization. scheduler (TrialScheduler): Scheduler for executing the experiment. Choose among FIFO (default), MedianStopping, AsyncHyperBand, HyperBand and PopulationBasedTraining. Refer to ray.tune.schedulers for more options. keep_checkpoints_num (int): Number of checkpoints to keep. A value of `None` keeps all checkpoints. Defaults to `None`. If set, need to provide `checkpoint_score_attr`. checkpoint_score_attr (str): Specifies by which attribute to rank the best checkpoint. Default is increasing order. If attribute starts with `min-` it will rank attribute in decreasing order, i.e. `min-validation_loss`. checkpoint_freq (int): How many training iterations between checkpoints. A value of 0 (default) disables checkpointing. This has no effect when using the Functional Training API. checkpoint_at_end (bool): Whether to checkpoint at the end of the experiment regardless of the checkpoint_freq. Default is False. This has no effect when using the Functional Training API. verbose (Union[int, Verbosity]): 0, 1, 2, or 3. Verbosity mode. 0 = silent, 1 = only status updates, 2 = status and brief trial results, 3 = status and detailed trial results. Defaults to 3. progress_reporter (ProgressReporter): Progress reporter for reporting intermediate experiment progress. Defaults to CLIReporter if running in command-line, or JupyterNotebookReporter if running in a Jupyter notebook. log_to_file (bool|str|Sequence): Log stdout and stderr to files in Tune's trial directories. If this is `False` (default), no files are written. If `true`, outputs are written to `trialdir/stdout` and `trialdir/stderr`, respectively. If this is a single string, this is interpreted as a file relative to the trialdir, to which both streams are written. If this is a Sequence (e.g. a Tuple), it has to have length 2 and the elements indicate the files to which stdout and stderr are written, respectively. trial_name_creator (Callable[[Trial], str]): Optional function for generating the trial string representation. trial_dirname_creator (Callable[[Trial], str]): Function for generating the trial dirname. This function should take in a Trial object and return a string representing the name of the directory. The return value cannot be a path. sync_config (SyncConfig): Configuration object for syncing. See tune.SyncConfig. export_formats (list): List of formats that exported at the end of the experiment. Default is None. max_failures (int): Try to recover a trial at least this many times. Ray will recover from the latest checkpoint if present. Setting to -1 will lead to infinite recovery retries. Setting to 0 will disable retries. Defaults to 0. fail_fast (bool | str): Whether to fail upon the first error. If fail_fast='raise' provided, Tune will automatically raise the exception received by the Trainable. fail_fast='raise' can easily leak resources and should be used with caution (it is best used with `ray.init(local_mode=True)`). restore (str): Path to checkpoint. Only makes sense to set if running 1 trial. Defaults to None. server_port (int): Port number for launching TuneServer. resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", "ERRORED_ONLY", or bool. LOCAL/True restores the checkpoint from the local_checkpoint_dir, determined by `name` and `local_dir`. REMOTE restores the checkpoint from remote_checkpoint_dir. PROMPT provides CLI feedback. False forces a new experiment. ERRORED_ONLY resets and reruns ERRORED trials upon resume - previous trial artifacts will be left untouched. If resume is set but checkpoint does not exist, ValueError will be thrown. queue_trials (bool): Whether to queue trials when the cluster does not currently have enough resources to launch one. This should be set to True when running on an autoscaling cluster to enable automatic scale-up. reuse_actors (bool): Whether to reuse actors between different trials when possible. This can drastically speed up experiments that start and stop actors often (e.g., PBT in time-multiplexing mode). This requires trials to have the same resource requirements. trial_executor (TrialExecutor): Manage the execution of trials. raise_on_failed_trial (bool): Raise TuneError if there exists failed trial (of ERROR state) when the experiments complete. callbacks (list): List of callbacks that will be called at different times in the training loop. Must be instances of the ``ray.tune.callback.Callback`` class. If not passed, `LoggerCallback` and `SyncerCallback` callbacks are automatically added. Returns: ExperimentAnalysis: Object for experiment analysis. Raises: TuneError: Any trials failed and `raise_on_failed_trial` is True. """ all_start = time.time() if global_checkpoint_period: raise ValueError("global_checkpoint_period is deprecated. Set env var " "'TUNE_GLOBAL_CHECKPOINT_S' instead.") if ray_auto_init: raise ValueError("ray_auto_init is deprecated. " "Set env var 'TUNE_DISABLE_AUTO_INIT=1' instead or " "call 'ray.init' before calling 'tune.run'.") if with_server: raise ValueError( "with_server is deprecated. It is now enabled by default " "if 'server_port' is not None.") if sync_on_checkpoint or sync_to_cloud or sync_to_driver or upload_dir: raise ValueError( "sync_on_checkpoint / sync_to_cloud / sync_to_driver / " "upload_dir must now be set via `tune.run(" "sync_config=SyncConfig(...)`. See `ray.tune.SyncConfig` for " "more details.") if mode and mode not in ["min", "max"]: raise ValueError( "The `mode` parameter passed to `tune.run()` has to be one of " "['min', 'max']") set_verbosity(verbose) config = config or {} sync_config = sync_config or SyncConfig() set_sync_periods(sync_config) if num_samples == -1: num_samples = sys.maxsize trial_executor = trial_executor or RayTrialExecutor( reuse_actors=reuse_actors, queue_trials=queue_trials) if isinstance(run_or_experiment, list): experiments = run_or_experiment else: experiments = [run_or_experiment] for i, exp in enumerate(experiments): if not isinstance(exp, Experiment): experiments[i] = Experiment( name=name, run=exp, stop=stop, time_budget_s=time_budget_s, config=config, resources_per_trial=resources_per_trial, num_samples=num_samples, local_dir=local_dir, upload_dir=sync_config.upload_dir, sync_to_driver=sync_config.sync_to_driver, trial_name_creator=trial_name_creator, trial_dirname_creator=trial_dirname_creator, log_to_file=log_to_file, checkpoint_freq=checkpoint_freq, checkpoint_at_end=checkpoint_at_end, sync_on_checkpoint=sync_config.sync_on_checkpoint, keep_checkpoints_num=keep_checkpoints_num, checkpoint_score_attr=checkpoint_score_attr, export_formats=export_formats, max_failures=max_failures, restore=restore) else: logger.debug("Ignoring some parameters passed into tune.run.") if sync_config.sync_to_cloud: for exp in experiments: assert exp.remote_checkpoint_dir, ( "Need `upload_dir` if `sync_to_cloud` given.") if fail_fast and max_failures != 0: raise ValueError("max_failures must be 0 if fail_fast=True.") if issubclass(type(search_alg), Searcher): search_alg = SearchGenerator(search_alg) if not search_alg: search_alg = BasicVariantGenerator() if config and not search_alg.set_search_properties(metric, mode, config): if has_unresolved_values(config): raise ValueError( "You passed a `config` parameter to `tune.run()` with " "unresolved parameters, but the search algorithm was already " "instantiated with a search space. Make sure that `config` " "does not contain any more parameter definitions - include " "them in the search algorithm's search space if necessary.") scheduler = scheduler or FIFOScheduler() if not scheduler.set_search_properties(metric, mode): raise ValueError( "You passed a `metric` or `mode` argument to `tune.run()`, but " "the scheduler you are using was already instantiated with their " "own `metric` and `mode` parameters. Either remove the arguments " "from your scheduler or from your call to `tune.run()`") # Create syncer callbacks callbacks = create_default_callbacks(callbacks, sync_config, metric=metric, loggers=loggers) runner = TrialRunner( search_alg=search_alg, scheduler=scheduler, local_checkpoint_dir=experiments[0].checkpoint_dir, remote_checkpoint_dir=experiments[0].remote_checkpoint_dir, sync_to_cloud=sync_config.sync_to_cloud, stopper=experiments[0].stopper, resume=resume, server_port=server_port, fail_fast=fail_fast, trial_executor=trial_executor, callbacks=callbacks, metric=metric) if not runner.resumed: for exp in experiments: search_alg.add_configurations([exp]) else: logger.info("TrialRunner resumed, ignoring new add_experiment.") if progress_reporter is None: if IS_NOTEBOOK: progress_reporter = JupyterNotebookReporter( overwrite=not has_verbosity(Verbosity.V2_TRIAL_NORM)) else: progress_reporter = CLIReporter() if not progress_reporter.set_search_properties(metric, mode): raise ValueError( "You passed a `metric` or `mode` argument to `tune.run()`, but " "the reporter you are using was already instantiated with their " "own `metric` and `mode` parameters. Either remove the arguments " "from your reporter or from your call to `tune.run()`") progress_reporter.set_total_samples(search_alg.total_samples) # User Warning for GPUs if trial_executor.has_gpus(): if isinstance(resources_per_trial, dict) and "gpu" in resources_per_trial: # "gpu" is manually set. pass elif _check_default_resources_override(experiments[0].run_identifier): # "default_resources" is manually overridden. pass else: logger.warning("Tune detects GPUs, but no trials are using GPUs. " "To enable trials to use GPUs, set " "tune.run(resources_per_trial={'gpu': 1}...) " "which allows Tune to expose 1 GPU to each trial. " "You can also override " "`Trainable.default_resource_request` if using the " "Trainable API.") tune_start = time.time() while not runner.is_finished(): runner.step() if has_verbosity(Verbosity.V1_EXPERIMENT): _report_progress(runner, progress_reporter) tune_taken = time.time() - tune_start try: runner.checkpoint(force=True) except Exception as e: logger.warning(f"Trial Runner checkpointing failed: {str(e)}") if has_verbosity(Verbosity.V1_EXPERIMENT): _report_progress(runner, progress_reporter, done=True) wait_for_sync() runner.cleanup_trials() incomplete_trials = [] for trial in runner.get_trials(): if trial.status != Trial.TERMINATED: incomplete_trials += [trial] if incomplete_trials: if raise_on_failed_trial: raise TuneError("Trials did not complete", incomplete_trials) else: logger.error("Trials did not complete: %s", incomplete_trials) all_taken = time.time() - all_start if has_verbosity(Verbosity.V1_EXPERIMENT): logger.info(f"Total run time: {all_taken:.2f} seconds " f"({tune_taken:.2f} seconds for the tuning loop).") trials = runner.get_trials() return ExperimentAnalysis(runner.checkpoint_file, trials=trials, default_metric=metric, default_mode=mode)