Python RayTrialExecutor示例

def test_queue_trials(start_connected_emptyhead_cluster):
    """Tests explicit oversubscription for autoscaling.

    Tune oversubscribes a trial when `queue_trials=True`, but
    does not block other trials from running.
    """
    cluster = start_connected_emptyhead_cluster
    runner = TrialRunner()

    def create_trial(cpu, gpu=0):
        kwargs = {
            "resources": Resources(cpu=cpu, gpu=gpu),
            "stopping_criterion": {
                "training_iteration": 3
            }
        }
        return Trial("__fake", **kwargs)

    runner.add_trial(create_trial(cpu=1))
    with pytest.raises(TuneError):
        runner.step()  # run 1

    del runner

    executor = RayTrialExecutor(queue_trials=True)
    runner = TrialRunner(trial_executor=executor)
    cluster.add_node(num_cpus=2)
    cluster.wait_for_nodes()

    cpu_only = create_trial(cpu=1)
    runner.add_trial(cpu_only)
    runner.step()  # add cpu_only trial

    gpu_trial = create_trial(cpu=1, gpu=1)
    runner.add_trial(gpu_trial)
    runner.step()  # queue gpu_trial

    # This tests that the cpu_only trial should bypass the queued trial.
    for i in range(3):
        runner.step()
    assert cpu_only.status == Trial.TERMINATED
    assert gpu_trial.status == Trial.RUNNING

    # Scale up
    cluster.add_node(num_cpus=1, num_gpus=1)
    cluster.wait_for_nodes()

    for i in range(3):
        runner.step()
    assert gpu_trial.status == Trial.TERMINATED

示例#2

显示文件

文件： test_ray_trial_executor.py 项目： stjordanis/ray

    def _testPauseAndStart(self, result_buffer_length):
        """Tests that unpausing works for trials being processed."""
        os.environ["TUNE_RESULT_BUFFER_LENGTH"] = f"{result_buffer_length}"
        os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "1"

        # Need a new trial executor so the ENV vars are parsed again
        self.trial_executor = RayTrialExecutor()

        base = max(result_buffer_length, 1)

        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        trial.last_result = self.trial_executor.fetch_result(trial)[-1]
        self.assertEqual(trial.last_result.get(TRAINING_ITERATION), base)
        self.trial_executor.pause_trial(trial)
        self.assertEqual(Trial.PAUSED, trial.status)
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        trial.last_result = self.trial_executor.fetch_result(trial)[-1]
        self.assertEqual(trial.last_result.get(TRAINING_ITERATION), base * 2)
        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

示例#3

显示文件

    def __init__(self,
                 search_alg,
                 scheduler=None,
                 launch_web_server=False,
                 metadata_checkpoint_dir=None,
                 server_port=TuneServer.DEFAULT_PORT,
                 verbose=True,
                 queue_trials=False,
                 trial_executor=None):
        """Initializes a new TrialRunner.

        Args:
            search_alg (SearchAlgorithm): SearchAlgorithm for generating
                Trial objects.
            scheduler (TrialScheduler): Defaults to FIFOScheduler.
            launch_web_server (bool): Flag for starting TuneServer
            metadata_checkpoint_dir (str): Path where
                global checkpoints are stored and restored from.
            server_port (int): Port number for launching TuneServer
            verbose (bool): Flag for verbosity. If False, trial results
                will not be output.
            queue_trials (bool): Whether to queue trials when the cluster does
                not currently have enough resources to launch one. This should
                be set to True when running on an autoscaling cluster to enable
                automatic scale-up.
            trial_executor (TrialExecutor): Defaults to RayTrialExecutor.
        """
        self._search_alg = search_alg
        self._scheduler_alg = scheduler or FIFOScheduler()
        self.trial_executor = trial_executor or \
            RayTrialExecutor(queue_trials=queue_trials)

        # For debugging, it may be useful to halt trials after some time has
        # elapsed. TODO(ekl) consider exposing this in the API.
        self._global_time_limit = float(
            os.environ.get("TRIALRUNNER_WALLTIME_LIMIT", float('inf')))
        self._total_time = 0
        self._iteration = 0
        self._verbose = verbose
        self._queue_trials = queue_trials

        self._server = None
        self._server_port = server_port
        if launch_web_server:
            self._server = TuneServer(self, self._server_port)

        self._trials = []
        self._stop_queue = []
        self._metadata_checkpoint_dir = metadata_checkpoint_dir

示例#4

显示文件

    def _exploit_trial(self, trial_executor: RayTrialExecutor, trial: Trial,
                       trial_to_clone: Trial):
        """
        Transfers perturbed state from trial_to_clone -> trial.
        If specified, also logs the updated hyperparam state.
        """

        trial_state = self._trials_states_dict[trial]
        new_state = self._trials_states_dict[trial_to_clone]
        if not new_state.last_checkpoint:
            logger.info(
                "[pbt]: no checkpoint for trial. Skip exploit for Trial {}".
                format(trial))
            return
        new_config = explore(trial_to_clone.config, self._hyperparam_mutations,
                             self._hyperparam_mutate_probability,
                             self._explore_func)
        logger.info(
            "[exploit] transferring weights from trial {} (score {}) -> {} (score {})"
            .format(trial_to_clone, new_state.last_score, trial,
                    trial_state.last_score))

        if self._log_config:
            self._log_config_on_step(trial_state, new_state, trial,
                                     trial_to_clone, new_config)

        new_tag = make_experiment_tag(trial_state.orig_tag, new_config,
                                      self._hyperparam_mutations)
        reset_successful = trial_executor.reset_trial(trial, new_config,
                                                      new_tag)

        if reset_successful:
            trial_executor.restore(
                trial, Checkpoint.from_object(new_state.last_checkpoint))
        else:
            trial_executor.stop_trial(trial, stop_logger=False)
            trial.config = new_config
            trial.experiment_tag = new_tag
            trial_executor.start_trial(
                trial, Checkpoint.from_object(new_state.last_checkpoint))

        # TODO: move to Exploiter
        new_state.num_steps = 0
        trial_state.num_steps = 0
        new_state.num_explorations = 0
        trial_state.num_explorations += 1
        self._num_explorations += 1
        # Transfer over the last perturbation time as well
        trial_state.last_perturbation_time = new_state.last_perturbation_time

示例#5

显示文件

文件： test_trial_runner_3.py 项目： ijrsvt/ray

    def testCheckpointAtEndNotBuffered(self):
        os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "7"
        os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "0.5"

        def num_checkpoints(trial):
            return sum(
                item.startswith("checkpoint_") for item in os.listdir(trial.logdir)
            )

        ray.init(num_cpus=2)

        trial = Trial(
            "__fake",
            checkpoint_at_end=True,
            stopping_criterion={"training_iteration": 4},
        )
        runner = TrialRunner(
            local_checkpoint_dir=self.tmpdir,
            checkpoint_period=0,
            trial_executor=RayTrialExecutor(result_buffer_length=7),
        )
        runner.add_trial(trial)

        runner.step()  # start trial

        runner.step()  # run iteration 1
        self.assertEqual(trial.last_result[TRAINING_ITERATION], 1)
        self.assertEqual(num_checkpoints(trial), 0)

        runner.step()  # run iteration 2
        self.assertEqual(trial.last_result[TRAINING_ITERATION], 2)
        self.assertEqual(num_checkpoints(trial), 0)

        runner.step()  # run iteration 3
        self.assertEqual(trial.last_result[TRAINING_ITERATION], 3)
        self.assertEqual(num_checkpoints(trial), 0)

        runner.step()  # run iteration 4
        self.assertEqual(trial.last_result[TRAINING_ITERATION], 4)
        self.assertEqual(num_checkpoints(trial), 1)

示例#6

显示文件

    def test_default_resource_request(self):
        config = DEFAULT_CONFIG.copy()
        config["model"]["fcnet_hiddens"] = [10]
        config["num_workers"] = 2
        config["num_cpus_per_worker"] = 2
        # 3 Trials: Can only run 1 at a time (num_cpus=6; needed: 5).
        config["lr"] = tune.grid_search([0.1, 0.01, 0.001])
        config["env"] = "CartPole-v0"
        config["framework"] = "torch"
        config["placement_strategy"] = "SPREAD"

        global trial_executor
        trial_executor = RayTrialExecutor(reuse_actors=False)

        tune.run(
            "PG",
            config=config,
            stop={"training_iteration": 2},
            trial_executor=trial_executor,
            callbacks=[_TestCallback()],
            verbose=2,
        )

示例#7

显示文件

文件： test_ray_trial_executor.py 项目： miqdigital/ray

class RayTrialExecutorTest(unittest.TestCase):
    def setUp(self):
        self.trial_executor = RayTrialExecutor()
        ray.init(num_cpus=2, ignore_reinit_error=True)
        _register_all()  # Needed for flaky tests

    def tearDown(self):
        ray.shutdown()
        _register_all()  # re-register the evicted objects

    def _simulate_starting_trial(self, trial):
        future_result = self.trial_executor.get_next_executor_event(
            live_trials={trial}, next_trial_exists=True)
        assert future_result.type == ExecutorEventType.PG_READY
        self.assertTrue(self.trial_executor.start_trial(trial))
        self.assertEqual(Trial.RUNNING, trial.status)

    def _simulate_getting_result(self, trial):
        while True:
            future_result = self.trial_executor.get_next_executor_event(
                live_trials={trial}, next_trial_exists=False)
            if future_result.type == ExecutorEventType.TRAINING_RESULT:
                break
        if isinstance(future_result.result, list):
            for r in future_result.result:
                trial.update_last_result(r)
        else:
            trial.update_last_result(future_result.result)

    def _simulate_saving(self, trial):
        checkpoint = self.trial_executor.save(trial, Checkpoint.PERSISTENT)
        self.assertEqual(checkpoint, trial.saving_to)
        self.assertEqual(trial.checkpoint.value, None)
        future_result = self.trial_executor.get_next_executor_event(
            live_trials={trial}, next_trial_exists=False)
        assert future_result.type == ExecutorEventType.SAVING_RESULT
        self.process_trial_save(trial, future_result.result)
        self.assertEqual(checkpoint, trial.checkpoint)

    def testStartStop(self):
        trial = Trial("__fake")
        self._simulate_starting_trial(trial)
        self.trial_executor.stop_trial(trial)

    def testAsyncSave(self):
        """Tests that saved checkpoint value not immediately set."""
        trial = Trial("__fake")
        self._simulate_starting_trial(trial)

        self._simulate_getting_result(trial)

        self._simulate_saving(trial)

        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def testSaveRestore(self):
        trial = Trial("__fake")
        self._simulate_starting_trial(trial)

        self._simulate_getting_result(trial)

        self._simulate_saving(trial)

        self.trial_executor.restore(trial)
        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def testPauseResume(self):
        """Tests that pausing works for trials in flight."""
        trial = Trial("__fake")
        self._simulate_starting_trial(trial)

        self.trial_executor.pause_trial(trial)
        self.assertEqual(Trial.PAUSED, trial.status)

        self._simulate_starting_trial(trial)

        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def testSavePauseResumeErrorRestore(self):
        """Tests that pause checkpoint does not replace restore checkpoint."""
        trial = Trial("__fake")
        self._simulate_starting_trial(trial)

        self._simulate_getting_result(trial)

        # Save
        self._simulate_saving(trial)

        # Train
        self.trial_executor.continue_training(trial)
        self._simulate_getting_result(trial)

        # Pause
        self.trial_executor.pause_trial(trial)
        self.assertEqual(Trial.PAUSED, trial.status)
        self.assertEqual(trial.checkpoint.storage, Checkpoint.MEMORY)

        # Resume
        self._simulate_starting_trial(trial)

        # Error
        trial.set_status(Trial.ERROR)

        # Restore
        self.trial_executor.restore(trial)

        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def testStartFailure(self):
        _global_registry.register(TRAINABLE_CLASS, "asdf", None)
        trial = Trial("asdf", resources=Resources(1, 0))
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.ERROR, trial.status)

    def testPauseResume2(self):
        """Tests that pausing works for trials being processed."""
        trial = Trial("__fake")
        self._simulate_starting_trial(trial)

        self._simulate_getting_result(trial)

        self.trial_executor.pause_trial(trial)
        self.assertEqual(Trial.PAUSED, trial.status)

        self._simulate_starting_trial(trial)
        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def _testPauseAndStart(self, result_buffer_length):
        """Tests that unpausing works for trials being processed."""
        os.environ["TUNE_RESULT_BUFFER_LENGTH"] = f"{result_buffer_length}"
        os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "1"

        # Need a new trial executor so the ENV vars are parsed again
        self.trial_executor = RayTrialExecutor()

        base = max(result_buffer_length, 1)

        trial = Trial("__fake")
        self._simulate_starting_trial(trial)

        self._simulate_getting_result(trial)
        self.assertEqual(trial.last_result.get(TRAINING_ITERATION), base)

        self.trial_executor.pause_trial(trial)
        self.assertEqual(Trial.PAUSED, trial.status)

        self._simulate_starting_trial(trial)

        self._simulate_getting_result(trial)
        self.assertEqual(trial.last_result.get(TRAINING_ITERATION), base * 2)
        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def testPauseAndStartNoBuffer(self):
        self._testPauseAndStart(0)

    def testPauseAndStartTrivialBuffer(self):
        self._testPauseAndStart(1)

    def testPauseAndStartActualBuffer(self):
        self._testPauseAndStart(8)

    def testNoResetTrial(self):
        """Tests that reset handles NotImplemented properly."""
        trial = Trial("__fake")
        self._simulate_starting_trial(trial)
        exists = self.trial_executor.reset_trial(trial, {}, "modified_mock")
        self.assertEqual(exists, False)
        self.assertEqual(Trial.RUNNING, trial.status)

    def testResetTrial(self):
        """Tests that reset works as expected."""
        class B(Trainable):
            def step(self):
                return dict(timesteps_this_iter=1, done=True)

            def reset_config(self, config):
                self.config = config
                return True

        trials = self.generate_trials(
            {
                "run": B,
                "config": {
                    "foo": 0
                },
            },
            "grid_search",
        )
        trial = trials[0]
        self._simulate_starting_trial(trial)
        exists = self.trial_executor.reset_trial(trial, {"hi": 1},
                                                 "modified_mock")
        self.assertEqual(exists, True)
        self.assertEqual(trial.config.get("hi"), 1)
        self.assertEqual(trial.experiment_tag, "modified_mock")
        self.assertEqual(Trial.RUNNING, trial.status)

    def testTrialCleanup(self):
        class B(Trainable):
            def step(self):
                print("Step start")
                time.sleep(4)
                print("Step done")
                return dict(my_metric=1, timesteps_this_iter=1, done=True)

            def reset_config(self, config):
                self.config = config
                return True

            def cleanup(self):
                print("Cleanup start")
                time.sleep(4)
                print("Cleanup done")

        # First check if the trials terminate gracefully by default
        trials = self.generate_trials(
            {
                "run": B,
                "config": {
                    "foo": 0
                },
            },
            "grid_search",
        )
        trial = trials[0]
        self._simulate_starting_trial(trial)
        time.sleep(1)
        print("Stop trial")
        self.trial_executor.stop_trial(trial)
        print("Start trial cleanup")
        start = time.time()
        self.trial_executor.cleanup([trial])
        # 4 - 1 + 4.
        self.assertGreaterEqual(time.time() - start, 6)

        # Check forceful termination. It should run for much less than the
        # sleep periods in the Trainable
        trials = self.generate_trials(
            {
                "run": B,
                "config": {
                    "foo": 0
                },
            },
            "grid_search",
        )
        trial = trials[0]
        os.environ["TUNE_FORCE_TRIAL_CLEANUP_S"] = "1"
        self.trial_executor = RayTrialExecutor()
        os.environ["TUNE_FORCE_TRIAL_CLEANUP_S"] = "0"
        self._simulate_starting_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        time.sleep(1)
        print("Stop trial")
        self.trial_executor.stop_trial(trial)
        print("Start trial cleanup")
        start = time.time()
        self.trial_executor.cleanup([trial])
        # less than 1 with some margin.
        self.assertLess(time.time() - start, 2.0)

        # also check if auto-filled metrics were returned
        self.assertIn(PID, trial.last_result)
        self.assertIn(TRIAL_ID, trial.last_result)
        self.assertNotIn("my_metric", trial.last_result)

    @staticmethod
    def generate_trials(spec, name):
        suggester = BasicVariantGenerator()
        suggester.add_configurations({name: spec})
        trials = []
        while not suggester.is_finished():
            trial = suggester.next_trial()
            if trial:
                trials.append(trial)
            else:
                break
        return trials

    def process_trial_save(self, trial, checkpoint_value):
        """Simulates trial runner save."""
        checkpoint = trial.saving_to
        checkpoint.value = checkpoint_value
        trial.on_checkpoint(checkpoint)

示例#8

显示文件

文件： test_ray_trial_executor.py 项目： miqdigital/ray

    def testTrialCleanup(self):
        class B(Trainable):
            def step(self):
                print("Step start")
                time.sleep(4)
                print("Step done")
                return dict(my_metric=1, timesteps_this_iter=1, done=True)

            def reset_config(self, config):
                self.config = config
                return True

            def cleanup(self):
                print("Cleanup start")
                time.sleep(4)
                print("Cleanup done")

        # First check if the trials terminate gracefully by default
        trials = self.generate_trials(
            {
                "run": B,
                "config": {
                    "foo": 0
                },
            },
            "grid_search",
        )
        trial = trials[0]
        self._simulate_starting_trial(trial)
        time.sleep(1)
        print("Stop trial")
        self.trial_executor.stop_trial(trial)
        print("Start trial cleanup")
        start = time.time()
        self.trial_executor.cleanup([trial])
        # 4 - 1 + 4.
        self.assertGreaterEqual(time.time() - start, 6)

        # Check forceful termination. It should run for much less than the
        # sleep periods in the Trainable
        trials = self.generate_trials(
            {
                "run": B,
                "config": {
                    "foo": 0
                },
            },
            "grid_search",
        )
        trial = trials[0]
        os.environ["TUNE_FORCE_TRIAL_CLEANUP_S"] = "1"
        self.trial_executor = RayTrialExecutor()
        os.environ["TUNE_FORCE_TRIAL_CLEANUP_S"] = "0"
        self._simulate_starting_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        time.sleep(1)
        print("Stop trial")
        self.trial_executor.stop_trial(trial)
        print("Start trial cleanup")
        start = time.time()
        self.trial_executor.cleanup([trial])
        # less than 1 with some margin.
        self.assertLess(time.time() - start, 2.0)

        # also check if auto-filled metrics were returned
        self.assertIn(PID, trial.last_result)
        self.assertIn(TRIAL_ID, trial.last_result)
        self.assertNotIn("my_metric", trial.last_result)

示例#9

显示文件

    def testPlacementGroupRequests(self, reuse_actors=False, scheduled=10):
        """In this test we try to start 10 trials but only have resources
        for 2. Placement groups should still be created and PENDING.

        Eventually they should be scheduled sequentially (i.e. in pairs
        of two)."""
        def train(config):
            time.sleep(1)
            now = time.time()
            tune.report(end=now - config["start_time"])

        head_bundle = {"CPU": 4, "GPU": 0, "custom": 0}
        child_bundle = {"custom": 1}

        placement_group_factory = PlacementGroupFactory(
            [head_bundle, child_bundle, child_bundle])

        trial_executor = RayTrialExecutor(reuse_actors=reuse_actors)

        this = self

        class _TestCallback(Callback):
            def on_step_end(self, iteration, trials, **info):
                num_finished = len([
                    t for t in trials
                    if t.status == Trial.TERMINATED or t.status == Trial.ERROR
                ])

                num_staging = sum(
                    len(s)
                    for s in trial_executor._pg_manager._staging.values())
                num_ready = sum(
                    len(s) for s in trial_executor._pg_manager._ready.values())
                num_in_use = len(trial_executor._pg_manager._in_use_pgs)
                num_cached = len(trial_executor._pg_manager._cached_pgs)

                total_num_tracked = num_staging + num_ready + \
                    num_in_use + num_cached

                num_non_removed_pgs = len([
                    p for pid, p in placement_group_table().items()
                    if p["state"] != "REMOVED"
                ])
                num_removal_scheduled_pgs = len(
                    trial_executor._pg_manager._pgs_for_removal)

                # All trials should be scheduled
                this.assertEqual(scheduled,
                                 min(scheduled, len(trials)),
                                 msg=f"Num trials iter {iteration}")
                # The number of PGs should decrease when trials finish
                this.assertEqual(max(scheduled, len(trials)) - num_finished,
                                 total_num_tracked,
                                 msg=f"Num tracked iter {iteration}")
                # The number of actual placement groups should match this
                this.assertEqual(max(scheduled, len(trials)) - num_finished,
                                 num_non_removed_pgs -
                                 num_removal_scheduled_pgs,
                                 msg=f"Num actual iter {iteration}")

        start = time.time()
        out = tune.run(train,
                       config={"start_time": start},
                       resources_per_trial=placement_group_factory,
                       num_samples=10,
                       trial_executor=trial_executor,
                       callbacks=[_TestCallback()],
                       reuse_actors=reuse_actors,
                       verbose=2)

        trial_end_times = sorted(t.last_result["end"] for t in out.trials)
        print("Trial end times:", trial_end_times)
        max_diff = trial_end_times[-1] - trial_end_times[0]

        # Not all trials have been run in parallel
        self.assertGreater(max_diff, 3)

        # Some trials should have run in parallel
        # Todo: Re-enable when using buildkite
        # self.assertLess(max_diff, 10)

        self._assertCleanup(trial_executor)

示例#10

显示文件

文件： tune.py 项目： stefanbschneider/ray

def run(
        run_or_experiment: Union[str, Callable, Type],
        name: Optional[str] = None,
        metric: Optional[str] = None,
        mode: Optional[str] = None,
        stop: Union[None, Mapping, Stopper, Callable[[str, Mapping],
                                                     bool]] = None,
        time_budget_s: Union[None, int, float, datetime.timedelta] = None,
        config: Optional[Dict[str, Any]] = None,
        resources_per_trial: Union[None, Mapping[str, Union[
            float, int, Mapping]], PlacementGroupFactory] = None,
        num_samples: int = 1,
        local_dir: Optional[str] = None,
        search_alg: Optional[Union[Searcher, SearchAlgorithm, str]] = None,
        scheduler: Optional[Union[TrialScheduler, str]] = None,
        keep_checkpoints_num: Optional[int] = None,
        checkpoint_score_attr: Optional[str] = None,
        checkpoint_freq: int = 0,
        checkpoint_at_end: bool = False,
        verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS,
        progress_reporter: Optional[ProgressReporter] = None,
        log_to_file: bool = False,
        trial_name_creator: Optional[Callable[[Trial], str]] = None,
        trial_dirname_creator: Optional[Callable[[Trial], str]] = None,
        sync_config: Optional[SyncConfig] = None,
        export_formats: Optional[Sequence] = None,
        max_failures: int = 0,
        fail_fast: bool = False,
        restore: Optional[str] = None,
        server_port: Optional[int] = None,
        resume: bool = False,
        reuse_actors: bool = False,
        trial_executor: Optional[RayTrialExecutor] = None,
        raise_on_failed_trial: bool = True,
        callbacks: Optional[Sequence[Callback]] = None,
        max_concurrent_trials: Optional[int] = None,
        # Deprecated args
        queue_trials: Optional[bool] = None,
        loggers: Optional[Sequence[Type[Logger]]] = None,
        _remote: Optional[bool] = None,
) -> ExperimentAnalysis:
    """Executes training.

    When a SIGINT signal is received (e.g. through Ctrl+C), the tuning run
    will gracefully shut down and checkpoint the latest experiment state.
    Sending SIGINT again (or SIGKILL/SIGTERM instead) will skip this step.

    Many aspects of Tune, such as the frequency of global checkpointing,
    maximum pending placement group trials and the path of the result
    directory be configured through environment variables. Refer to
    :ref:`tune-env-vars` for a list of environment variables available.

    Examples:

    .. code-block:: python

        # Run 10 trials (each trial is one instance of a Trainable). Tune runs
        # in parallel and automatically determines concurrency.
        tune.run(trainable, num_samples=10)

        # Run 1 trial, stop when trial has reached 10 iterations
        tune.run(my_trainable, stop={"training_iteration": 10})

        # automatically retry failed trials up to 3 times
        tune.run(my_trainable, stop={"training_iteration": 10}, max_failures=3)

        # Run 1 trial, search over hyperparameters, stop after 10 iterations.
        space = {"lr": tune.uniform(0, 1), "momentum": tune.uniform(0, 1)}
        tune.run(my_trainable, config=space, stop={"training_iteration": 10})

        # Resumes training if a previous machine crashed
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume=True)

        # Rerun ONLY failed trials after an experiment is finished.
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume="ERRORED_ONLY")

    Args:
        run_or_experiment (function | class | str | :class:`Experiment`): If
            function|class|str, this is the algorithm or model to train.
            This may refer to the name of a built-on algorithm
            (e.g. RLLib's DQN or PPO), a user-defined trainable
            function or class, or the string identifier of a
            trainable function or class registered in the tune registry.
            If Experiment, then Tune will execute training based on
            Experiment.spec. If you want to pass in a Python lambda, you
            will need to first register the function:
            ``tune.register_trainable("lambda_id", lambda x: ...)``. You can
            then use ``tune.run("lambda_id")``.
        metric (str): Metric to optimize. This metric should be reported
            with `tune.report()`. If set, will be passed to the search
            algorithm and scheduler.
        mode (str): Must be one of [min, max]. Determines whether objective is
            minimizing or maximizing the metric attribute. If set, will be
            passed to the search algorithm and scheduler.
        name (str): Name of experiment.
        stop (dict | callable | :class:`Stopper`): Stopping criteria. If dict,
            the keys may be any field in the return result of 'train()',
            whichever is reached first. If function, it must take (trial_id,
            result) as arguments and return a boolean (True if trial should be
            stopped, False otherwise). This can also be a subclass of
            ``ray.tune.Stopper``, which allows users to implement
            custom experiment-wide stopping (i.e., stopping an entire Tune
            run based on some time constraint).
        time_budget_s (int|float|datetime.timedelta): Global time budget in
            seconds after which all trials are stopped. Can also be a
            ``datetime.timedelta`` object.
        config (dict): Algorithm-specific configuration for Tune variant
            generation (e.g. env, hyperparams). Defaults to empty dict.
            Custom search algorithms may ignore this.
        resources_per_trial (dict|PlacementGroupFactory): Machine resources
            to allocate per trial, e.g. ``{"cpu": 64, "gpu": 8}``.
            Note that GPUs will not be assigned unless you specify them here.
            Defaults to 1 CPU and 0 GPUs in
            ``Trainable.default_resource_request()``. This can also
            be a PlacementGroupFactory object wrapping arguments to create a
            per-trial placement group.
        num_samples (int): Number of times to sample from the
            hyperparameter space. Defaults to 1. If `grid_search` is
            provided as an argument, the grid will be repeated
            `num_samples` of times. If this is -1, (virtually) infinite
            samples are generated until a stopping condition is met.
        local_dir (str): Local dir to save training results to.
            Defaults to ``~/ray_results``.
        search_alg (Searcher|SearchAlgorithm|str): Search algorithm for
            optimization. You can also use the name of the algorithm.
        scheduler (TrialScheduler|str): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, HyperBand and PopulationBasedTraining. Refer to
            ray.tune.schedulers for more options. You can also use the
            name of the scheduler.
        keep_checkpoints_num (int): Number of checkpoints to keep. A value of
            `None` keeps all checkpoints. Defaults to `None`. If set, need
            to provide `checkpoint_score_attr`.
        checkpoint_score_attr (str): Specifies by which attribute to rank the
            best checkpoint. Default is increasing order. If attribute starts
            with `min-` it will rank attribute in decreasing order, i.e.
            `min-validation_loss`.
        checkpoint_freq (int): How many training iterations between
            checkpoints. A value of 0 (default) disables checkpointing.
            This has no effect when using the Functional Training API.
        checkpoint_at_end (bool): Whether to checkpoint at the end of the
            experiment regardless of the checkpoint_freq. Default is False.
            This has no effect when using the Functional Training API.
        verbose (Union[int, Verbosity]): 0, 1, 2, or 3. Verbosity mode.
            0 = silent, 1 = only status updates, 2 = status and brief trial
            results, 3 = status and detailed trial results. Defaults to 3.
        progress_reporter (ProgressReporter): Progress reporter for reporting
            intermediate experiment progress. Defaults to CLIReporter if
            running in command-line, or JupyterNotebookReporter if running in
            a Jupyter notebook.
        log_to_file (bool|str|Sequence): Log stdout and stderr to files in
            Tune's trial directories. If this is `False` (default), no files
            are written. If `true`, outputs are written to `trialdir/stdout`
            and `trialdir/stderr`, respectively. If this is a single string,
            this is interpreted as a file relative to the trialdir, to which
            both streams are written. If this is a Sequence (e.g. a Tuple),
            it has to have length 2 and the elements indicate the files to
            which stdout and stderr are written, respectively.
        trial_name_creator (Callable[[Trial], str]): Optional function
            for generating the trial string representation.
        trial_dirname_creator (Callable[[Trial], str]): Function
            for generating the trial dirname. This function should take
            in a Trial object and return a string representing the
            name of the directory. The return value cannot be a path.
        sync_config (SyncConfig): Configuration object for syncing. See
            tune.SyncConfig.
        export_formats (list): List of formats that exported at the end of
            the experiment. Default is None.
        max_failures (int): Try to recover a trial at least this many times.
            Ray will recover from the latest checkpoint if present.
            Setting to -1 will lead to infinite recovery retries.
            Setting to 0 will disable retries. Defaults to 0.
        fail_fast (bool | str): Whether to fail upon the first error.
            If fail_fast='raise' provided, Tune will automatically
            raise the exception received by the Trainable. fail_fast='raise'
            can easily leak resources and should be used with caution (it
            is best used with `ray.init(local_mode=True)`).
        restore (str): Path to checkpoint. Only makes sense to set if
            running 1 trial. Defaults to None.
        server_port (int): Port number for launching TuneServer.
        resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", "ERRORED_ONLY",
            or bool. LOCAL/True restores the checkpoint from the
            local experiment directory, determined
            by ``name`` and ``local_dir``. REMOTE restores the checkpoint
            from ``upload_dir`` (as passed to ``sync_config``).
            PROMPT provides CLI feedback.
            False forces a new experiment. ERRORED_ONLY resets and reruns
            ERRORED trials upon resume - previous trial artifacts will
            be left untouched.  If resume is set but checkpoint does not exist,
            ValueError will be thrown.
        reuse_actors (bool): Whether to reuse actors between different trials
            when possible. This can drastically speed up experiments that start
            and stop actors often (e.g., PBT in time-multiplexing mode). This
            requires trials to have the same resource requirements.
        trial_executor (TrialExecutor): Manage the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if there exists failed
            trial (of ERROR state) when the experiments complete.
        callbacks (list): List of callbacks that will be called at different
            times in the training loop. Must be instances of the
            ``ray.tune.callback.Callback`` class. If not passed,
            `LoggerCallback` and `SyncerCallback` callbacks are automatically
            added.
        max_concurrent_trials (int): Maximum number of trials to run
            concurrently. Must be non-negative. If None or 0, no limit will
            be applied. This is achieved by wrapping the ``search_alg`` in
            a :class:`ConcurrencyLimiter`, and thus setting this argument
            will raise an exception if the ``search_alg`` is already a
            :class:`ConcurrencyLimiter`. Defaults to None.
        _remote (bool): Whether to run the Tune driver in a remote function.
            This is disabled automatically if a custom trial executor is
            passed in. This is enabled by default in Ray client mode.

    Returns:
        ExperimentAnalysis: Object for experiment analysis.

    Raises:
        TuneError: Any trials failed and `raise_on_failed_trial` is True.
    """

    # To be removed in 1.9.
    if queue_trials is not None:
        raise DeprecationWarning(
            "`queue_trials` has been deprecated and is replaced by "
            "the `TUNE_MAX_PENDING_TRIALS_PG` environment variable. "
            "Per default at least one Trial is queued at all times, "
            "so you likely don't need to change anything other than "
            "removing this argument from your call to `tune.run()`")

    # NO CODE IS TO BE ADDED ABOVE THIS COMMENT
    # remote_run_kwargs must be defined before any other
    # code is ran to ensure that at this point,
    # `locals()` is equal to args and kwargs
    remote_run_kwargs = locals().copy()
    remote_run_kwargs.pop("_remote")

    if _remote is None:
        _remote = ray.util.client.ray.is_connected()

    if _remote is True and trial_executor:
        raise ValueError("cannot use custom trial executor")

    if not trial_executor or isinstance(trial_executor, RayTrialExecutor):
        _ray_auto_init()

    if _remote:
        remote_run = ray.remote(num_cpus=0)(run)

        # Make sure tune.run is called on the sever node.
        remote_run = force_on_current_node(remote_run)

        # JupyterNotebooks don't work with remote tune runs out of the box
        # (e.g. via Ray client) as they don't have access to the main
        # process stdout. So we introduce a queue here that accepts
        # callables, which will then be executed on the driver side.
        if isinstance(progress_reporter, JupyterNotebookReporter):
            execute_queue = Queue(actor_options={
                "num_cpus": 0,
                **force_on_current_node(None)
            })
            progress_reporter.set_output_queue(execute_queue)

            def get_next_queue_item():
                try:
                    return execute_queue.get(block=False)
                except Empty:
                    return None

        else:
            # If we don't need a queue, use this dummy get fn instead of
            # scheduling an unneeded actor
            def get_next_queue_item():
                return None

        def _handle_execute_queue():
            execute_item = get_next_queue_item()
            while execute_item:
                if isinstance(execute_item, Callable):
                    execute_item()

                execute_item = get_next_queue_item()

        remote_future = remote_run.remote(_remote=False, **remote_run_kwargs)

        # ray.wait(...)[1] returns futures that are not ready, yet
        while ray.wait([remote_future], timeout=0.2)[1]:
            # Check if we have items to execute
            _handle_execute_queue()

        # Handle queue one last time
        _handle_execute_queue()

        return ray.get(remote_future)

    del remote_run_kwargs

    all_start = time.time()

    if loggers:
        # Raise DeprecationWarning in 1.9, remove in 1.10/1.11
        warnings.warn(
            "The `loggers` argument is deprecated. Please pass the respective "
            "`LoggerCallback` classes to the `callbacks` argument instead. "
            "See https://docs.ray.io/en/latest/tune/api_docs/logging.html")

    if mode and mode not in ["min", "max"]:
        raise ValueError(
            "The `mode` parameter passed to `tune.run()` has to be one of "
            "['min', 'max']")

    set_verbosity(verbose)

    config = config or {}
    sync_config = sync_config or SyncConfig()
    set_sync_periods(sync_config)

    if num_samples == -1:
        num_samples = sys.maxsize

    result_buffer_length = None

    # Create scheduler here as we need access to some of its properties
    if isinstance(scheduler, str):
        # importing at top level causes a recursive dependency
        from ray.tune.schedulers import create_scheduler
        scheduler = create_scheduler(scheduler)
    scheduler = scheduler or FIFOScheduler()

    if not scheduler.supports_buffered_results:
        # Result buffering with e.g. a Hyperband scheduler is a bad idea, as
        # hyperband tries to stop trials when processing brackets. With result
        # buffering, we might trigger this multiple times when evaluating
        # a single trial, which leads to unexpected behavior.
        env_result_buffer_length = os.getenv("TUNE_RESULT_BUFFER_LENGTH", "")
        if env_result_buffer_length:
            warnings.warn(
                f"You are using a {type(scheduler)} scheduler, but "
                f"TUNE_RESULT_BUFFER_LENGTH is set "
                f"({env_result_buffer_length}). This can lead to undesired "
                f"and faulty behavior, so the buffer length was forcibly set "
                f"to 1 instead.")
        result_buffer_length = 1

    if isinstance(scheduler,
                  (PopulationBasedTraining,
                   PopulationBasedTrainingReplay)) and not reuse_actors:
        warnings.warn(
            "Consider boosting PBT performance by enabling `reuse_actors` as "
            "well as implementing `reset_config` for Trainable.")

    trial_executor = trial_executor or RayTrialExecutor(
        reuse_actors=reuse_actors, result_buffer_length=result_buffer_length)
    if isinstance(run_or_experiment, list):
        experiments = run_or_experiment
    else:
        experiments = [run_or_experiment]

    for i, exp in enumerate(experiments):
        if not isinstance(exp, Experiment):
            experiments[i] = Experiment(
                name=name,
                run=exp,
                stop=stop,
                time_budget_s=time_budget_s,
                config=config,
                resources_per_trial=resources_per_trial,
                num_samples=num_samples,
                local_dir=local_dir,
                sync_config=sync_config,
                trial_name_creator=trial_name_creator,
                trial_dirname_creator=trial_dirname_creator,
                log_to_file=log_to_file,
                checkpoint_freq=checkpoint_freq,
                checkpoint_at_end=checkpoint_at_end,
                keep_checkpoints_num=keep_checkpoints_num,
                checkpoint_score_attr=checkpoint_score_attr,
                export_formats=export_formats,
                max_failures=max_failures,
                restore=restore)
    else:
        logger.debug("Ignoring some parameters passed into tune.run.")

    if fail_fast and max_failures != 0:
        raise ValueError("max_failures must be 0 if fail_fast=True.")

    if isinstance(search_alg, str):
        # importing at top level causes a recursive dependency
        from ray.tune.suggest import create_searcher
        search_alg = create_searcher(search_alg)

    # if local_mode=True is set during ray.init().
    is_local_mode = ray.worker._mode() == ray.worker.LOCAL_MODE

    if is_local_mode:
        max_concurrent_trials = 1

    if not search_alg:
        search_alg = BasicVariantGenerator(
            max_concurrent=max_concurrent_trials or 0)
    elif max_concurrent_trials:
        if isinstance(search_alg, ConcurrencyLimiter):
            if search_alg.max_concurrent != max_concurrent_trials:
                raise ValueError(
                    "You have specified `max_concurrent_trials="
                    f"{max_concurrent_trials}`, but the `search_alg` is "
                    "already a `ConcurrencyLimiter` with `max_concurrent="
                    f"{search_alg.max_concurrent}. FIX THIS by setting "
                    "`max_concurrent_trials=None`.")
            else:
                logger.warning(
                    "You have specified `max_concurrent_trials="
                    f"{max_concurrent_trials}`, but the `search_alg` is "
                    "already a `ConcurrencyLimiter`. `max_concurrent_trials` "
                    "will be ignored.")
        else:
            if max_concurrent_trials < 1:
                raise ValueError(
                    "`max_concurrent_trials` must be greater or equal than 1, "
                    f"got {max_concurrent_trials}.")
            if isinstance(search_alg, Searcher):
                search_alg = ConcurrencyLimiter(
                    search_alg, max_concurrent=max_concurrent_trials)
            elif not is_local_mode:
                logger.warning(
                    "You have passed a `SearchGenerator` instance as the "
                    "`search_alg`, but `max_concurrent_trials` requires a "
                    "`Searcher` instance`. `max_concurrent_trials` "
                    "will be ignored.")

    if isinstance(search_alg, Searcher):
        search_alg = SearchGenerator(search_alg)

    if config and not set_search_properties_backwards_compatible(
            search_alg.set_search_properties, metric, mode, config, **
            experiments[0].public_spec):
        if has_unresolved_values(config):
            raise ValueError(
                "You passed a `config` parameter to `tune.run()` with "
                "unresolved parameters, but the search algorithm was already "
                "instantiated with a search space. Make sure that `config` "
                "does not contain any more parameter definitions - include "
                "them in the search algorithm's search space if necessary.")

    if not scheduler.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the scheduler you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your scheduler or from your call to `tune.run()`")

    # Create syncer callbacks
    callbacks = create_default_callbacks(
        callbacks, sync_config, metric=metric, loggers=loggers)

    runner = TrialRunner(
        search_alg=search_alg,
        scheduler=scheduler,
        local_checkpoint_dir=experiments[0].checkpoint_dir,
        remote_checkpoint_dir=experiments[0].remote_checkpoint_dir,
        sync_config=sync_config,
        stopper=experiments[0].stopper,
        resume=resume,
        server_port=server_port,
        fail_fast=fail_fast,
        trial_executor=trial_executor,
        callbacks=callbacks,
        metric=metric,
        # Driver should only sync trial checkpoints if
        # checkpoints are not synced to cloud
        driver_sync_trial_checkpoints=not bool(sync_config.upload_dir))

    if not runner.resumed:
        for exp in experiments:
            search_alg.add_configurations([exp])
    else:
        logger.info("TrialRunner resumed, ignoring new add_experiment but "
                    "updating trial resources.")
        if resources_per_trial:
            runner.update_pending_trial_resources(resources_per_trial)

    progress_reporter = progress_reporter or detect_reporter()

    if not progress_reporter.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the reporter you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your reporter or from your call to `tune.run()`")
    progress_reporter.set_total_samples(search_alg.total_samples)

    # Calls setup on callbacks
    runner.setup_experiments(
        experiments=experiments, total_num_samples=search_alg.total_samples)

    # User Warning for GPUs
    if trial_executor.has_gpus():
        if isinstance(resources_per_trial,
                      dict) and "gpu" in resources_per_trial:
            # "gpu" is manually set.
            pass
        elif _check_default_resources_override(experiments[0].run_identifier):
            # "default_resources" is manually overridden.
            pass
        else:
            logger.warning("Tune detects GPUs, but no trials are using GPUs. "
                           "To enable trials to use GPUs, set "
                           "tune.run(resources_per_trial={'gpu': 1}...) "
                           "which allows Tune to expose 1 GPU to each trial. "
                           "You can also override "
                           "`Trainable.default_resource_request` if using the "
                           "Trainable API.")

    original_handler = signal.getsignal(signal.SIGINT)
    state = {signal.SIGINT: False}

    def sigint_handler(sig, frame):
        logger.warning(
            "SIGINT received (e.g. via Ctrl+C), ending Ray Tune run. "
            "This will try to checkpoint the experiment state one last time. "
            "Press CTRL+C one more time (or send SIGINT/SIGKILL/SIGTERM) "
            "to skip. ")
        state[signal.SIGINT] = True
        # Restore original signal handler to react to future SIGINT signals
        signal.signal(signal.SIGINT, original_handler)

    if not int(os.getenv("TUNE_DISABLE_SIGINT_HANDLER", "0")):
        signal.signal(signal.SIGINT, sigint_handler)

    tune_start = time.time()
    progress_reporter.set_start_time(tune_start)
    while not runner.is_finished() and not state[signal.SIGINT]:
        runner.step()
        if has_verbosity(Verbosity.V1_EXPERIMENT):
            _report_progress(runner, progress_reporter)
    tune_taken = time.time() - tune_start

    try:
        runner.checkpoint(force=True)
    except Exception as e:
        logger.warning(f"Trial Runner checkpointing failed: {str(e)}")

    if has_verbosity(Verbosity.V1_EXPERIMENT):
        _report_progress(runner, progress_reporter, done=True)

    wait_for_sync()
    runner.cleanup()

    incomplete_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            incomplete_trials += [trial]

    if incomplete_trials:
        if raise_on_failed_trial and not state[signal.SIGINT]:
            raise TuneError("Trials did not complete", incomplete_trials)
        else:
            logger.error("Trials did not complete: %s", incomplete_trials)

    all_taken = time.time() - all_start
    if has_verbosity(Verbosity.V1_EXPERIMENT):
        logger.info(f"Total run time: {all_taken:.2f} seconds "
                    f"({tune_taken:.2f} seconds for the tuning loop).")

    if state[signal.SIGINT]:
        logger.warning(
            "Experiment has been interrupted, but the most recent state was "
            "saved. You can continue running this experiment by passing "
            "`resume=True` to `tune.run()`")

    trials = runner.get_trials()
    return ExperimentAnalysis(
        runner.checkpoint_file,
        trials=trials,
        default_metric=metric,
        default_mode=mode,
        sync_config=sync_config)

示例#11

显示文件

文件： test_ray_trial_executor.py 项目： marload/ray

class RayExecutorQueueTest(unittest.TestCase):
    def setUp(self):
        self.cluster = Cluster(initialize_head=True,
                               connect=True,
                               head_node_args={
                                   "num_cpus": 1,
                                   "_system_config": {
                                       "num_heartbeats_timeout": 10
                                   }
                               })
        self.trial_executor = RayTrialExecutor(queue_trials=True,
                                               refresh_period=0)
        # Pytest doesn't play nicely with imports
        _register_all()

    def tearDown(self):
        ray.shutdown()
        self.cluster.shutdown()
        _register_all()  # re-register the evicted objects

    def testQueueTrial(self):
        """Tests that reset handles NotImplemented properly."""
        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        cpu_only = create_trial(1, 0)
        self.assertTrue(self.trial_executor.has_resources_for_trial(cpu_only))
        self.trial_executor.start_trial(cpu_only)

        gpu_only = create_trial(0, 1)
        self.assertTrue(self.trial_executor.has_resources_for_trial(gpu_only))

    def testHeadBlocking(self):
        # Once resource requests are deprecated, remove this test
        os.environ["TUNE_PLACEMENT_GROUP_AUTO_DISABLED"] = "1"

        def create_trial(cpu, gpu=0):
            return Trial("__fake", resources=Resources(cpu=cpu, gpu=gpu))

        gpu_trial = create_trial(1, 1)
        self.assertTrue(self.trial_executor.has_resources_for_trial(gpu_trial))
        self.trial_executor.start_trial(gpu_trial)

        # TODO(rliaw): This behavior is probably undesirable, but right now
        #  trials with different resource requirements is not often used.
        cpu_only_trial = create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources_for_trial(cpu_only_trial))

        self.cluster.add_node(num_cpus=1, num_gpus=1)
        self.cluster.wait_for_nodes()

        self.assertTrue(
            self.trial_executor.has_resources_for_trial(cpu_only_trial))
        self.trial_executor.start_trial(cpu_only_trial)

        cpu_only_trial2 = create_trial(1, 0)
        self.assertTrue(
            self.trial_executor.has_resources_for_trial(cpu_only_trial2))
        self.trial_executor.start_trial(cpu_only_trial2)

        cpu_only_trial3 = create_trial(1, 0)
        self.assertFalse(
            self.trial_executor.has_resources_for_trial(cpu_only_trial3))

示例#12

显示文件

文件： ray_trial_executor_test.py 项目： yuanfeng0905/ray

 def setUp(self):
     self.trial_executor = RayTrialExecutor(queue_trials=False)
     ray.init()

示例#13

显示文件

文件： trial_runner.py 项目： zhangbushi10/ray

    def __init__(self,
                 search_alg=None,
                 scheduler=None,
                 local_checkpoint_dir=None,
                 remote_checkpoint_dir=None,
                 sync_to_cloud=None,
                 stopper=None,
                 resume=False,
                 server_port=None,
                 fail_fast=False,
                 checkpoint_period=None,
                 trial_executor=None,
                 callbacks=None,
                 metric=None):
        self._search_alg = search_alg or BasicVariantGenerator()
        self._scheduler_alg = scheduler or FIFOScheduler()
        self.trial_executor = trial_executor or RayTrialExecutor()
        self._pending_trial_queue_times = {}

        # Setting this to 0 still allows adding one new (pending) trial,
        # but it will prevent us from trying to fill the trial list
        self._max_pending_trials = 0  # Can be updated in `self.add_trial()`

        self._metric = metric

        if "TRIALRUNNER_WALLTIME_LIMIT" in os.environ:
            raise ValueError(
                "The TRIALRUNNER_WALLTIME_LIMIT environment variable is "
                "deprecated. "
                "Use `tune.run(time_budget_s=limit)` instead.")

        self._total_time = 0
        self._iteration = 0
        self._has_errored = False
        self._fail_fast = fail_fast
        if isinstance(self._fail_fast, str):
            self._fail_fast = self._fail_fast.upper()
            if self._fail_fast == TrialRunner.RAISE:
                warnings.warn(
                    "fail_fast='raise' detected. Be careful when using this "
                    "mode as resources (such as Ray processes, "
                    "file descriptors, and temporary files) may not be "
                    "cleaned up properly. To use "
                    "a safer mode, use fail_fast=True.")
            else:
                raise ValueError("fail_fast must be one of {bool, RAISE}. "
                                 f"Got {self._fail_fast}.")

        self._server = None
        self._server_port = server_port
        if server_port is not None:
            self._server = TuneServer(self, self._server_port)

        self._trials = []
        self._cached_trial_decisions = {}
        self._queued_trial_decisions = {}
        self._updated_queue = False

        self._stop_queue = []
        self._should_stop_experiment = False  # used by TuneServer
        self._local_checkpoint_dir = local_checkpoint_dir

        if self._local_checkpoint_dir:
            os.makedirs(self._local_checkpoint_dir, exist_ok=True)

        self._remote_checkpoint_dir = remote_checkpoint_dir
        self._syncer = get_cloud_syncer(local_checkpoint_dir,
                                        remote_checkpoint_dir, sync_to_cloud)
        self._stopper = stopper or NoopStopper()
        self._resumed = False

        if self._validate_resume(resume_type=resume):
            errored_only = False
            if isinstance(resume, str):
                errored_only = resume.upper() == "ERRORED_ONLY"
            try:
                self.resume(run_errored_only=errored_only)
                self._resumed = True
            except Exception as e:
                if has_verbosity(Verbosity.V3_TRIAL_DETAILS):
                    logger.error(str(e))
                logger.exception("Runner restore failed.")
                if self._fail_fast:
                    raise
                logger.info("Restarting experiment.")
        else:
            logger.debug("Starting a new experiment.")

        self._start_time = time.time()
        self._last_checkpoint_time = -float("inf")

        self._session_str = datetime.fromtimestamp(
            self._start_time).strftime("%Y-%m-%d_%H-%M-%S")
        self.checkpoint_file = None
        if self._local_checkpoint_dir:
            self.checkpoint_file = os.path.join(
                self._local_checkpoint_dir,
                TrialRunner.CKPT_FILE_TMPL.format(self._session_str))

        self._callbacks = CallbackList(callbacks or [])

        self._callbacks.setup()

        if checkpoint_period is None:
            checkpoint_period = os.getenv("TUNE_GLOBAL_CHECKPOINT_S", "auto")

        self._checkpoint_period = checkpoint_period
        self._checkpoint_manager = self._create_checkpoint_manager()

示例#14

显示文件

def run(run_or_experiment,
        name=None,
        stop=None,
        config=None,
        resources_per_trial=None,
        num_samples=1,
        local_dir=None,
        upload_dir=None,
        trial_name_creator=None,
        loggers=None,
        sync_to_cloud=None,
        sync_to_driver=None,
        checkpoint_freq=0,
        checkpoint_at_end=False,
        sync_on_checkpoint=True,
        keep_checkpoints_num=None,
        checkpoint_score_attr=None,
        global_checkpoint_period=10,
        export_formats=None,
        max_failures=0,
        restore=None,
        search_alg=None,
        scheduler=None,
        with_server=False,
        server_port=TuneServer.DEFAULT_PORT,
        verbose=2,
        progress_reporter=None,
        resume=False,
        queue_trials=False,
        reuse_actors=False,
        trial_executor=None,
        raise_on_failed_trial=True,
        return_trials=False,
        ray_auto_init=True,
        sync_function=None):
    """Executes training.

    Args:
        run_or_experiment (function|class|str|Experiment): If
            function|class|str, this is the algorithm or model to train.
            This may refer to the name of a built-on algorithm
            (e.g. RLLib's DQN or PPO), a user-defined trainable
            function or class, or the string identifier of a
            trainable function or class registered in the tune registry.
            If Experiment, then Tune will execute training based on
            Experiment.spec.
        name (str): Name of experiment.
        stop (dict|func): The stopping criteria. If dict, the keys may be
            any field in the return result of 'train()', whichever is
            reached first. If function, it must take (trial_id, result) as
            arguments and return a boolean (True if trial should be stopped,
            False otherwise).
        config (dict): Algorithm-specific configuration for Tune variant
            generation (e.g. env, hyperparams). Defaults to empty dict.
            Custom search algorithms may ignore this.
        resources_per_trial (dict): Machine resources to allocate per trial,
            e.g. ``{"cpu": 64, "gpu": 8}``. Note that GPUs will not be
            assigned unless you specify them here. Defaults to 1 CPU and 0
            GPUs in ``Trainable.default_resource_request()``.
        num_samples (int): Number of times to sample from the
            hyperparameter space. Defaults to 1. If `grid_search` is
            provided as an argument, the grid will be repeated
            `num_samples` of times.
        local_dir (str): Local dir to save training results to.
            Defaults to ``~/ray_results``.
        upload_dir (str): Optional URI to sync training results and checkpoints
            to (e.g. ``s3://bucket`` or ``gs://bucket``).
        trial_name_creator (func): Optional function for generating
            the trial string representation.
        loggers (list): List of logger creators to be used with
            each Trial. If None, defaults to ray.tune.logger.DEFAULT_LOGGERS.
            See `ray/tune/logger.py`.
        sync_to_cloud (func|str): Function for syncing the local_dir to and
            from upload_dir. If string, then it must be a string template that
            includes `{source}` and `{target}` for the syncer to run. If not
            provided, the sync command defaults to standard S3 or gsutil sync
            commands.
        sync_to_driver (func|str|bool): Function for syncing trial logdir from
            remote node to local. If string, then it must be a string template
            that includes `{source}` and `{target}` for the syncer to run.
            If True or not provided, it defaults to using rsync. If False,
            syncing to driver is disabled.
        checkpoint_freq (int): How many training iterations between
            checkpoints. A value of 0 (default) disables checkpointing.
        checkpoint_at_end (bool): Whether to checkpoint at the end of the
            experiment regardless of the checkpoint_freq. Default is False.
        sync_on_checkpoint (bool): Force sync-down of trial checkpoint to
            driver. If set to False, checkpoint syncing from worker to driver
            is asynchronous and best-effort. This does not affect persistent
            storage syncing. Defaults to True.
        keep_checkpoints_num (int): Number of checkpoints to keep. A value of
            `None` keeps all checkpoints. Defaults to `None`. If set, need
            to provide `checkpoint_score_attr`.
        checkpoint_score_attr (str): Specifies by which attribute to rank the
            best checkpoint. Default is increasing order. If attribute starts
            with `min-` it will rank attribute in decreasing order, i.e.
            `min-validation_loss`.
        global_checkpoint_period (int): Seconds between global checkpointing.
            This does not affect `checkpoint_freq`, which specifies frequency
            for individual trials.
        export_formats (list): List of formats that exported at the end of
            the experiment. Default is None.
        max_failures (int): Try to recover a trial at least this many times.
            Ray will recover from the latest checkpoint if present.
            Setting to -1 will lead to infinite recovery retries.
            Setting to 0 will disable retries. Defaults to 3.
        restore (str): Path to checkpoint. Only makes sense to set if
            running 1 trial. Defaults to None.
        search_alg (SearchAlgorithm): Search Algorithm. Defaults to
            BasicVariantGenerator.
        scheduler (TrialScheduler): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, HyperBand and PopulationBasedTraining. Refer to
            ray.tune.schedulers for more options.
        with_server (bool): Starts a background Tune server. Needed for
            using the Client API.
        server_port (int): Port number for launching TuneServer.
        verbose (int): 0, 1, or 2. Verbosity mode. 0 = silent,
            1 = only status updates, 2 = status and trial results.
        progress_reporter (ProgressReporter): Progress reporter for reporting
            intermediate experiment progress. Defaults to CLIReporter if
            running in command-line, or JupyterNotebookReporter if running in
            a Jupyter notebook.
        resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", or bool.
            LOCAL/True restores the checkpoint from the local_checkpoint_dir.
            REMOTE restores the checkpoint from remote_checkpoint_dir.
            PROMPT provides CLI feedback. False forces a new
            experiment. If resume is set but checkpoint does not exist,
            ValueError will be thrown.
        queue_trials (bool): Whether to queue trials when the cluster does
            not currently have enough resources to launch one. This should
            be set to True when running on an autoscaling cluster to enable
            automatic scale-up.
        reuse_actors (bool): Whether to reuse actors between different trials
            when possible. This can drastically speed up experiments that start
            and stop actors often (e.g., PBT in time-multiplexing mode). This
            requires trials to have the same resource requirements.
        trial_executor (TrialExecutor): Manage the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if there exists failed
            trial (of ERROR state) when the experiments complete.
        ray_auto_init (bool): Automatically starts a local Ray cluster
            if using a RayTrialExecutor (which is the default) and
            if Ray is not initialized. Defaults to True.
        sync_function: Deprecated. See `sync_to_cloud` and
            `sync_to_driver`.

    Returns:
        List of Trial objects.

    Raises:
        TuneError if any trials failed and `raise_on_failed_trial` is True.

    Examples:
        >>> tune.run(mytrainable, scheduler=PopulationBasedTraining())

        >>> tune.run(mytrainable, num_samples=5, reuse_actors=True)

        >>> tune.run(
        >>>     "PG",
        >>>     num_samples=5,
        >>>     config={
        >>>         "env": "CartPole-v0",
        >>>         "lr": tune.sample_from(lambda _: np.random.rand())
        >>>     }
        >>> )
    """
    trial_executor = trial_executor or RayTrialExecutor(
        queue_trials=queue_trials,
        reuse_actors=reuse_actors,
        ray_auto_init=ray_auto_init)
    if isinstance(run_or_experiment, list):
        experiments = run_or_experiment
    else:
        experiments = [run_or_experiment]
    if len(experiments) > 1:
        logger.info(
            "Running multiple concurrent experiments is experimental and may "
            "not work with certain features.")
    for i, exp in enumerate(experiments):
        if not isinstance(exp, Experiment):
            run_identifier = Experiment.register_if_needed(exp)
            experiments[i] = Experiment(
                name=name,
                run=run_identifier,
                stop=stop,
                config=config,
                resources_per_trial=resources_per_trial,
                num_samples=num_samples,
                local_dir=local_dir,
                upload_dir=upload_dir,
                sync_to_driver=sync_to_driver,
                trial_name_creator=trial_name_creator,
                loggers=loggers,
                checkpoint_freq=checkpoint_freq,
                checkpoint_at_end=checkpoint_at_end,
                sync_on_checkpoint=sync_on_checkpoint,
                keep_checkpoints_num=keep_checkpoints_num,
                checkpoint_score_attr=checkpoint_score_attr,
                export_formats=export_formats,
                max_failures=max_failures,
                restore=restore,
                sync_function=sync_function)
    else:
        logger.debug("Ignoring some parameters passed into tune.run.")

    if sync_to_cloud:
        for exp in experiments:
            assert exp.remote_checkpoint_dir, (
                "Need `upload_dir` if `sync_to_cloud` given.")

    runner = TrialRunner(
        search_alg=search_alg or BasicVariantGenerator(),
        scheduler=scheduler or FIFOScheduler(),
        local_checkpoint_dir=experiments[0].checkpoint_dir,
        remote_checkpoint_dir=experiments[0].remote_checkpoint_dir,
        sync_to_cloud=sync_to_cloud,
        checkpoint_period=global_checkpoint_period,
        resume=resume,
        launch_web_server=with_server,
        server_port=server_port,
        verbose=bool(verbose > 1),
        trial_executor=trial_executor)

    for exp in experiments:
        runner.add_experiment(exp)

    if progress_reporter is None:
        if IS_NOTEBOOK:
            progress_reporter = JupyterNotebookReporter(overwrite=verbose < 2)
        else:
            progress_reporter = CLIReporter()

    # User Warning for GPUs
    if trial_executor.has_gpus():
        if isinstance(resources_per_trial,
                      dict) and "gpu" in resources_per_trial:
            # "gpu" is manually set.
            pass
        elif _check_default_resources_override(experiments[0].run_identifier):
            # "default_resources" is manually overriden.
            pass
        else:
            logger.warning("Tune detects GPUs, but no trials are using GPUs. "
                           "To enable trials to use GPUs, set "
                           "tune.run(resources_per_trial={'gpu': 1}...) "
                           "which allows Tune to expose 1 GPU to each trial. "
                           "You can also override "
                           "`Trainable.default_resource_request` if using the "
                           "Trainable API.")

    while not runner.is_finished():
        runner.step()
        if verbose:
            _report_progress(runner, progress_reporter)

    try:
        runner.checkpoint(force=True)
    except Exception:
        logger.exception("Trial Runner checkpointing failed.")

    if verbose:
        _report_progress(runner, progress_reporter, done=True)

    wait_for_sync()

    errored_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            errored_trials += [trial]

    if errored_trials:
        if raise_on_failed_trial:
            raise TuneError("Trials did not complete", errored_trials)
        else:
            logger.error("Trials did not complete: %s", errored_trials)

    trials = runner.get_trials()
    if return_trials:
        return trials
    logger.info("Returning an analysis object by default. You can call "
                "`analysis.trials` to retrieve a list of trials. "
                "This message will be removed in future versions of Tune.")
    return ExperimentAnalysis(runner.checkpoint_file, trials=trials)

示例#15

显示文件

文件： ray_trial_executor_test.py 项目： jamescasbon/ray

 def setUp(self):
     self.trial_executor = RayTrialExecutor(queue_trials=False)
     ray.init()

示例#16

显示文件

文件： ray_trial_executor_test.py 项目： jamescasbon/ray

class RayTrialExecutorTest(unittest.TestCase):
    def setUp(self):
        self.trial_executor = RayTrialExecutor(queue_trials=False)
        ray.init()

    def tearDown(self):
        ray.shutdown()
        _register_all()  # re-register the evicted objects

    def testStartStop(self):
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        running = self.trial_executor.get_running_trials()
        self.assertEqual(1, len(running))
        self.trial_executor.stop_trial(trial)

    def testSaveRestore(self):
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        self.trial_executor.save(trial, Checkpoint.DISK)
        self.trial_executor.restore(trial)
        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def testPauseResume(self):
        """Tests that pausing works for trials in flight."""
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        self.trial_executor.pause_trial(trial)
        self.assertEqual(Trial.PAUSED, trial.status)
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def testStartFailure(self):
        _global_registry.register(TRAINABLE_CLASS, "asdf", None)
        trial = Trial("asdf", resources=Resources(1, 0))
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.ERROR, trial.status)

    def testPauseResume2(self):
        """Tests that pausing works for trials being processed."""
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        self.trial_executor.fetch_result(trial)
        self.trial_executor.pause_trial(trial)
        self.assertEqual(Trial.PAUSED, trial.status)
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def testNoResetTrial(self):
        """Tests that reset handles NotImplemented properly."""
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        exists = self.trial_executor.reset_trial(trial, {}, "modified_mock")
        self.assertEqual(exists, False)
        self.assertEqual(Trial.RUNNING, trial.status)

    def testResetTrial(self):
        """Tests that reset works as expected."""

        class B(Trainable):
            def _train(self):
                return dict(timesteps_this_iter=1, done=True)

            def reset_config(self, config):
                self.config = config
                return True

        trials = self.generate_trials({
            "run": B,
            "config": {
                "foo": 0
            },
        }, "grid_search")
        trial = trials[0]
        self.trial_executor.start_trial(trial)
        exists = self.trial_executor.reset_trial(trial, {"hi": 1},
                                                 "modified_mock")
        self.assertEqual(exists, True)
        self.assertEqual(trial.config.get("hi"), 1)
        self.assertEqual(trial.experiment_tag, "modified_mock")
        self.assertEqual(Trial.RUNNING, trial.status)

    def generate_trials(self, spec, name):
        suggester = BasicVariantGenerator()
        suggester.add_configurations({name: spec})
        return suggester.next_trials()

示例#17

显示文件

文件： tune.py 项目： ehu-ai/ray

def run(run_or_experiment,
        name=None,
        stop=None,
        config=None,
        resources_per_trial=None,
        num_samples=1,
        local_dir=None,
        upload_dir=None,
        trial_name_creator=None,
        loggers=None,
        log_to_file=False,
        sync_to_cloud=None,
        sync_to_driver=None,
        checkpoint_freq=0,
        checkpoint_at_end=False,
        sync_on_checkpoint=True,
        keep_checkpoints_num=None,
        checkpoint_score_attr=None,
        global_checkpoint_period=10,
        export_formats=None,
        max_failures=0,
        fail_fast=False,
        restore=None,
        search_alg=None,
        scheduler=None,
        with_server=False,
        server_port=TuneServer.DEFAULT_PORT,
        verbose=2,
        progress_reporter=None,
        resume=False,
        queue_trials=False,
        reuse_actors=False,
        trial_executor=None,
        raise_on_failed_trial=True,
        return_trials=False,
        ray_auto_init=True):
    """Executes training.

    Args:
        run_or_experiment (function | class | str | :class:`Experiment`): If
            function|class|str, this is the algorithm or model to train.
            This may refer to the name of a built-on algorithm
            (e.g. RLLib's DQN or PPO), a user-defined trainable
            function or class, or the string identifier of a
            trainable function or class registered in the tune registry.
            If Experiment, then Tune will execute training based on
            Experiment.spec. If you want to pass in a Python lambda, you
            will need to first register the function:
            ``tune.register_trainable("lambda_id", lambda x: ...)``. You can
            then use ``tune.run("lambda_id")``.
        name (str): Name of experiment.
        stop (dict | callable | :class:`Stopper`): Stopping criteria. If dict,
            the keys may be any field in the return result of 'train()',
            whichever is reached first. If function, it must take (trial_id,
            result) as arguments and return a boolean (True if trial should be
            stopped, False otherwise). This can also be a subclass of
            ``ray.tune.Stopper``, which allows users to implement
            custom experiment-wide stopping (i.e., stopping an entire Tune
            run based on some time constraint).
        config (dict): Algorithm-specific configuration for Tune variant
            generation (e.g. env, hyperparams). Defaults to empty dict.
            Custom search algorithms may ignore this.
        resources_per_trial (dict): Machine resources to allocate per trial,
            e.g. ``{"cpu": 64, "gpu": 8}``. Note that GPUs will not be
            assigned unless you specify them here. Defaults to 1 CPU and 0
            GPUs in ``Trainable.default_resource_request()``.
        num_samples (int): Number of times to sample from the
            hyperparameter space. Defaults to 1. If `grid_search` is
            provided as an argument, the grid will be repeated
            `num_samples` of times.
        local_dir (str): Local dir to save training results to.
            Defaults to ``~/ray_results``.
        upload_dir (str): Optional URI to sync training results and checkpoints
            to (e.g. ``s3://bucket`` or ``gs://bucket``).
        trial_name_creator (func): Optional function for generating
            the trial string representation.
        loggers (list): List of logger creators to be used with
            each Trial. If None, defaults to ray.tune.logger.DEFAULT_LOGGERS.
            See `ray/tune/logger.py`.
        log_to_file (bool|str|Sequence): Log stdout and stderr to files in
            Tune's trial directories. If this is `False` (default), no files
            are written. If `true`, outputs are written to `trialdir/stdout`
            and `trialdir/stderr`, respectively. If this is a single string,
            this is interpreted as a file relative to the trialdir, to which
            both streams are written. If this is a Sequence (e.g. a Tuple),
            it has to have length 2 and the elements indicate the files to
            which stdout and stderr are written, respectively.
        sync_to_cloud (func|str): Function for syncing the local_dir to and
            from upload_dir. If string, then it must be a string template that
            includes `{source}` and `{target}` for the syncer to run. If not
            provided, the sync command defaults to standard S3 or gsutil sync
            commands. By default local_dir is synced to remote_dir every 300
            seconds. To change this, set the TUNE_CLOUD_SYNC_S
            environment variable in the driver machine.
        sync_to_driver (func|str|bool): Function for syncing trial logdir from
            remote node to local. If string, then it must be a string template
            that includes `{source}` and `{target}` for the syncer to run.
            If True or not provided, it defaults to using rsync. If False,
            syncing to driver is disabled.
        checkpoint_freq (int): How many training iterations between
            checkpoints. A value of 0 (default) disables checkpointing.
            This has no effect when using the Functional Training API.
        checkpoint_at_end (bool): Whether to checkpoint at the end of the
            experiment regardless of the checkpoint_freq. Default is False.
            This has no effect when using the Functional Training API.
        sync_on_checkpoint (bool): Force sync-down of trial checkpoint to
            driver. If set to False, checkpoint syncing from worker to driver
            is asynchronous and best-effort. This does not affect persistent
            storage syncing. Defaults to True.
        keep_checkpoints_num (int): Number of checkpoints to keep. A value of
            `None` keeps all checkpoints. Defaults to `None`. If set, need
            to provide `checkpoint_score_attr`.
        checkpoint_score_attr (str): Specifies by which attribute to rank the
            best checkpoint. Default is increasing order. If attribute starts
            with `min-` it will rank attribute in decreasing order, i.e.
            `min-validation_loss`.
        global_checkpoint_period (int): Seconds between global checkpointing.
            This does not affect `checkpoint_freq`, which specifies frequency
            for individual trials.
        export_formats (list): List of formats that exported at the end of
            the experiment. Default is None.
        max_failures (int): Try to recover a trial at least this many times.
            Ray will recover from the latest checkpoint if present.
            Setting to -1 will lead to infinite recovery retries.
            Setting to 0 will disable retries. Defaults to 3.
        fail_fast (bool): Whether to fail upon the first error.
        restore (str): Path to checkpoint. Only makes sense to set if
            running 1 trial. Defaults to None.
        search_alg (Searcher): Search algorithm for optimization.
        scheduler (TrialScheduler): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, HyperBand and PopulationBasedTraining. Refer to
            ray.tune.schedulers for more options.
        with_server (bool): Starts a background Tune server. Needed for
            using the Client API.
        server_port (int): Port number for launching TuneServer.
        verbose (int): 0, 1, or 2. Verbosity mode. 0 = silent,
            1 = only status updates, 2 = status and trial results.
        progress_reporter (ProgressReporter): Progress reporter for reporting
            intermediate experiment progress. Defaults to CLIReporter if
            running in command-line, or JupyterNotebookReporter if running in
            a Jupyter notebook.
        resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", or bool.
            LOCAL/True restores the checkpoint from the local_checkpoint_dir.
            REMOTE restores the checkpoint from remote_checkpoint_dir.
            PROMPT provides CLI feedback. False forces a new
            experiment. If resume is set but checkpoint does not exist,
            ValueError will be thrown.
        queue_trials (bool): Whether to queue trials when the cluster does
            not currently have enough resources to launch one. This should
            be set to True when running on an autoscaling cluster to enable
            automatic scale-up.
        reuse_actors (bool): Whether to reuse actors between different trials
            when possible. This can drastically speed up experiments that start
            and stop actors often (e.g., PBT in time-multiplexing mode). This
            requires trials to have the same resource requirements.
        trial_executor (TrialExecutor): Manage the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if there exists failed
            trial (of ERROR state) when the experiments complete.
        ray_auto_init (bool): Automatically starts a local Ray cluster
            if using a RayTrialExecutor (which is the default) and
            if Ray is not initialized. Defaults to True.



    Returns:
        ExperimentAnalysis: Object for experiment analysis.

    Raises:
        TuneError: Any trials failed and `raise_on_failed_trial` is True.

    Examples:

    .. code-block:: python

        # Run 10 trials (each trial is one instance of a Trainable). Tune runs
        # in parallel and automatically determines concurrency.
        tune.run(trainable, num_samples=10)

        # Run 1 trial, stop when trial has reached 10 iterations
        tune.run(my_trainable, stop={"training_iteration": 10})

        # Run 1 trial, search over hyperparameters, stop after 10 iterations.
        space = {"lr": tune.uniform(0, 1), "momentum": tune.uniform(0, 1)}
        tune.run(my_trainable, config=space, stop={"training_iteration": 10})
    """
    config = config or {}

    trial_executor = trial_executor or RayTrialExecutor(
        queue_trials=queue_trials,
        reuse_actors=reuse_actors,
        ray_auto_init=ray_auto_init)
    if isinstance(run_or_experiment, list):
        experiments = run_or_experiment
    else:
        experiments = [run_or_experiment]

    for i, exp in enumerate(experiments):
        if not isinstance(exp, Experiment):
            experiments[i] = Experiment(
                name=name,
                run=exp,
                stop=stop,
                config=config,
                resources_per_trial=resources_per_trial,
                num_samples=num_samples,
                local_dir=local_dir,
                upload_dir=upload_dir,
                sync_to_driver=sync_to_driver,
                trial_name_creator=trial_name_creator,
                loggers=loggers,
                log_to_file=log_to_file,
                checkpoint_freq=checkpoint_freq,
                checkpoint_at_end=checkpoint_at_end,
                sync_on_checkpoint=sync_on_checkpoint,
                keep_checkpoints_num=keep_checkpoints_num,
                checkpoint_score_attr=checkpoint_score_attr,
                export_formats=export_formats,
                max_failures=max_failures,
                restore=restore)
    else:
        logger.debug("Ignoring some parameters passed into tune.run.")

    if sync_to_cloud:
        for exp in experiments:
            assert exp.remote_checkpoint_dir, (
                "Need `upload_dir` if `sync_to_cloud` given.")

    if fail_fast and max_failures != 0:
        raise ValueError("max_failures must be 0 if fail_fast=True.")

    if issubclass(type(search_alg), Searcher):
        search_alg = SearchGenerator(search_alg)

    if not search_alg:
        search_alg = BasicVariantGenerator()

    runner = TrialRunner(
        search_alg=search_alg,
        scheduler=scheduler or FIFOScheduler(),
        local_checkpoint_dir=experiments[0].checkpoint_dir,
        remote_checkpoint_dir=experiments[0].remote_checkpoint_dir,
        sync_to_cloud=sync_to_cloud,
        stopper=experiments[0].stopper,
        checkpoint_period=global_checkpoint_period,
        resume=resume,
        launch_web_server=with_server,
        server_port=server_port,
        verbose=bool(verbose > 1),
        fail_fast=fail_fast,
        trial_executor=trial_executor)

    if not runner.resumed:
        for exp in experiments:
            search_alg.add_configurations([exp])
    else:
        logger.info("TrialRunner resumed, ignoring new add_experiment.")

    if progress_reporter is None:
        if IS_NOTEBOOK:
            progress_reporter = JupyterNotebookReporter(overwrite=verbose < 2)
        else:
            progress_reporter = CLIReporter()

    # User Warning for GPUs
    if trial_executor.has_gpus():
        if isinstance(resources_per_trial,
                      dict) and "gpu" in resources_per_trial:
            # "gpu" is manually set.
            pass
        elif _check_default_resources_override(experiments[0].run_identifier):
            # "default_resources" is manually overriden.
            pass
        else:
            logger.warning("Tune detects GPUs, but no trials are using GPUs. "
                           "To enable trials to use GPUs, set "
                           "tune.run(resources_per_trial={'gpu': 1}...) "
                           "which allows Tune to expose 1 GPU to each trial. "
                           "You can also override "
                           "`Trainable.default_resource_request` if using the "
                           "Trainable API.")

    while not runner.is_finished():
        runner.step()
        if verbose:
            _report_progress(runner, progress_reporter)

    try:
        runner.checkpoint(force=True)
    except Exception:
        logger.exception("Trial Runner checkpointing failed.")

    if verbose:
        _report_progress(runner, progress_reporter, done=True)

    wait_for_sync()
    runner.cleanup_trials()

    incomplete_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            incomplete_trials += [trial]

    if incomplete_trials:
        if raise_on_failed_trial:
            raise TuneError("Trials did not complete", incomplete_trials)
        else:
            logger.error("Trials did not complete: %s", incomplete_trials)

    trials = runner.get_trials()
    if return_trials:
        return trials
    return ExperimentAnalysis(runner.checkpoint_file, trials=trials)

示例#18

显示文件

文件： test_trial_runner_pg.py 项目： ijrsvt/ray

    def testPlacementGroupRequests(self, reuse_actors=False, scheduled=10):
        """In this test we try to start 10 trials but only have resources
        for 2. Placement groups should still be created and PENDING.

        Eventually they should be scheduled sequentially (i.e. in pairs
        of two)."""
        # Since we check per-step placement groups, set the reconcilation
        # interval to 0
        os.environ["TUNE_PLACEMENT_GROUP_RECON_INTERVAL"] = "0"

        def train(config):
            time.sleep(1)
            now = time.time()
            tune.report(end=now - config["start_time"])

        head_bundle = {"CPU": 4, "GPU": 0, "custom": 0}
        child_bundle = {"custom": 1}
        # Manually calculated number of parallel trials
        max_num_parallel = 2

        placement_group_factory = PlacementGroupFactory(
            [head_bundle, child_bundle, child_bundle])

        trial_executor = RayTrialExecutor(reuse_actors=reuse_actors)

        this = self

        class _TestCallback(Callback):
            def on_step_end(self, iteration, trials, **info):
                num_finished = len([
                    t for t in trials
                    if t.status == Trial.TERMINATED or t.status == Trial.ERROR
                ])

                num_staging = sum(
                    len(s)
                    for s in trial_executor._pg_manager._staging.values())
                num_ready = sum(
                    len(s) for s in trial_executor._pg_manager._ready.values())
                num_in_use = len(trial_executor._pg_manager._in_use_pgs)
                num_cached = len(trial_executor._pg_manager._cached_pgs)

                total_num_tracked = num_staging + num_ready + num_in_use + num_cached

                num_non_removed_pgs = len([
                    p for pid, p in placement_group_table().items()
                    if p["state"] != "REMOVED"
                ])
                num_removal_scheduled_pgs = len(
                    trial_executor._pg_manager._pgs_for_removal)

                # All trials should be scheduled
                this.assertEqual(
                    scheduled,
                    min(scheduled, len(trials)),
                    msg=f"Num trials iter {iteration}",
                )

                # The following two tests were relaxed for reuse_actors=True
                # so that up to `max_num_parallel` more placement groups can
                # exist than we would expect. This is because caching
                # relies on reconciliation for cleanup to avoid overscheduling
                # of new placement groups.
                num_parallel_reuse = int(reuse_actors) * max_num_parallel

                # The number of PGs should decrease when trials finish
                this.assertGreaterEqual(
                    max(scheduled, len(trials)) - num_finished +
                    num_parallel_reuse,
                    total_num_tracked,
                    msg=f"Num tracked iter {iteration}",
                )

                # The number of actual placement groups should match this
                this.assertGreaterEqual(
                    max(scheduled, len(trials)) - num_finished +
                    num_parallel_reuse,
                    num_non_removed_pgs - num_removal_scheduled_pgs,
                    msg=f"Num actual iter {iteration}",
                )

        start = time.time()
        out = tune.run(
            train,
            config={"start_time": start},
            resources_per_trial=placement_group_factory,
            num_samples=10,
            trial_executor=trial_executor,
            callbacks=[_TestCallback()],
            reuse_actors=reuse_actors,
            verbose=2,
        )

        trial_end_times = sorted(t.last_result["end"] for t in out.trials)
        print("Trial end times:", trial_end_times)
        max_diff = trial_end_times[-1] - trial_end_times[0]

        # Not all trials have been run in parallel
        self.assertGreater(max_diff, 3)

        # Some trials should have run in parallel
        # Todo: Re-enable when using buildkite
        # self.assertLess(max_diff, 10)

        self._assertCleanup(trial_executor)

示例#19

显示文件

文件： ray_trial_executor_test.py 项目： yuanfeng0905/ray

class RayTrialExecutorTest(unittest.TestCase):
    def setUp(self):
        self.trial_executor = RayTrialExecutor(queue_trials=False)
        ray.init()

    def tearDown(self):
        ray.shutdown()
        _register_all()  # re-register the evicted objects

    def _get_trials(self):
        trials = self.generate_trials(
            {
                "run": "PPO",
                "config": {
                    "bar": {
                        "grid_search": [True, False]
                    },
                    "foo": {
                        "grid_search": [1, 2, 3]
                    },
                },
            }, "grid_search")
        return list(trials)

    def testStartStop(self):
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        running = self.trial_executor.get_running_trials()
        self.assertEqual(1, len(running))
        self.trial_executor.stop_trial(trial)

    def testSaveRestore(self):
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        self.trial_executor.save(trial, Checkpoint.DISK)
        self.trial_executor.restore(trial)
        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def generate_trials(self, spec, name):
        suggester = BasicVariantGenerator({name: spec})
        return suggester.next_trials()

示例#20

显示文件

文件： trial_runner.py 项目： wsjeon/ray

    def __init__(self,
                 search_alg=None,
                 scheduler=None,
                 launch_web_server=False,
                 local_checkpoint_dir=None,
                 remote_checkpoint_dir=None,
                 sync_to_cloud=None,
                 resume=False,
                 server_port=TuneServer.DEFAULT_PORT,
                 verbose=True,
                 checkpoint_period=10,
                 trial_executor=None):
        """Initializes a new TrialRunner.

        Args:
            search_alg (SearchAlgorithm): SearchAlgorithm for generating
                Trial objects.
            scheduler (TrialScheduler): Defaults to FIFOScheduler.
            launch_web_server (bool): Flag for starting TuneServer
            local_checkpoint_dir (str): Path where
                global checkpoints are stored and restored from.
            remote_checkpoint_dir (str): Remote path where
                global checkpoints are stored and restored from. Used
                if `resume` == REMOTE.
            resume (str|False): see `tune.py:run`.
            sync_to_cloud (func|str): see `tune.py:run`.
            server_port (int): Port number for launching TuneServer.
            verbose (bool): Flag for verbosity. If False, trial results
                will not be output.
            trial_executor (TrialExecutor): Defaults to RayTrialExecutor.
        """
        self._search_alg = search_alg or BasicVariantGenerator()
        self._scheduler_alg = scheduler or FIFOScheduler()
        self.trial_executor = trial_executor or RayTrialExecutor()

        # For debugging, it may be useful to halt trials after some time has
        # elapsed. TODO(ekl) consider exposing this in the API.
        self._global_time_limit = float(
            os.environ.get("TRIALRUNNER_WALLTIME_LIMIT", float("inf")))
        self._total_time = 0
        self._iteration = 0
        self._verbose = verbose

        self._server = None
        self._server_port = server_port
        if launch_web_server:
            self._server = TuneServer(self, self._server_port)

        self._trials = []
        self._stop_queue = []
        self._local_checkpoint_dir = local_checkpoint_dir

        if self._local_checkpoint_dir and not os.path.exists(
                self._local_checkpoint_dir):
            os.makedirs(self._local_checkpoint_dir)

        self._remote_checkpoint_dir = remote_checkpoint_dir
        self._syncer = get_syncer(local_checkpoint_dir, remote_checkpoint_dir,
                                  sync_to_cloud)

        self._resumed = False

        if self._validate_resume(resume_type=resume):
            try:
                self.resume()
                logger.info("Resuming trial.")
                self._resumed = True
            except Exception:
                logger.exception(
                    "Runner restore failed. Restarting experiment.")
        else:
            logger.info("Starting a new experiment.")

        self._start_time = time.time()
        self._last_checkpoint_time = -float("inf")
        self._checkpoint_period = checkpoint_period
        self._session_str = datetime.fromtimestamp(
            self._start_time).strftime("%Y-%m-%d_%H-%M-%S")

示例#21

显示文件

文件： tune.py 项目： ipark-CS/ray

def run(
    run_or_experiment,
    name=None,
    metric=None,
    mode=None,
    stop=None,
    time_budget_s=None,
    config=None,
    resources_per_trial=None,
    num_samples=1,
    local_dir=None,
    search_alg=None,
    scheduler=None,
    keep_checkpoints_num=None,
    checkpoint_score_attr=None,
    checkpoint_freq=0,
    checkpoint_at_end=False,
    verbose=2,
    progress_reporter=None,
    loggers=None,
    log_to_file=False,
    trial_name_creator=None,
    trial_dirname_creator=None,
    sync_config=None,
    export_formats=None,
    max_failures=0,
    fail_fast=False,
    restore=None,
    server_port=None,
    resume=False,
    reuse_actors=False,
    trial_executor=None,
    raise_on_failed_trial=True,
    # Deprecated args
    ray_auto_init=None,
    run_errored_only=None,
    queue_trials=None,
    global_checkpoint_period=None,
    with_server=None,
    upload_dir=None,
    sync_to_cloud=None,
    sync_to_driver=None,
    sync_on_checkpoint=None,
):
    """Executes training.

    Examples:

    .. code-block:: python

        # Run 10 trials (each trial is one instance of a Trainable). Tune runs
        # in parallel and automatically determines concurrency.
        tune.run(trainable, num_samples=10)

        # Run 1 trial, stop when trial has reached 10 iterations
        tune.run(my_trainable, stop={"training_iteration": 10})

        # automatically retry failed trials up to 3 times
        tune.run(my_trainable, stop={"training_iteration": 10}, max_failures=3)

        # Run 1 trial, search over hyperparameters, stop after 10 iterations.
        space = {"lr": tune.uniform(0, 1), "momentum": tune.uniform(0, 1)}
        tune.run(my_trainable, config=space, stop={"training_iteration": 10})

        # Resumes training if a previous machine crashed
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume=True)

        # Rerun ONLY failed trials after an experiment is finished.
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume="ERRORED_ONLY")

    Args:
        run_or_experiment (function | class | str | :class:`Experiment`): If
            function|class|str, this is the algorithm or model to train.
            This may refer to the name of a built-on algorithm
            (e.g. RLLib's DQN or PPO), a user-defined trainable
            function or class, or the string identifier of a
            trainable function or class registered in the tune registry.
            If Experiment, then Tune will execute training based on
            Experiment.spec. If you want to pass in a Python lambda, you
            will need to first register the function:
            ``tune.register_trainable("lambda_id", lambda x: ...)``. You can
            then use ``tune.run("lambda_id")``.
        metric (str): Metric to optimize. This metric should be reported
            with `tune.report()`. If set, will be passed to the search
            algorithm and scheduler.
        mode (str): Must be one of [min, max]. Determines whether objective is
            minimizing or maximizing the metric attribute. If set, will be
            passed to the search algorithm and scheduler.
        name (str): Name of experiment.
        stop (dict | callable | :class:`Stopper`): Stopping criteria. If dict,
            the keys may be any field in the return result of 'train()',
            whichever is reached first. If function, it must take (trial_id,
            result) as arguments and return a boolean (True if trial should be
            stopped, False otherwise). This can also be a subclass of
            ``ray.tune.Stopper``, which allows users to implement
            custom experiment-wide stopping (i.e., stopping an entire Tune
            run based on some time constraint).
        time_budget_s (int|float|datetime.timedelta): Global time budget in
            seconds after which all trials are stopped. Can also be a
            ``datetime.timedelta`` object.
        config (dict): Algorithm-specific configuration for Tune variant
            generation (e.g. env, hyperparams). Defaults to empty dict.
            Custom search algorithms may ignore this.
        resources_per_trial (dict): Machine resources to allocate per trial,
            e.g. ``{"cpu": 64, "gpu": 8}``. Note that GPUs will not be
            assigned unless you specify them here. Defaults to 1 CPU and 0
            GPUs in ``Trainable.default_resource_request()``.
        num_samples (int): Number of times to sample from the
            hyperparameter space. Defaults to 1. If `grid_search` is
            provided as an argument, the grid will be repeated
            `num_samples` of times.
        local_dir (str): Local dir to save training results to.
            Defaults to ``~/ray_results``.
        search_alg (Searcher): Search algorithm for optimization.
        scheduler (TrialScheduler): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, HyperBand and PopulationBasedTraining. Refer to
            ray.tune.schedulers for more options.
        keep_checkpoints_num (int): Number of checkpoints to keep. A value of
            `None` keeps all checkpoints. Defaults to `None`. If set, need
            to provide `checkpoint_score_attr`.
        checkpoint_score_attr (str): Specifies by which attribute to rank the
            best checkpoint. Default is increasing order. If attribute starts
            with `min-` it will rank attribute in decreasing order, i.e.
            `min-validation_loss`.
        checkpoint_freq (int): How many training iterations between
            checkpoints. A value of 0 (default) disables checkpointing.
            This has no effect when using the Functional Training API.
        checkpoint_at_end (bool): Whether to checkpoint at the end of the
            experiment regardless of the checkpoint_freq. Default is False.
            This has no effect when using the Functional Training API.
        verbose (int): 0, 1, or 2. Verbosity mode. 0 = silent,
            1 = only status updates, 2 = status and trial results.
        progress_reporter (ProgressReporter): Progress reporter for reporting
            intermediate experiment progress. Defaults to CLIReporter if
            running in command-line, or JupyterNotebookReporter if running in
            a Jupyter notebook.
        loggers (list): List of logger creators to be used with
            each Trial. If None, defaults to ray.tune.logger.DEFAULT_LOGGERS.
            See `ray/tune/logger.py`.
        log_to_file (bool|str|Sequence): Log stdout and stderr to files in
            Tune's trial directories. If this is `False` (default), no files
            are written. If `true`, outputs are written to `trialdir/stdout`
            and `trialdir/stderr`, respectively. If this is a single string,
            this is interpreted as a file relative to the trialdir, to which
            both streams are written. If this is a Sequence (e.g. a Tuple),
            it has to have length 2 and the elements indicate the files to
            which stdout and stderr are written, respectively.
        trial_name_creator (Callable[[Trial], str]): Optional function
            for generating the trial string representation.
        trial_dirname_creator (Callable[[Trial], str]): Function
            for generating the trial dirname. This function should take
            in a Trial object and return a string representing the
            name of the directory. The return value cannot be a path.
        sync_config (SyncConfig): Configuration object for syncing. See
            tune.SyncConfig.
        export_formats (list): List of formats that exported at the end of
            the experiment. Default is None.
        max_failures (int): Try to recover a trial at least this many times.
            Ray will recover from the latest checkpoint if present.
            Setting to -1 will lead to infinite recovery retries.
            Setting to 0 will disable retries. Defaults to 3.
        fail_fast (bool | str): Whether to fail upon the first error.
            If fail_fast='raise' provided, Tune will automatically
            raise the exception received by the Trainable. fail_fast='raise'
            can easily leak resources and should be used with caution (it
            is best used with `ray.init(local_mode=True)`).
        restore (str): Path to checkpoint. Only makes sense to set if
            running 1 trial. Defaults to None.
        server_port (int): Port number for launching TuneServer.
        resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", "ERRORED_ONLY",
            or bool. LOCAL/True restores the checkpoint from the
            local_checkpoint_dir, determined
            by `name` and `local_dir`. REMOTE restores the checkpoint
            from remote_checkpoint_dir. PROMPT provides CLI feedback.
            False forces a new experiment. ERRORED_ONLY resets and reruns
            ERRORED trials upon resume - previous trial artifacts will
            be left untouched.  If resume is set but checkpoint does not exist,
            ValueError will be thrown.
        reuse_actors (bool): Whether to reuse actors between different trials
            when possible. This can drastically speed up experiments that start
            and stop actors often (e.g., PBT in time-multiplexing mode). This
            requires trials to have the same resource requirements.
        trial_executor (TrialExecutor): Manage the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if there exists failed
            trial (of ERROR state) when the experiments complete.


    Returns:
        ExperimentAnalysis: Object for experiment analysis.

    Raises:
        TuneError: Any trials failed and `raise_on_failed_trial` is True.
    """
    if global_checkpoint_period:
        raise ValueError("global_checkpoint_period is deprecated. Set env var "
                         "'TUNE_GLOBAL_CHECKPOINT_S' instead.")
    if queue_trials:
        raise ValueError(
            "queue_trials is deprecated. "
            "Set env var 'TUNE_DISABLE_QUEUE_TRIALS=1' instead to "
            "disable queuing behavior.")
    if ray_auto_init:
        raise ValueError("ray_auto_init is deprecated. "
                         "Set env var 'TUNE_DISABLE_AUTO_INIT=1' instead or "
                         "call 'ray.init' before calling 'tune.run'.")
    if with_server:
        raise ValueError(
            "with_server is deprecated. It is now enabled by default "
            "if 'server_port' is not None.")
    if sync_on_checkpoint or sync_to_cloud or sync_to_driver or upload_dir:
        raise ValueError(
            "sync_on_checkpoint / sync_to_cloud / sync_to_driver / "
            "upload_dir must now be set via `tune.run("
            "sync_config=SyncConfig(...)`. See `ray.tune.SyncConfig` for "
            "more details.")

    if mode and mode not in ["min", "max"]:
        raise ValueError(
            "The `mode` parameter passed to `tune.run()` has to be one of "
            "['min', 'max']")

    config = config or {}
    sync_config = sync_config or SyncConfig()
    set_sync_periods(sync_config)

    trial_executor = trial_executor or RayTrialExecutor(
        reuse_actors=reuse_actors)
    if isinstance(run_or_experiment, list):
        experiments = run_or_experiment
    else:
        experiments = [run_or_experiment]

    for i, exp in enumerate(experiments):
        if not isinstance(exp, Experiment):
            experiments[i] = Experiment(
                name=name,
                run=exp,
                stop=stop,
                time_budget_s=time_budget_s,
                config=config,
                resources_per_trial=resources_per_trial,
                num_samples=num_samples,
                local_dir=local_dir,
                upload_dir=sync_config.upload_dir,
                sync_to_driver=sync_config.sync_to_driver,
                trial_name_creator=trial_name_creator,
                trial_dirname_creator=trial_dirname_creator,
                loggers=loggers,
                log_to_file=log_to_file,
                checkpoint_freq=checkpoint_freq,
                checkpoint_at_end=checkpoint_at_end,
                sync_on_checkpoint=sync_config.sync_on_checkpoint,
                keep_checkpoints_num=keep_checkpoints_num,
                checkpoint_score_attr=checkpoint_score_attr,
                export_formats=export_formats,
                max_failures=max_failures,
                restore=restore)
    else:
        logger.debug("Ignoring some parameters passed into tune.run.")

    if sync_config.sync_to_cloud:
        for exp in experiments:
            assert exp.remote_checkpoint_dir, (
                "Need `upload_dir` if `sync_to_cloud` given.")

    if fail_fast and max_failures != 0:
        raise ValueError("max_failures must be 0 if fail_fast=True.")

    if issubclass(type(search_alg), Searcher):
        search_alg = SearchGenerator(search_alg)

    if not search_alg:
        search_alg = BasicVariantGenerator()

    if config and not search_alg.set_search_properties(metric, mode, config):
        if has_unresolved_values(config):
            raise ValueError(
                "You passed a `config` parameter to `tune.run()` with "
                "unresolved parameters, but the search algorithm was already "
                "instantiated with a search space. Make sure that `config` "
                "does not contain any more parameter definitions - include "
                "them in the search algorithm's search space if necessary.")

    scheduler = scheduler or FIFOScheduler()
    if not scheduler.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the scheduler you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your scheduler or from your call to `tune.run()`")

    runner = TrialRunner(
        search_alg=search_alg,
        scheduler=scheduler,
        local_checkpoint_dir=experiments[0].checkpoint_dir,
        remote_checkpoint_dir=experiments[0].remote_checkpoint_dir,
        sync_to_cloud=sync_config.sync_to_cloud,
        stopper=experiments[0].stopper,
        resume=resume,
        server_port=server_port,
        verbose=bool(verbose > 1),
        fail_fast=fail_fast,
        trial_executor=trial_executor)

    if not runner.resumed:
        for exp in experiments:
            search_alg.add_configurations([exp])
    else:
        logger.info("TrialRunner resumed, ignoring new add_experiment.")

    if progress_reporter is None:
        if IS_NOTEBOOK:
            progress_reporter = JupyterNotebookReporter(overwrite=verbose < 2)
        else:
            progress_reporter = CLIReporter()

    # User Warning for GPUs
    if trial_executor.has_gpus():
        if isinstance(resources_per_trial,
                      dict) and "gpu" in resources_per_trial:
            # "gpu" is manually set.
            pass
        elif _check_default_resources_override(experiments[0].run_identifier):
            # "default_resources" is manually overriden.
            pass
        else:
            logger.warning("Tune detects GPUs, but no trials are using GPUs. "
                           "To enable trials to use GPUs, set "
                           "tune.run(resources_per_trial={'gpu': 1}...) "
                           "which allows Tune to expose 1 GPU to each trial. "
                           "You can also override "
                           "`Trainable.default_resource_request` if using the "
                           "Trainable API.")

    while not runner.is_finished():
        runner.step()
        if verbose:
            _report_progress(runner, progress_reporter)

    try:
        runner.checkpoint(force=True)
    except Exception as e:
        logger.warning(f"Trial Runner checkpointing failed: {str(e)}")

    if verbose:
        _report_progress(runner, progress_reporter, done=True)

    wait_for_sync()
    runner.cleanup_trials()

    incomplete_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            incomplete_trials += [trial]

    if incomplete_trials:
        if raise_on_failed_trial:
            raise TuneError("Trials did not complete", incomplete_trials)
        else:
            logger.error("Trials did not complete: %s", incomplete_trials)

    trials = runner.get_trials()
    return ExperimentAnalysis(runner.checkpoint_file,
                              trials=trials,
                              default_metric=metric,
                              default_mode=mode)

示例#22

显示文件

文件： trial_runner.py 项目： linwukang/ray

    def __init__(self,
                 search_alg=None,
                 scheduler=None,
                 local_checkpoint_dir=None,
                 remote_checkpoint_dir=None,
                 sync_to_cloud=None,
                 stopper=None,
                 resume=False,
                 server_port=None,
                 fail_fast=False,
                 verbose=True,
                 checkpoint_period=None,
                 trial_executor=None):
        self._search_alg = search_alg or BasicVariantGenerator()
        self._scheduler_alg = scheduler or FIFOScheduler()
        self.trial_executor = trial_executor or RayTrialExecutor()

        # For debugging, it may be useful to halt trials after some time has
        # elapsed. TODO(ekl) consider exposing this in the API.
        self._global_time_limit = float(
            os.environ.get("TRIALRUNNER_WALLTIME_LIMIT", float("inf")))
        self._total_time = 0
        self._iteration = 0
        self._has_errored = False
        self._fail_fast = fail_fast
        if isinstance(self._fail_fast, str):
            self._fail_fast = self._fail_fast.upper()
            if self._fail_fast == TrialRunner.RAISE:
                logger.warning(
                    "fail_fast='raise' detected. Be careful when using this "
                    "mode as resources (such as Ray processes, "
                    "file descriptors, and temporary files) may not be "
                    "cleaned up properly. To use "
                    "a safer mode, use fail_fast=True.")
            else:
                raise ValueError("fail_fast must be one of {bool, RAISE}. "
                                 f"Got {self._fail_fast}.")
        self._verbose = verbose

        self._server = None
        self._server_port = server_port
        if server_port is not None:
            self._server = TuneServer(self, self._server_port)

        self._trials = []
        self._cached_trial_decisions = {}
        self._stop_queue = []
        self._should_stop_experiment = False  # used by TuneServer
        self._local_checkpoint_dir = local_checkpoint_dir

        if self._local_checkpoint_dir:
            os.makedirs(self._local_checkpoint_dir, exist_ok=True)

        self._remote_checkpoint_dir = remote_checkpoint_dir
        self._syncer = get_cloud_syncer(local_checkpoint_dir,
                                        remote_checkpoint_dir, sync_to_cloud)
        self._stopper = stopper or NoopStopper()
        self._resumed = False

        if self._validate_resume(resume_type=resume):
            errored_only = False
            if isinstance(resume, str):
                errored_only = resume.upper() == "ERRORED_ONLY"
            try:
                self.resume(run_errored_only=errored_only)
                self._resumed = True
            except Exception as e:
                if self._verbose:
                    logger.error(str(e))
                logger.exception("Runner restore failed.")
                if self._fail_fast:
                    raise
                logger.info("Restarting experiment.")
        else:
            logger.debug("Starting a new experiment.")

        self._start_time = time.time()
        self._last_checkpoint_time = -float("inf")
        if checkpoint_period is None:
            checkpoint_period = env_integer("TUNE_GLOBAL_CHECKPOINT_S", 10)
        self._checkpoint_period = checkpoint_period
        self._session_str = datetime.fromtimestamp(
            self._start_time).strftime("%Y-%m-%d_%H-%M-%S")
        self.checkpoint_file = None
        if self._local_checkpoint_dir:
            self.checkpoint_file = os.path.join(
                self._local_checkpoint_dir,
                TrialRunner.CKPT_FILE_TMPL.format(self._session_str))

示例#23

显示文件

文件： test_ray_trial_executor.py 项目： marload/ray

class RayTrialExecutorTest(unittest.TestCase):
    def setUp(self):
        # Wait up to five seconds for placement groups when starting a trial
        os.environ["TUNE_PLACEMENT_GROUP_WAIT_S"] = "5"
        # Block for results even when placement groups are pending
        os.environ["TUNE_TRIAL_STARTUP_GRACE_PERIOD"] = "0"
        os.environ["TUNE_TRIAL_RESULT_WAIT_TIME_S"] = "99999"

        self.trial_executor = RayTrialExecutor(queue_trials=False)
        ray.init(num_cpus=2, ignore_reinit_error=True)
        _register_all()  # Needed for flaky tests

    def tearDown(self):
        ray.shutdown()
        _register_all()  # re-register the evicted objects

    def testStartStop(self):
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        running = self.trial_executor.get_running_trials()
        self.assertEqual(1, len(running))
        self.trial_executor.stop_trial(trial)

    def testAsyncSave(self):
        """Tests that saved checkpoint value not immediately set."""
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        trial.last_result = self.trial_executor.fetch_result(trial)[-1]
        checkpoint = self.trial_executor.save(trial, Checkpoint.PERSISTENT)
        self.assertEqual(checkpoint, trial.saving_to)
        self.assertEqual(trial.checkpoint.value, None)
        self.process_trial_save(trial)
        self.assertEqual(checkpoint, trial.checkpoint)
        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def testSaveRestore(self):
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        trial.last_result = self.trial_executor.fetch_result(trial)[-1]
        self.trial_executor.save(trial, Checkpoint.PERSISTENT)
        self.process_trial_save(trial)
        self.trial_executor.restore(trial)
        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def testPauseResume(self):
        """Tests that pausing works for trials in flight."""
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        self.trial_executor.pause_trial(trial)
        self.assertEqual(Trial.PAUSED, trial.status)
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def testSavePauseResumeErrorRestore(self):
        """Tests that pause checkpoint does not replace restore checkpoint."""
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        trial.last_result = self.trial_executor.fetch_result(trial)[-1]
        # Save
        checkpoint = self.trial_executor.save(trial, Checkpoint.PERSISTENT)
        self.assertEqual(Trial.RUNNING, trial.status)
        self.assertEqual(checkpoint.storage, Checkpoint.PERSISTENT)
        # Process save result (simulates trial runner)
        self.process_trial_save(trial)
        # Train
        self.trial_executor.continue_training(trial)
        trial.last_result = self.trial_executor.fetch_result(trial)[-1]
        # Pause
        self.trial_executor.pause_trial(trial)
        self.assertEqual(Trial.PAUSED, trial.status)
        self.assertEqual(trial.checkpoint.storage, Checkpoint.MEMORY)
        # Resume
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        # Error
        trial.set_status(Trial.ERROR)
        # Restore
        self.trial_executor.restore(trial)
        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def testStartFailure(self):
        _global_registry.register(TRAINABLE_CLASS, "asdf", None)
        trial = Trial("asdf", resources=Resources(1, 0))
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.ERROR, trial.status)

    def testPauseResume2(self):
        """Tests that pausing works for trials being processed."""
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        self.trial_executor.fetch_result(trial)
        checkpoint = self.trial_executor.pause_trial(trial)
        self.assertEqual(Trial.PAUSED, trial.status)
        self.trial_executor.start_trial(trial, checkpoint)
        self.assertEqual(Trial.RUNNING, trial.status)
        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def _testPauseUnpause(self, result_buffer_length):
        """Tests that unpausing works for trials being processed."""
        os.environ["TUNE_RESULT_BUFFER_LENGTH"] = f"{result_buffer_length}"
        os.environ["TUNE_RESULT_BUFFER_MIN_TIME_S"] = "1"

        # Need a new trial executor so the ENV vars are parsed again
        self.trial_executor = RayTrialExecutor(queue_trials=False)

        base = max(result_buffer_length, 1)

        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        trial.last_result = self.trial_executor.fetch_result(trial)[-1]
        self.assertEqual(trial.last_result.get(TRAINING_ITERATION), base)
        self.trial_executor.pause_trial(trial)
        self.assertEqual(Trial.PAUSED, trial.status)
        self.trial_executor.unpause_trial(trial)
        self.assertEqual(Trial.PENDING, trial.status)
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        trial.last_result = self.trial_executor.fetch_result(trial)[-1]
        self.assertEqual(trial.last_result.get(TRAINING_ITERATION), base * 2)
        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def testPauseUnpauseNoBuffer(self):
        self._testPauseUnpause(0)

    def testPauseUnpauseTrivialBuffer(self):
        self._testPauseUnpause(1)

    def testPauseUnpauseActualBuffer(self):
        self._testPauseUnpause(8)

    def testNoResetTrial(self):
        """Tests that reset handles NotImplemented properly."""
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        exists = self.trial_executor.reset_trial(trial, {}, "modified_mock")
        self.assertEqual(exists, False)
        self.assertEqual(Trial.RUNNING, trial.status)

    def testResetTrial(self):
        """Tests that reset works as expected."""
        class B(Trainable):
            def step(self):
                return dict(timesteps_this_iter=1, done=True)

            def reset_config(self, config):
                self.config = config
                return True

        trials = self.generate_trials({
            "run": B,
            "config": {
                "foo": 0
            },
        }, "grid_search")
        trial = trials[0]
        self.trial_executor.start_trial(trial)
        exists = self.trial_executor.reset_trial(trial, {"hi": 1},
                                                 "modified_mock")
        self.assertEqual(exists, True)
        self.assertEqual(trial.config.get("hi"), 1)
        self.assertEqual(trial.experiment_tag, "modified_mock")
        self.assertEqual(Trial.RUNNING, trial.status)

    def testForceTrialCleanup(self):
        class B(Trainable):
            def step(self):
                print("Step start")
                time.sleep(10)
                print("Step done")
                return dict(my_metric=1, timesteps_this_iter=1, done=True)

            def reset_config(self, config):
                self.config = config
                return True

            def cleanup(self):
                print("Cleanup start")
                time.sleep(10)
                print("Cleanup done")

        # First check if the trials terminate gracefully by default
        trials = self.generate_trials({
            "run": B,
            "config": {
                "foo": 0
            },
        }, "grid_search")
        trial = trials[0]
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        time.sleep(5)
        print("Stop trial")
        self.trial_executor.stop_trial(trial)
        print("Start trial cleanup")
        start = time.time()
        self.trial_executor.cleanup([trial])
        self.assertGreaterEqual(time.time() - start, 12.0)

        # Check forceful termination. It should run for much less than the
        # sleep periods in the Trainable
        trials = self.generate_trials({
            "run": B,
            "config": {
                "foo": 0
            },
        }, "grid_search")
        trial = trials[0]
        os.environ["TUNE_FORCE_TRIAL_CLEANUP_S"] = "1"
        self.trial_executor = RayTrialExecutor(queue_trials=False)
        os.environ["TUNE_FORCE_TRIAL_CLEANUP_S"] = "0"
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        time.sleep(5)
        print("Stop trial")
        self.trial_executor.stop_trial(trial)
        print("Start trial cleanup")
        start = time.time()
        self.trial_executor.cleanup([trial])
        self.assertLess(time.time() - start, 5.0)

        # also check if auto-filled metrics were returned
        self.assertIn(PID, trial.last_result)
        self.assertIn(TRIAL_ID, trial.last_result)
        self.assertNotIn("my_metric", trial.last_result)

    @staticmethod
    def generate_trials(spec, name):
        suggester = BasicVariantGenerator()
        suggester.add_configurations({name: spec})
        trials = []
        while not suggester.is_finished():
            trial = suggester.next_trial()
            if trial:
                trials.append(trial)
            else:
                break
        return trials

    def process_trial_save(self, trial):
        """Simulates trial runner save."""
        checkpoint = trial.saving_to
        checkpoint_value = self.trial_executor.fetch_result(trial)[-1]
        checkpoint.value = checkpoint_value
        trial.on_checkpoint(checkpoint)

示例#24

显示文件

    def __init__(self,
                 run_or_experiment,
                 name=None,
                 stop=None,
                 config=None,
                 resources_per_trial=None,
                 num_samples=1,
                 local_dir=None,
                 upload_dir=None,
                 trial_name_creator=None,
                 loggers=None,
                 sync_to_cloud=None,
                 sync_to_driver=False,
                 checkpoint_freq=0,
                 checkpoint_at_end=False,
                 sync_on_checkpoint=True,
                 keep_checkpoints_num=None,
                 checkpoint_score_attr=None,
                 global_checkpoint_period=10,
                 export_formats=None,
                 max_failures=0,
                 fail_fast=True,
                 restore=None,
                 search_alg=None,
                 scheduler=None,
                 with_server=False,
                 server_port=TuneServer.DEFAULT_PORT,
                 verbose=0,
                 progress_reporter=None,
                 resume=False,
                 queue_trials=False,
                 reuse_actors=False,
                 trial_executor=None,
                 raise_on_failed_trial=True,
                 return_trials=False,
                 ray_auto_init=True,
                 shuffle=False):

        if loggers is None:
            loggers = [JsonLogger, CSVLogger]
        config = _transform_config(config)

        is_trainable = False
        try:
            if issubclass(run_or_experiment, Trainable):
                is_trainable = True
        except TypeError:
            pass

        if not is_trainable:
            run_or_experiment = wrap_function(run_or_experiment)

        self.trial_executor = trial_executor or RayTrialExecutor(
            queue_trials=queue_trials,
            reuse_actors=reuse_actors,
            ray_auto_init=ray_auto_init)

        experiments = [run_or_experiment]
        self.logger = logging.getLogger(__name__)

        for i, exp in enumerate(experiments):
            if not isinstance(exp, Experiment):
                run_identifier = Experiment.register_if_needed(exp)
                experiments[i] = Experiment(
                    name=name,
                    run=run_identifier,
                    stop=stop,
                    config=config,
                    resources_per_trial=resources_per_trial,
                    num_samples=num_samples,
                    local_dir=local_dir,
                    upload_dir=upload_dir,
                    sync_to_driver=sync_to_driver,
                    trial_name_creator=trial_name_creator,
                    loggers=loggers,
                    checkpoint_freq=checkpoint_freq,
                    checkpoint_at_end=checkpoint_at_end,
                    sync_on_checkpoint=sync_on_checkpoint,
                    keep_checkpoints_num=keep_checkpoints_num,
                    checkpoint_score_attr=checkpoint_score_attr,
                    export_formats=export_formats,
                    max_failures=max_failures,
                    restore=restore)

        if fail_fast and max_failures != 0:
            raise ValueError("max_failures must be 0 if fail_fast=True.")

        self.runner = TrialRunner(
            search_alg=search_alg or BasicVariantGenerator(shuffle=shuffle),
            scheduler=scheduler or FIFOScheduler(),
            local_checkpoint_dir=experiments[0].checkpoint_dir,
            remote_checkpoint_dir=experiments[0].remote_checkpoint_dir,
            sync_to_cloud=sync_to_cloud,
            stopper=experiments[0].stopper,
            checkpoint_period=global_checkpoint_period,
            resume=resume,
            launch_web_server=with_server,
            server_port=server_port,
            verbose=bool(verbose > 1),
            fail_fast=fail_fast,
            trial_executor=self.trial_executor)

        for exp in experiments:
            self.runner.add_experiment(exp)

        self._is_worker_stopped = threading.Event()
        self._worker_exc = None
        self._worker = threading.Thread(target=self.step_worker, daemon=True)
        self._worker.start()

        atexit.register(self.stop)

示例#25

显示文件

文件： tune.py 项目： quantumahesh/Project-Ray

def run(run_or_experiment,
        name=None,
        stop=None,
        config=None,
        resources_per_trial=None,
        num_samples=1,
        local_dir=None,
        upload_dir=None,
        trial_name_creator=None,
        loggers=None,
        sync_to_cloud=None,
        sync_to_driver=None,
        checkpoint_freq=0,
        checkpoint_at_end=False,
        export_formats=None,
        max_failures=3,
        restore=None,
        search_alg=None,
        scheduler=None,
        with_server=False,
        server_port=TuneServer.DEFAULT_PORT,
        verbose=2,
        resume=False,
        queue_trials=False,
        reuse_actors=True,
        trial_executor=None,
        raise_on_failed_trial=True,
        return_trials=True,
        ray_auto_init=True,
        sync_function=None):
    """Executes training.

    Args:
        run_or_experiment (function|class|str|Experiment): If
            function|class|str, this is the algorithm or model to train.
            This may refer to the name of a built-on algorithm
            (e.g. RLLib's DQN or PPO), a user-defined trainable
            function or class, or the string identifier of a
            trainable function or class registered in the tune registry.
            If Experiment, then Tune will execute training based on
            Experiment.spec.
        name (str): Name of experiment.
        stop (dict): The stopping criteria. The keys may be any field in
            the return result of 'train()', whichever is reached first.
            Defaults to empty dict.
        config (dict): Algorithm-specific configuration for Tune variant
            generation (e.g. env, hyperparams). Defaults to empty dict.
            Custom search algorithms may ignore this.
        resources_per_trial (dict): Machine resources to allocate per trial,
            e.g. ``{"cpu": 64, "gpu": 8}``. Note that GPUs will not be
            assigned unless you specify them here. Defaults to 1 CPU and 0
            GPUs in ``Trainable.default_resource_request()``.
        num_samples (int): Number of times to sample from the
            hyperparameter space. Defaults to 1. If `grid_search` is
            provided as an argument, the grid will be repeated
            `num_samples` of times.
        local_dir (str): Local dir to save training results to.
            Defaults to ``~/ray_results``.
        upload_dir (str): Optional URI to sync training results
            to (e.g. ``s3://bucket``).
        trial_name_creator (func): Optional function for generating
            the trial string representation.
        loggers (list): List of logger creators to be used with
            each Trial. If None, defaults to ray.tune.logger.DEFAULT_LOGGERS.
            See `ray/tune/logger.py`.
        sync_to_cloud (func|str): Function for syncing the local_dir to and
            from upload_dir. If string, then it must be a string template
            that includes `{source}` and `{target}` for the syncer to run.
            If not provided, the sync command defaults to standard
            S3 or gsutil sync comamnds.
        sync_to_driver (func|str): Function for syncing trial logdir from
            remote node to local. If string, then it must be a string template
            that includes `{source}` and `{target}` for the syncer to run.
            If not provided, defaults to using rsync.
        checkpoint_freq (int): How many training iterations between
            checkpoints. A value of 0 (default) disables checkpointing.
        checkpoint_at_end (bool): Whether to checkpoint at the end of the
            experiment regardless of the checkpoint_freq. Default is False.
        export_formats (list): List of formats that exported at the end of
            the experiment. Default is None.
        max_failures (int): Try to recover a trial from its last
            checkpoint at least this many times. Only applies if
            checkpointing is enabled. Setting to -1 will lead to infinite
            recovery retries. Defaults to 3.
        restore (str): Path to checkpoint. Only makes sense to set if
            running 1 trial. Defaults to None.
        search_alg (SearchAlgorithm): Search Algorithm. Defaults to
            BasicVariantGenerator.
        scheduler (TrialScheduler): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, and HyperBand.
        with_server (bool): Starts a background Tune server. Needed for
            using the Client API.
        server_port (int): Port number for launching TuneServer.
        verbose (int): 0, 1, or 2. Verbosity mode. 0 = silent,
            1 = only status updates, 2 = status and trial results.
        resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", or bool.
            LOCAL/True restores the checkpoint from the local_checkpoint_dir.
            REMOTE restores the checkpoint from remote_checkpoint_dir.
            PROMPT provides CLI feedback. False forces a new
            experiment. If resume is set but checkpoint does not exist,
            ValueError will be thrown.
        queue_trials (bool): Whether to queue trials when the cluster does
            not currently have enough resources to launch one. This should
            be set to True when running on an autoscaling cluster to enable
            automatic scale-up.
        reuse_actors (bool): Whether to reuse actors between different trials
            when possible. This can drastically speed up experiments that start
            and stop actors often (e.g., PBT in time-multiplexing mode). This
            requires trials to have the same resource requirements.
        trial_executor (TrialExecutor): Manage the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if there exists failed
            trial (of ERROR state) when the experiments complete.
        ray_auto_init (bool): Automatically starts a local Ray cluster
            if using a RayTrialExecutor (which is the default) and
            if Ray is not initialized. Defaults to True.
        sync_function: Deprecated. See `sync_to_cloud` and
            `sync_to_driver`.

    Returns:
        List of Trial objects.

    Raises:
        TuneError if any trials failed and `raise_on_failed_trial` is True.

    Examples:
        >>> tune.run(mytrainable, scheduler=PopulationBasedTraining())

        >>> tune.run(mytrainable, num_samples=5, reuse_actors=True)

        >>> tune.run(
                "PG",
                num_samples=5,
                config={
                    "env": "CartPole-v0",
                    "lr": tune.sample_from(lambda _: np.random.rand())
                }
            )
    """
    trial_executor = trial_executor or RayTrialExecutor(
        queue_trials=queue_trials,
        reuse_actors=reuse_actors,
        ray_auto_init=ray_auto_init)
    experiment = run_or_experiment
    if not isinstance(run_or_experiment, Experiment):
        run_identifier = Experiment._register_if_needed(run_or_experiment)
        experiment = Experiment(
            name=name,
            run=run_identifier,
            stop=stop,
            config=config,
            resources_per_trial=resources_per_trial,
            num_samples=num_samples,
            local_dir=local_dir,
            upload_dir=upload_dir,
            sync_to_driver=sync_to_driver,
            trial_name_creator=trial_name_creator,
            loggers=loggers,
            checkpoint_freq=checkpoint_freq,
            checkpoint_at_end=checkpoint_at_end,
            export_formats=export_formats,
            max_failures=max_failures,
            restore=restore,
            sync_function=sync_function)
    else:
        logger.debug("Ignoring some parameters passed into tune.run.")

    if sync_to_cloud:
        assert experiment.remote_checkpoint_dir, (
            "Need `upload_dir` if `sync_to_cloud` given.")

    runner = TrialRunner(
        search_alg=search_alg or BasicVariantGenerator(),
        scheduler=scheduler or FIFOScheduler(),
        local_checkpoint_dir=experiment.checkpoint_dir,
        remote_checkpoint_dir=experiment.remote_checkpoint_dir,
        sync_to_cloud=sync_to_cloud,
        resume=resume,
        launch_web_server=with_server,
        server_port=server_port,
        verbose=bool(verbose > 1),
        trial_executor=trial_executor)

    runner.add_experiment(experiment)

    if verbose:
        print(runner.debug_string(max_debug=99999))

    last_debug = 0
    while not runner.is_finished():
        runner.step()
        if time.time() - last_debug > DEBUG_PRINT_INTERVAL:
            if verbose:
                print(runner.debug_string())
            last_debug = time.time()

    if verbose:
        print(runner.debug_string(max_debug=99999))

    wait_for_sync()

    errored_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            errored_trials += [trial]

    if errored_trials:
        if raise_on_failed_trial:
            raise TuneError("Trials did not complete", errored_trials)
        else:
            logger.error("Trials did not complete: %s", errored_trials)

    if return_trials:
        return runner.get_trials()
    return ExperimentAnalysis(experiment.checkpoint_dir)

示例#26

显示文件

    def __init__(self,
                 search_alg=None,
                 scheduler=None,
                 launch_web_server=False,
                 local_checkpoint_dir=None,
                 remote_checkpoint_dir=None,
                 sync_to_cloud=None,
                 stopper=None,
                 resume=False,
                 server_port=TuneServer.DEFAULT_PORT,
                 fail_fast=False,
                 verbose=True,
                 checkpoint_period=10,
                 trial_executor=None):
        self._search_alg = search_alg or BasicVariantGenerator()
        self._scheduler_alg = scheduler or FIFOScheduler()
        self.trial_executor = trial_executor or RayTrialExecutor()

        # For debugging, it may be useful to halt trials after some time has
        # elapsed. TODO(ekl) consider exposing this in the API.
        self._global_time_limit = float(
            os.environ.get("TRIALRUNNER_WALLTIME_LIMIT", float("inf")))
        self._total_time = 0
        self._iteration = 0
        self._has_errored = False
        self._fail_fast = fail_fast
        self._verbose = verbose

        self._server = None
        self._server_port = server_port
        if launch_web_server:
            self._server = TuneServer(self, self._server_port)

        self._trials = []
        self._cached_trial_decisions = {}
        self._stop_queue = []
        self._should_stop_experiment = False  # used by TuneServer
        self._local_checkpoint_dir = local_checkpoint_dir

        if self._local_checkpoint_dir:
            os.makedirs(self._local_checkpoint_dir, exist_ok=True)

        self._remote_checkpoint_dir = remote_checkpoint_dir
        self._syncer = get_cloud_syncer(local_checkpoint_dir,
                                        remote_checkpoint_dir, sync_to_cloud)
        self._stopper = stopper or NoopStopper()
        self._resumed = False

        if self._validate_resume(resume_type=resume):
            try:
                self.resume()
                logger.info("Resuming trial.")
                self._resumed = True
            except Exception:
                logger.exception(
                    "Runner restore failed. Restarting experiment.")
        else:
            logger.debug("Starting a new experiment.")

        self._start_time = time.time()
        self._last_checkpoint_time = -float("inf")
        self._checkpoint_period = checkpoint_period
        self._session_str = datetime.fromtimestamp(
            self._start_time).strftime("%Y-%m-%d_%H-%M-%S")
        self.checkpoint_file = None
        if self._local_checkpoint_dir:
            self.checkpoint_file = os.path.join(
                self._local_checkpoint_dir,
                TrialRunner.CKPT_FILE_TMPL.format(self._session_str))

示例#27

显示文件

文件： test_trial_runner_3.py 项目： tuyulers5/jav44

    def testPlacementGroupDistributedTraining(self):
        """Run distributed training using placement groups.

        Each trial requests 4 CPUs and starts 4 remote training workers.
        """
        def placement_group_factory():
            head_bundle = {"CPU": 1, "GPU": 0, "custom": 0}
            child_bundle = {"CPU": 1}

            return placement_group(
                [head_bundle, child_bundle, child_bundle, child_bundle])

        @ray.remote
        class TrainingActor:
            def train(self, val):
                time.sleep(1)
                return val

        def train(config):
            base = config["base"]
            actors = [TrainingActor.remote() for _ in range(4)]
            futures = [
                actor.train.remote(base + 2 * i)
                for i, actor in enumerate(actors)
            ]
            results = ray.get(futures)

            end = time.time() - config["start_time"]
            tune.report(avg=np.mean(results), end=end)

        trial_executor = RayTrialExecutor()

        start = time.time()
        out = tune.run(train,
                       config={
                           "start_time": start,
                           "base": tune.grid_search(list(range(0, 100, 10)))
                       },
                       resources_per_trial=placement_group_factory,
                       num_samples=1,
                       trial_executor=trial_executor)

        avgs = sorted(t.last_result["avg"] for t in out.trials)
        self.assertSequenceEqual(avgs, list(range(3, 103, 10)))

        trial_end_times = sorted(t.last_result["end"] for t in out.trials)
        print("Trial end times:", trial_end_times)
        max_diff = trial_end_times[-1] - trial_end_times[0]

        # Not all trials have been run in parallel
        self.assertGreater(max_diff, 5)

        # Some trials should have run in parallel
        # Todo: Re-enable when using buildkite
        # self.assertLess(max_diff, 10)

        # Assert proper cleanup
        pg_manager = trial_executor._pg_manager
        self.assertFalse(pg_manager._in_use_trials)
        self.assertFalse(pg_manager._in_use_pgs)
        self.assertFalse(pg_manager._staging_futures)
        for pgf in pg_manager._staging:
            self.assertFalse(pg_manager._staging[pgf])
        for pgf in pg_manager._ready:
            self.assertFalse(pg_manager._ready[pgf])
        self.assertTrue(pg_manager._latest_staging_start_time)

示例#28

显示文件

class RayTrialExecutorTest(unittest.TestCase):
    def setUp(self):
        self.trial_executor = RayTrialExecutor(queue_trials=False)
        ray.init()
        _register_all()  # Needed for flaky tests

    def tearDown(self):
        ray.shutdown()
        _register_all()  # re-register the evicted objects

    def testStartStop(self):
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        running = self.trial_executor.get_running_trials()
        self.assertEqual(1, len(running))
        self.trial_executor.stop_trial(trial)

    def testSaveRestore(self):
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        self.trial_executor.save(trial, Checkpoint.DISK)
        self.trial_executor.restore(trial)
        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def testPauseResume(self):
        """Tests that pausing works for trials in flight."""
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        self.trial_executor.pause_trial(trial)
        self.assertEqual(Trial.PAUSED, trial.status)
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def testStartFailure(self):
        _global_registry.register(TRAINABLE_CLASS, "asdf", None)
        trial = Trial("asdf", resources=Resources(1, 0))
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.ERROR, trial.status)

    def testPauseResume2(self):
        """Tests that pausing works for trials being processed."""
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        self.trial_executor.fetch_result(trial)
        self.trial_executor.pause_trial(trial)
        self.assertEqual(Trial.PAUSED, trial.status)
        self.trial_executor.start_trial(trial)
        self.assertEqual(Trial.RUNNING, trial.status)
        self.trial_executor.stop_trial(trial)
        self.assertEqual(Trial.TERMINATED, trial.status)

    def testNoResetTrial(self):
        """Tests that reset handles NotImplemented properly."""
        trial = Trial("__fake")
        self.trial_executor.start_trial(trial)
        exists = self.trial_executor.reset_trial(trial, {}, "modified_mock")
        self.assertEqual(exists, False)
        self.assertEqual(Trial.RUNNING, trial.status)

    def testResetTrial(self):
        """Tests that reset works as expected."""
        class B(Trainable):
            def _train(self):
                return dict(timesteps_this_iter=1, done=True)

            def reset_config(self, config):
                self.config = config
                return True

        trials = self.generate_trials({
            "run": B,
            "config": {
                "foo": 0
            },
        }, "grid_search")
        trial = trials[0]
        self.trial_executor.start_trial(trial)
        exists = self.trial_executor.reset_trial(trial, {"hi": 1},
                                                 "modified_mock")
        self.assertEqual(exists, True)
        self.assertEqual(trial.config.get("hi"), 1)
        self.assertEqual(trial.experiment_tag, "modified_mock")
        self.assertEqual(Trial.RUNNING, trial.status)

    def generate_trials(self, spec, name):
        suggester = BasicVariantGenerator()
        suggester.add_configurations({name: spec})
        return suggester.next_trials()

示例#29

显示文件

文件： test_ray_trial_executor.py 项目： miqdigital/ray

 def setUp(self):
     ray.init(local_mode=True)
     self.trial_executor = RayTrialExecutor()

示例#30

显示文件

 def setUp(self):
     self.trial_executor = RayTrialExecutor(queue_trials=False)
     ray.init()
     _register_all()  # Needed for flaky tests

示例#31

显示文件

文件： test_ray_trial_executor.py 项目： miqdigital/ray

 def setUp(self):
     self.trial_executor = RayTrialExecutor()
     ray.init(num_cpus=2, ignore_reinit_error=True)
     _register_all()  # Needed for flaky tests

示例#32

显示文件

def run(
        run_or_experiment: Union[str, Callable, Type],
        name: Optional[str] = None,
        metric: Optional[str] = None,
        mode: Optional[str] = None,
        stop: Union[None, Mapping, Stopper, Callable[[str, Mapping],
                                                     bool]] = None,
        time_budget_s: Union[None, int, float, datetime.timedelta] = None,
        config: Optional[Dict[str, Any]] = None,
        resources_per_trial: Union[None, Mapping[str, Union[
            float, int, Mapping]], PlacementGroupFactory] = None,
        num_samples: int = 1,
        local_dir: Optional[str] = None,
        search_alg: Optional[Union[Searcher, SearchAlgorithm]] = None,
        scheduler: Optional[TrialScheduler] = None,
        keep_checkpoints_num: Optional[int] = None,
        checkpoint_score_attr: Optional[str] = None,
        checkpoint_freq: int = 0,
        checkpoint_at_end: bool = False,
        verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS,
        progress_reporter: Optional[ProgressReporter] = None,
        log_to_file: bool = False,
        trial_name_creator: Optional[Callable[[Trial], str]] = None,
        trial_dirname_creator: Optional[Callable[[Trial], str]] = None,
        sync_config: Optional[SyncConfig] = None,
        export_formats: Optional[Sequence] = None,
        max_failures: int = 0,
        fail_fast: bool = False,
        restore: Optional[str] = None,
        server_port: Optional[int] = None,
        resume: bool = False,
        queue_trials: bool = False,
        reuse_actors: bool = False,
        trial_executor: Optional[RayTrialExecutor] = None,
        raise_on_failed_trial: bool = True,
        callbacks: Optional[Sequence[Callback]] = None,
        # Deprecated args
        loggers: Optional[Sequence[Type[Logger]]] = None,
        ray_auto_init: Optional = None,
        run_errored_only: Optional = None,
        global_checkpoint_period: Optional = None,
        with_server: Optional = None,
        upload_dir: Optional = None,
        sync_to_cloud: Optional = None,
        sync_to_driver: Optional = None,
        sync_on_checkpoint: Optional = None,
        _remote: bool = None,
) -> ExperimentAnalysis:
    """Executes training.

    When a SIGINT signal is received (e.g. through Ctrl+C), the tuning run
    will gracefully shut down and checkpoint the latest experiment state.
    Sending SIGINT again (or SIGKILL/SIGTERM instead) will skip this step.

    Examples:

    .. code-block:: python

        # Run 10 trials (each trial is one instance of a Trainable). Tune runs
        # in parallel and automatically determines concurrency.
        tune.run(trainable, num_samples=10)

        # Run 1 trial, stop when trial has reached 10 iterations
        tune.run(my_trainable, stop={"training_iteration": 10})

        # automatically retry failed trials up to 3 times
        tune.run(my_trainable, stop={"training_iteration": 10}, max_failures=3)

        # Run 1 trial, search over hyperparameters, stop after 10 iterations.
        space = {"lr": tune.uniform(0, 1), "momentum": tune.uniform(0, 1)}
        tune.run(my_trainable, config=space, stop={"training_iteration": 10})

        # Resumes training if a previous machine crashed
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume=True)

        # Rerun ONLY failed trials after an experiment is finished.
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume="ERRORED_ONLY")

    Args:
        run_or_experiment (function | class | str | :class:`Experiment`): If
            function|class|str, this is the algorithm or model to train.
            This may refer to the name of a built-on algorithm
            (e.g. RLLib's DQN or PPO), a user-defined trainable
            function or class, or the string identifier of a
            trainable function or class registered in the tune registry.
            If Experiment, then Tune will execute training based on
            Experiment.spec. If you want to pass in a Python lambda, you
            will need to first register the function:
            ``tune.register_trainable("lambda_id", lambda x: ...)``. You can
            then use ``tune.run("lambda_id")``.
        metric (str): Metric to optimize. This metric should be reported
            with `tune.report()`. If set, will be passed to the search
            algorithm and scheduler.
        mode (str): Must be one of [min, max]. Determines whether objective is
            minimizing or maximizing the metric attribute. If set, will be
            passed to the search algorithm and scheduler.
        name (str): Name of experiment.
        stop (dict | callable | :class:`Stopper`): Stopping criteria. If dict,
            the keys may be any field in the return result of 'train()',
            whichever is reached first. If function, it must take (trial_id,
            result) as arguments and return a boolean (True if trial should be
            stopped, False otherwise). This can also be a subclass of
            ``ray.tune.Stopper``, which allows users to implement
            custom experiment-wide stopping (i.e., stopping an entire Tune
            run based on some time constraint).
        time_budget_s (int|float|datetime.timedelta): Global time budget in
            seconds after which all trials are stopped. Can also be a
            ``datetime.timedelta`` object.
        config (dict): Algorithm-specific configuration for Tune variant
            generation (e.g. env, hyperparams). Defaults to empty dict.
            Custom search algorithms may ignore this.
        resources_per_trial (dict|PlacementGroupFactory): Machine resources
            to allocate per trial, e.g. ``{"cpu": 64, "gpu": 8}``.
            Note that GPUs will not be assigned unless you specify them here.
            Defaults to 1 CPU and 0 GPUs in
            ``Trainable.default_resource_request()``. This can also
            be a PlacementGroupFactory object wrapping arguments to create a
            per-trial placement group.
        num_samples (int): Number of times to sample from the
            hyperparameter space. Defaults to 1. If `grid_search` is
            provided as an argument, the grid will be repeated
            `num_samples` of times. If this is -1, (virtually) infinite
            samples are generated until a stopping condition is met.
        local_dir (str): Local dir to save training results to.
            Defaults to ``~/ray_results``.
        search_alg (Searcher|SearchAlgorithm): Search algorithm for
            optimization.
        scheduler (TrialScheduler): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, HyperBand and PopulationBasedTraining. Refer to
            ray.tune.schedulers for more options.
        keep_checkpoints_num (int): Number of checkpoints to keep. A value of
            `None` keeps all checkpoints. Defaults to `None`. If set, need
            to provide `checkpoint_score_attr`.
        checkpoint_score_attr (str): Specifies by which attribute to rank the
            best checkpoint. Default is increasing order. If attribute starts
            with `min-` it will rank attribute in decreasing order, i.e.
            `min-validation_loss`.
        checkpoint_freq (int): How many training iterations between
            checkpoints. A value of 0 (default) disables checkpointing.
            This has no effect when using the Functional Training API.
        checkpoint_at_end (bool): Whether to checkpoint at the end of the
            experiment regardless of the checkpoint_freq. Default is False.
            This has no effect when using the Functional Training API.
        verbose (Union[int, Verbosity]): 0, 1, 2, or 3. Verbosity mode.
            0 = silent, 1 = only status updates, 2 = status and brief trial
            results, 3 = status and detailed trial results. Defaults to 3.
        progress_reporter (ProgressReporter): Progress reporter for reporting
            intermediate experiment progress. Defaults to CLIReporter if
            running in command-line, or JupyterNotebookReporter if running in
            a Jupyter notebook.
        log_to_file (bool|str|Sequence): Log stdout and stderr to files in
            Tune's trial directories. If this is `False` (default), no files
            are written. If `true`, outputs are written to `trialdir/stdout`
            and `trialdir/stderr`, respectively. If this is a single string,
            this is interpreted as a file relative to the trialdir, to which
            both streams are written. If this is a Sequence (e.g. a Tuple),
            it has to have length 2 and the elements indicate the files to
            which stdout and stderr are written, respectively.
        trial_name_creator (Callable[[Trial], str]): Optional function
            for generating the trial string representation.
        trial_dirname_creator (Callable[[Trial], str]): Function
            for generating the trial dirname. This function should take
            in a Trial object and return a string representing the
            name of the directory. The return value cannot be a path.
        sync_config (SyncConfig): Configuration object for syncing. See
            tune.SyncConfig.
        export_formats (list): List of formats that exported at the end of
            the experiment. Default is None.
        max_failures (int): Try to recover a trial at least this many times.
            Ray will recover from the latest checkpoint if present.
            Setting to -1 will lead to infinite recovery retries.
            Setting to 0 will disable retries. Defaults to 0.
        fail_fast (bool | str): Whether to fail upon the first error.
            If fail_fast='raise' provided, Tune will automatically
            raise the exception received by the Trainable. fail_fast='raise'
            can easily leak resources and should be used with caution (it
            is best used with `ray.init(local_mode=True)`).
        restore (str): Path to checkpoint. Only makes sense to set if
            running 1 trial. Defaults to None.
        server_port (int): Port number for launching TuneServer.
        resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", "ERRORED_ONLY",
            or bool. LOCAL/True restores the checkpoint from the
            local_checkpoint_dir, determined
            by `name` and `local_dir`. REMOTE restores the checkpoint
            from remote_checkpoint_dir. PROMPT provides CLI feedback.
            False forces a new experiment. ERRORED_ONLY resets and reruns
            ERRORED trials upon resume - previous trial artifacts will
            be left untouched.  If resume is set but checkpoint does not exist,
            ValueError will be thrown.
        queue_trials (bool): Whether to queue trials when the cluster does
            not currently have enough resources to launch one. This should
            be set to True when running on an autoscaling cluster to enable
            automatic scale-up.
        reuse_actors (bool): Whether to reuse actors between different trials
            when possible. This can drastically speed up experiments that start
            and stop actors often (e.g., PBT in time-multiplexing mode). This
            requires trials to have the same resource requirements.
        trial_executor (TrialExecutor): Manage the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if there exists failed
            trial (of ERROR state) when the experiments complete.
        callbacks (list): List of callbacks that will be called at different
            times in the training loop. Must be instances of the
            ``ray.tune.callback.Callback`` class. If not passed,
            `LoggerCallback` and `SyncerCallback` callbacks are automatically
            added.
        _remote (bool): Whether to run the Tune driver in a remote function.
            This is disabled automatically if a custom trial executor is
            passed in. This is enabled by default in Ray client mode.

    Returns:
        ExperimentAnalysis: Object for experiment analysis.

    Raises:
        TuneError: Any trials failed and `raise_on_failed_trial` is True.
    """

    if _remote is None:
        _remote = ray.util.client.ray.is_connected()

    if _remote is True and trial_executor:
        raise ValueError("cannot use custom trial executor")

    if not trial_executor or isinstance(trial_executor, RayTrialExecutor):
        _ray_auto_init()

    if _remote:
        return ray.get(
            ray.remote(num_cpus=0)(run).remote(
                run_or_experiment,
                name,
                metric,
                mode,
                stop,
                time_budget_s,
                config,
                resources_per_trial,
                num_samples,
                local_dir,
                search_alg,
                scheduler,
                keep_checkpoints_num,
                checkpoint_score_attr,
                checkpoint_freq,
                checkpoint_at_end,
                verbose,
                progress_reporter,
                log_to_file,
                trial_name_creator,
                trial_dirname_creator,
                sync_config,
                export_formats,
                max_failures,
                fail_fast,
                restore,
                server_port,
                resume,
                queue_trials,
                reuse_actors,
                trial_executor,
                raise_on_failed_trial,
                callbacks,
                # Deprecated args
                loggers,
                ray_auto_init,
                run_errored_only,
                global_checkpoint_period,
                with_server,
                upload_dir,
                sync_to_cloud,
                sync_to_driver,
                sync_on_checkpoint,
                _remote=False))

    all_start = time.time()
    if global_checkpoint_period:
        raise ValueError("global_checkpoint_period is deprecated. Set env var "
                         "'TUNE_GLOBAL_CHECKPOINT_S' instead.")
    if ray_auto_init:
        raise ValueError("ray_auto_init is deprecated. "
                         "Set env var 'TUNE_DISABLE_AUTO_INIT=1' instead or "
                         "call 'ray.init' before calling 'tune.run'.")
    if with_server:
        raise ValueError(
            "with_server is deprecated. It is now enabled by default "
            "if 'server_port' is not None.")
    if sync_on_checkpoint or sync_to_cloud or sync_to_driver or upload_dir:
        raise ValueError(
            "sync_on_checkpoint / sync_to_cloud / sync_to_driver / "
            "upload_dir must now be set via `tune.run("
            "sync_config=SyncConfig(...)`. See `ray.tune.SyncConfig` for "
            "more details.")

    if mode and mode not in ["min", "max"]:
        raise ValueError(
            "The `mode` parameter passed to `tune.run()` has to be one of "
            "['min', 'max']")

    set_verbosity(verbose)

    config = config or {}
    sync_config = sync_config or SyncConfig()
    set_sync_periods(sync_config)

    if num_samples == -1:
        num_samples = sys.maxsize

    trial_executor = trial_executor or RayTrialExecutor(
        reuse_actors=reuse_actors, queue_trials=queue_trials)
    if isinstance(run_or_experiment, list):
        experiments = run_or_experiment
    else:
        experiments = [run_or_experiment]

    for i, exp in enumerate(experiments):
        if not isinstance(exp, Experiment):
            experiments[i] = Experiment(
                name=name,
                run=exp,
                stop=stop,
                time_budget_s=time_budget_s,
                config=config,
                resources_per_trial=resources_per_trial,
                num_samples=num_samples,
                local_dir=local_dir,
                upload_dir=sync_config.upload_dir,
                sync_to_driver=sync_config.sync_to_driver,
                trial_name_creator=trial_name_creator,
                trial_dirname_creator=trial_dirname_creator,
                log_to_file=log_to_file,
                checkpoint_freq=checkpoint_freq,
                checkpoint_at_end=checkpoint_at_end,
                sync_on_checkpoint=sync_config.sync_on_checkpoint,
                keep_checkpoints_num=keep_checkpoints_num,
                checkpoint_score_attr=checkpoint_score_attr,
                export_formats=export_formats,
                max_failures=max_failures,
                restore=restore)
    else:
        logger.debug("Ignoring some parameters passed into tune.run.")

    if sync_config.sync_to_cloud:
        for exp in experiments:
            assert exp.remote_checkpoint_dir, (
                "Need `upload_dir` if `sync_to_cloud` given.")

    if fail_fast and max_failures != 0:
        raise ValueError("max_failures must be 0 if fail_fast=True.")

    if issubclass(type(search_alg), Searcher):
        search_alg = SearchGenerator(search_alg)

    if not search_alg:
        search_alg = BasicVariantGenerator()

    if config and not search_alg.set_search_properties(metric, mode, config):
        if has_unresolved_values(config):
            raise ValueError(
                "You passed a `config` parameter to `tune.run()` with "
                "unresolved parameters, but the search algorithm was already "
                "instantiated with a search space. Make sure that `config` "
                "does not contain any more parameter definitions - include "
                "them in the search algorithm's search space if necessary.")

    scheduler = scheduler or FIFOScheduler()
    if not scheduler.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the scheduler you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your scheduler or from your call to `tune.run()`")

    # Create syncer callbacks
    callbacks = create_default_callbacks(
        callbacks, sync_config, metric=metric, loggers=loggers)

    runner = TrialRunner(
        search_alg=search_alg,
        scheduler=scheduler,
        local_checkpoint_dir=experiments[0].checkpoint_dir,
        remote_checkpoint_dir=experiments[0].remote_checkpoint_dir,
        sync_to_cloud=sync_config.sync_to_cloud,
        stopper=experiments[0].stopper,
        resume=resume,
        server_port=server_port,
        fail_fast=fail_fast,
        trial_executor=trial_executor,
        callbacks=callbacks,
        metric=metric)

    if not runner.resumed:
        for exp in experiments:
            search_alg.add_configurations([exp])
    else:
        logger.info("TrialRunner resumed, ignoring new add_experiment.")

    if progress_reporter is None:
        if IS_NOTEBOOK:
            progress_reporter = JupyterNotebookReporter(
                overwrite=not has_verbosity(Verbosity.V2_TRIAL_NORM))
        else:
            progress_reporter = CLIReporter()

    if not progress_reporter.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the reporter you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your reporter or from your call to `tune.run()`")
    progress_reporter.set_total_samples(search_alg.total_samples)

    # User Warning for GPUs
    if trial_executor.has_gpus():
        if isinstance(resources_per_trial,
                      dict) and "gpu" in resources_per_trial:
            # "gpu" is manually set.
            pass
        elif _check_default_resources_override(experiments[0].run_identifier):
            # "default_resources" is manually overridden.
            pass
        else:
            logger.warning("Tune detects GPUs, but no trials are using GPUs. "
                           "To enable trials to use GPUs, set "
                           "tune.run(resources_per_trial={'gpu': 1}...) "
                           "which allows Tune to expose 1 GPU to each trial. "
                           "You can also override "
                           "`Trainable.default_resource_request` if using the "
                           "Trainable API.")

    original_handler = signal.getsignal(signal.SIGINT)
    state = {signal.SIGINT: False}

    def sigint_handler(sig, frame):
        logger.warning(
            "SIGINT received (e.g. via Ctrl+C), ending Ray Tune run. "
            "This will try to checkpoint the experiment state one last time. "
            "Press CTRL+C one more time (or send SIGINT/SIGKILL/SIGTERM) "
            "to skip. ")
        state[signal.SIGINT] = True
        # Restore original signal handler to react to future SIGINT signals
        signal.signal(signal.SIGINT, original_handler)

    if not int(os.getenv("TUNE_DISABLE_SIGINT_HANDLER", "0")):
        signal.signal(signal.SIGINT, sigint_handler)

    tune_start = time.time()
    while not runner.is_finished() and not state[signal.SIGINT]:
        runner.step()
        if has_verbosity(Verbosity.V1_EXPERIMENT):
            _report_progress(runner, progress_reporter)
    tune_taken = time.time() - tune_start

    try:
        runner.checkpoint(force=True)
    except Exception as e:
        logger.warning(f"Trial Runner checkpointing failed: {str(e)}")

    if has_verbosity(Verbosity.V1_EXPERIMENT):
        _report_progress(runner, progress_reporter, done=True)

    wait_for_sync()
    runner.cleanup_trials()

    incomplete_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            incomplete_trials += [trial]

    if incomplete_trials:
        if raise_on_failed_trial and not state[signal.SIGINT]:
            raise TuneError("Trials did not complete", incomplete_trials)
        else:
            logger.error("Trials did not complete: %s", incomplete_trials)

    all_taken = time.time() - all_start
    if has_verbosity(Verbosity.V1_EXPERIMENT):
        logger.info(f"Total run time: {all_taken:.2f} seconds "
                    f"({tune_taken:.2f} seconds for the tuning loop).")

    if state[signal.SIGINT]:
        logger.warning(
            "Experiment has been interrupted, but the most recent state was "
            "saved. You can continue running this experiment by passing "
            "`resume=True` to `tune.run()`")

    trials = runner.get_trials()
    return ExperimentAnalysis(
        runner.checkpoint_file,
        trials=trials,
        default_metric=metric,
        default_mode=mode)