Example #1
    def _update_avail_resources(self, num_retries=5):
        for i in range(num_retries):
            try:
                resources = ray.global_state.cluster_resources()
            except Exception:
                # TODO(rliaw): Remove this when local mode is fixed.
                # https://github.com/ray-project/ray/issues/4147
                logger.debug("Using resources for local machine.")
                resources = ray.services.check_and_update_resources(
                    None, None, None)
            if not resources:
                logger.warning("Cluster resources not detected. Retrying...")
                time.sleep(0.5)

        if not resources or "CPU" not in resources:
            raise TuneError("Cluster resources cannot be detected. "
                            "You can resume this experiment by passing in "
                            "`resume=True` to `run`.")

        resources = resources.copy()
        num_cpus = resources.pop("CPU")
        num_gpus = resources.pop("GPU", 0)  # default to 0 on CPU-only clusters
        custom_resources = resources

        self._avail_resources = Resources(
            int(num_cpus), int(num_gpus), custom_resources=custom_resources)
        self._last_resource_refresh = time.time()
        self._resources_initialized = True
Example #2
 def default_resource_request(cls, config):
     cf = dict(cls._default_config, **config)
     return Resources(
         cpu=1,
         gpu=len([d for d in cf["devices"] if "gpu" in d.lower()]),
         extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
         extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
Example #3
 def default_resource_request(cls, config):
     remote_worker_count = config["num_workers"] - 1
     return Resources(
         cpu=1,
         gpu=int(config["use_gpu"]),
         extra_cpu=int(remote_worker_count),
         extra_gpu=int(int(config["use_gpu"]) * remote_worker_count))
Example #4
 def default_resource_request(cls, config):
     cf = dict(cls._default_config, **config)
     return Resources(
         cpu=1,
         gpu=cf["gpu"] and 1 or 0,
         extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
         extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
Example #5
 def default_resource_request(cls, config):
     cf = merge_dicts(cls._default_config, config)
     return Resources(
         cpu=1,
         gpu=0,
         extra_cpu=cf["num_workers"],
         extra_gpu=cf["use_gpu_for_workers"] and cf["num_workers"] or 0)
Example #6
    def testCheckpointing(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner()
        kwargs = {
            "stopping_criterion": {"training_iteration": 1},
            "resources": Resources(cpu=1, gpu=1),
        }
        runner.add_trial(Trial("CartPole-v0", "__fake", **kwargs))
        trials = runner.get_trials()

        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(ray.get(trials[0].agent.set_info.remote(1)), 1)

        path = trials[0].checkpoint()
        kwargs["restore_path"] = path

        runner.add_trial(Trial("CartPole-v0", "__fake", **kwargs))
        trials = runner.get_trials()

        runner.step()
        self.assertEqual(trials[0].status, Trial.TERMINATED)
        self.assertEqual(trials[1].status, Trial.PENDING)

        runner.step()
        self.assertEqual(trials[0].status, Trial.TERMINATED)
        self.assertEqual(trials[1].status, Trial.RUNNING)
        self.assertEqual(ray.get(trials[1].agent.get_info.remote()), 1)
        self.addCleanup(os.remove, path)
Example #7
    def testPauseThenResume(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner()
        kwargs = {
            "stopping_criterion": {"training_iteration": 2},
            "resources": Resources(cpu=1, gpu=1),
        }
        runner.add_trial(Trial("CartPole-v0", "__fake", **kwargs))
        trials = runner.get_trials()

        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(ray.get(trials[0].agent.get_info.remote()), None)

        self.assertEqual(ray.get(trials[0].agent.set_info.remote(1)), 1)

        trials[0].pause()
        self.assertEqual(trials[0].status, Trial.PAUSED)

        trials[0].resume()
        self.assertEqual(trials[0].status, Trial.RUNNING)

        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(ray.get(trials[0].agent.get_info.remote()), 1)

        runner.step()
        self.assertEqual(trials[0].status, Trial.TERMINATED)
Example #8
 def default_resource_request(cls, config):
     cf = merge_dicts(cls._default_config, config)
     return Resources(
         cpu=1,
         gpu=cf["num_gpus"],
         extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
         extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
Example #9
    def testResourceScheduler(self):
        ray.init(num_cpus=4, num_gpus=1)
        runner = TrialRunner()
        kwargs = {
            "stopping_criterion": {"training_iteration": 1},
            "resources": Resources(cpu=1, gpu=1),
        }
        trials = [
            Trial("CartPole-v0", "__fake", **kwargs),
            Trial("CartPole-v0", "__fake", **kwargs)]
        for t in trials:
            runner.add_trial(t)

        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(trials[1].status, Trial.PENDING)

        runner.step()
        self.assertEqual(trials[0].status, Trial.TERMINATED)
        self.assertEqual(trials[1].status, Trial.PENDING)

        runner.step()
        self.assertEqual(trials[0].status, Trial.TERMINATED)
        self.assertEqual(trials[1].status, Trial.RUNNING)

        runner.step()
        self.assertEqual(trials[0].status, Trial.TERMINATED)
        self.assertEqual(trials[1].status, Trial.TERMINATED)
Example #10
 def __init__(self, i, config):
     self.trainable_name = "trial_{}".format(i)
     self.config = config
     self.experiment_tag = "tag"
     self.logger_running = False
     self.restored_checkpoint = None
     self.resources = Resources(1, 0)
Example #11
    def testFailureRecoveryMaxFailures(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner()
        kwargs = {
            "resources": Resources(cpu=1, gpu=1),
            "checkpoint_freq": 1,
            "max_failures": 2,
            "config": {
                "mock_error": True,
                "persistent_error": True,
            },
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(trials[0].num_failures, 1)
        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(trials[0].num_failures, 2)
        runner.step()
        self.assertEqual(trials[0].status, Trial.ERROR)
        self.assertEqual(trials[0].num_failures, 3)
Example #12
 def default_resource_request(cls, config):
     cf = dict(cls._default_config, **config)
     return Resources(
         cpu=1 + cf["optimizer_config"]["num_replay_buffer_shards"],
         gpu=cf["gpu"] and 1 or 0,
         extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
         extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
Example #13
    def _update_avail_resources(self, num_retries=5):
        for i in range(num_retries):
            try:
                resources = ray.cluster_resources()
            except Exception:
                # TODO(rliaw): Remove this when local mode is fixed.
                # https://github.com/ray-project/ray/issues/4147
                logger.debug("Using resources for local machine.")
                resources = ray.services.check_and_update_resources(
                    None, None, None)
            if not resources:
                logger.warning(
                    "Cluster resources not detected or are 0. Retrying...")
                time.sleep(0.5)

        if not resources:
            # NOTE: This hides the possibility that Ray may be waiting for
            # clients to connect.
            resources.setdefault("CPU", 0)
            resources.setdefault("GPU", 0)
            logger.warning("Cluster resources cannot be detected or are 0. "
                           "You can resume this experiment by passing in "
                           "`resume=True` to `run`.")

        resources = resources.copy()
        num_cpus = resources.pop("CPU", 0)
        num_gpus = resources.pop("GPU", 0)
        custom_resources = resources

        self._avail_resources = Resources(
            int(num_cpus), int(num_gpus), custom_resources=custom_resources)
        self._last_resource_refresh = time.time()
        self._resources_initialized = True
Example #14
    def testStepHook(self):
        ray.init(num_cpus=4, num_gpus=2)
        runner = TrialRunner(BasicVariantGenerator())

        def on_step_begin(self):
            self._update_avail_resources()
            cnt = self.pre_step if hasattr(self, 'pre_step') else 0
            setattr(self, 'pre_step', cnt + 1)

        def on_step_end(self):
            cnt = self.post_step if hasattr(self, 'post_step') else 0
            setattr(self, 'post_step', 1 + cnt)

        import types
        runner.trial_executor.on_step_begin = types.MethodType(
            on_step_begin, runner.trial_executor)
        runner.trial_executor.on_step_end = types.MethodType(
            on_step_end, runner.trial_executor)

        kwargs = {
            "stopping_criterion": {
                "training_iteration": 5
            },
            "resources": Resources(cpu=1, gpu=1),
        }
        runner.add_trial(Trial("__fake", **kwargs))
        runner.step()
        self.assertEqual(runner.trial_executor.pre_step, 1)
        self.assertEqual(runner.trial_executor.post_step, 1)
Example #15
    def testMultiStepRun(self):
        ray.init(num_cpus=4, num_gpus=2)
        runner = TrialRunner()
        kwargs = {
            "stopping_criterion": {"training_iteration": 5},
            "resources": Resources(cpu=1, gpu=1),
        }
        trials = [
            Trial("__fake", **kwargs),
            Trial("__fake", **kwargs)]
        for t in trials:
            runner.add_trial(t)

        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(trials[1].status, Trial.PENDING)

        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(trials[1].status, Trial.RUNNING)

        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(trials[1].status, Trial.RUNNING)

        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(trials[1].status, Trial.RUNNING)
Example #16
 def default_resource_request(cls, config):
     cf = merge_dicts(cls._default_config, config)
     return Resources(
         cpu=1 + cf["optimizer"]["num_replay_buffer_shards"],
         gpu=cf["gpu"] and cf["gpu_fraction"] or 0,
         extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
         extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
Example #17
    def testRestoreMetricsAfterCheckpointing(self):
        ray.init(num_cpus=1, num_gpus=1)
        runner = TrialRunner(BasicVariantGenerator())
        kwargs = {
            "resources": Resources(cpu=1, gpu=1),
        }
        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        runner.step()
        self.assertEqual(trials[0].status, Trial.RUNNING)
        self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)
        path = runner.trial_executor.save(trials[0])
        runner.trial_executor.stop_trial(trials[0])
        kwargs["restore_path"] = path

        runner.add_trial(Trial("__fake", **kwargs))
        trials = runner.get_trials()

        runner.step()
        self.assertEqual(trials[0].status, Trial.TERMINATED)
        self.assertEqual(trials[1].status, Trial.RUNNING)
        runner.step()
        self.assertEqual(trials[1].last_result["timesteps_since_restore"], 10)
        self.assertEqual(trials[1].last_result["iterations_since_restore"], 1)
        self.assertGreater(trials[1].last_result["time_since_restore"], 0)
        runner.step()
        self.assertEqual(trials[1].last_result["timesteps_since_restore"], 20)
        self.assertEqual(trials[1].last_result["iterations_since_restore"], 2)
        self.assertGreater(trials[1].last_result["time_since_restore"], 0)
        self.addCleanup(os.remove, path)
Example #18
    def default_resource_request(cls, config):
        """Returns the resource requirement for the given configuration.

        This can be overridden by subclasses to set the correct trial resource
        allocation, so the user does not need to.
        """

        return Resources(cpu=1, gpu=0)
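As the docstring notes, subclasses override this classmethod so that users do not have to size trials by hand. A minimal sketch of such an override follows; MyTrainable, the use_gpu and num_workers config keys, and the import paths are illustrative assumptions that mirror the field names used elsewhere on this page and may differ across Ray versions.

from ray.tune import Trainable
from ray.tune.trial import Resources


class MyTrainable(Trainable):
    @classmethod
    def default_resource_request(cls, config):
        # One CPU for the driver, an optional driver GPU, and one extra
        # CPU reserved for each remote worker the trainable will launch.
        return Resources(
            cpu=1,
            gpu=1 if config.get("use_gpu") else 0,
            extra_cpu=config.get("num_workers", 0),
            extra_gpu=0)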
Example #19
 def testTrialErrorOnStart(self):
     ray.init()
     _default_registry.register(TRAINABLE_CLASS, "asdf", None)
     trial = Trial("asdf", resources=Resources(1, 0))
     try:
         trial.start()
     except Exception as e:
         self.assertIn("a class", str(e))
Example #20
 def testTrialErrorOnStart(self):
     ray.init()
     trial_executor = RayTrialExecutor()
     _global_registry.register(TRAINABLE_CLASS, "asdf", None)
     trial = Trial("asdf", resources=Resources(1, 0))
     try:
         trial_executor.start_trial(trial)
     except Exception as e:
         self.assertIn("a class", str(e))
Example #21
 def _update_avail_resources(self):
     clients = ray.global_state.client_table()
     local_schedulers = [
         entry for client in clients.values() for entry in client if
         (entry['ClientType'] == 'local_scheduler' and not entry['Deleted'])
     ]
     num_cpus = sum(ls['NumCPUs'] for ls in local_schedulers)
     num_gpus = sum(ls['NumGPUs'] for ls in local_schedulers)
     self._avail_resources = Resources(int(num_cpus), int(num_gpus))
Example #22
 def default_resource_request(cls, config):
     cf = dict(cls._default_config, **config)
     Agent._validate_config(cf)
     return Resources(
         cpu=cf["num_cpus_for_driver"],
         gpu=cf["num_gpus"],
         extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"] +
         cf["num_aggregation_workers"],
         extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
Example #23
File: agent.py Project: zhy52/ray
 def default_resource_request(cls, config):
     cf = dict(cls._default_config, **config)
     Agent._validate_config(cf)
     # TODO(ekl): add custom resources here once tune supports them
     return Resources(
         cpu=cf["num_cpus_for_driver"],
         gpu=cf["num_gpus"],
         extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
         extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
Example #24
def resources_to_json(resources):
    if resources is None:
        resources = Resources(cpu=1, gpu=0)
    return {
        "cpu": resources.cpu,
        "gpu": resources.gpu,
        "extra_cpu": resources.extra_cpu,
        "extra_gpu": resources.extra_gpu,
    }
Example #25
 def default_resource_request(cls, config):
     cf = merge_dicts(cls._default_config, config)
     if cf["use_gpu_for_workers"]:
         num_gpus_per_worker = 1
     else:
         num_gpus_per_worker = 0
     return Resources(cpu=1,
                      gpu=cf["gpu"] and 1 or 0,
                      extra_cpu=cf["num_workers"],
                      extra_gpu=num_gpus_per_worker * cf["num_workers"])
Example #26
def json_to_resources(data):
    if isinstance(data, str):
        data = json.loads(data)
    for k in data:
        if k not in Resources._fields:
            raise TuneError(
                "Unknown resource type {}, must be one of {}".format(
                    k, Resources._fields))
    return Resources(data.get("cpu", 1), data.get("gpu", 0),
                     data.get("driver_cpu_limit"),
                     data.get("driver_gpu_limit"))
Example #27
    def __init__(self,
                 search_alg,
                 scheduler=None,
                 launch_web_server=False,
                 server_port=TuneServer.DEFAULT_PORT,
                 verbose=True,
                 queue_trials=False):
        """Initializes a new TrialRunner.

        Args:
            search_alg (SearchAlgorithm): SearchAlgorithm for generating
                Trial objects.
            scheduler (TrialScheduler): Defaults to FIFOScheduler.
            launch_web_server (bool): Flag for starting TuneServer.
            server_port (int): Port number for launching TuneServer.
            verbose (bool): Flag for verbosity. If False, trial results
                will not be output.
            queue_trials (bool): Whether to queue trials when the cluster does
                not currently have enough resources to launch one. This should
                be set to True when running on an autoscaling cluster to enable
                automatic scale-up.
        """
        self._search_alg = search_alg
        self._scheduler_alg = scheduler or FIFOScheduler()
        self._trials = []
        self._running = {}
        self._avail_resources = Resources(cpu=0, gpu=0)
        self._committed_resources = Resources(cpu=0, gpu=0)
        self._resources_initialized = False

        # For debugging, it may be useful to halt trials after some time has
        # elapsed. TODO(ekl) consider exposing this in the API.
        self._global_time_limit = float(
            os.environ.get("TRIALRUNNER_WALLTIME_LIMIT", float('inf')))
        self._total_time = 0
        self._server = None
        if launch_web_server:
            self._server = TuneServer(self, server_port)
        self._stop_queue = []
        self._verbose = verbose
        self._queue_trials = queue_trials
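A minimal construction sketch for this runner, mirroring the tests above; BasicVariantGenerator, the __fake trainable, and the Trial keyword arguments are taken from the other examples on this page and assume a matching Ray version.

runner = TrialRunner(BasicVariantGenerator(), queue_trials=True)
runner.add_trial(Trial("__fake", resources=Resources(cpu=1, gpu=0)))
runner.step()  # starts or queues the trial depending on available resources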
Example #28
    def __init__(self,
                 queue_trials=False,
                 reuse_actors=False,
                 refresh_period=RESOURCE_REFRESH_PERIOD):
        super(RayTrialExecutor, self).__init__(queue_trials)
        self._running = {}
        # A trial that resumes from PAUSED should not call
        # trial.train.remote() again (no new remote object ID should be
        # generated), so paused trials are tracked separately in self._paused.
        self._paused = {}
        self._reuse_actors = reuse_actors
        self._cached_actor = None

        self._avail_resources = Resources(cpu=0, gpu=0)
        self._committed_resources = Resources(cpu=0, gpu=0)
        self._resources_initialized = False
        self._refresh_period = refresh_period
        self._last_resource_refresh = float("-inf")
        self._last_nontrivial_wait = time.time()
        if ray.is_initialized():
            self._update_avail_resources()
Example #29
 def default_resource_request(cls, config):
     cf = dict(cls._default_config, **config)
     Agent._validate_config(cf)
     if cf["optimizer_class"] == "AsyncReplayOptimizer":
         extra = cf["optimizer"]["num_replay_buffer_shards"]
     else:
         extra = 0
     return Resources(
         cpu=cf["num_cpus_for_driver"],
         gpu=cf["num_gpus"],
         extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"] + extra,
         extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
Example #30
    def _update_avail_resources(self, num_retries=5):
        for i in range(num_retries):
            resources = ray.global_state.cluster_resources()
            if not resources:
                logger.warning("Cluster resources not detected. Retrying...")
                time.sleep(0.5)

        num_cpus = resources.get("CPU", 0)  # guard against missing keys
        num_gpus = resources.get("GPU", 0)

        self._avail_resources = Resources(int(num_cpus), int(num_gpus))
        self._resources_initialized = True