Example #1
 def testMultiplePasses(self):
     local, remotes = self._make_evs()
     optimizer = AsyncSamplesOptimizer(
         local, remotes, {
             "minibatch_buffer_size": 10,
             "num_sgd_iter": 10,
             "sample_batch_size": 10,
             "train_batch_size": 50,
         })
     self._wait_for(optimizer, 1000, 10000)
     self.assertLess(optimizer.stats()["num_steps_sampled"], 5000)
     self.assertGreater(optimizer.stats()["num_steps_trained"], 8000)
Example #2
 def testReplay(self):
     local, remotes = self._make_evs()
     optimizer = AsyncSamplesOptimizer(
         local, remotes, {
             "replay_buffer_num_slots": 100,
             "replay_proportion": 10,
             "sample_batch_size": 10,
             "train_batch_size": 10,
         })
     self._wait_for(optimizer, 1000, 1000)
     self.assertLess(optimizer.stats()["num_steps_sampled"], 5000)
     self.assertGreater(optimizer.stats()["num_steps_replayed"], 8000)
     self.assertGreater(optimizer.stats()["num_steps_trained"], 8000)
Example #3
File: impala.py Project: tobenomad/ray
 def _init(self, config, env_creator):
     for k in OPTIMIZER_SHARED_CONFIGS:
         if k not in config["optimizer"]:
             config["optimizer"][k] = config[k]
     policy_cls = self._get_policy_graph()
     self.local_evaluator = self.make_local_evaluator(
         env_creator, policy_cls)
     self.remote_evaluators = self.make_remote_evaluators(
         env_creator, policy_cls, config["num_workers"])
     self.optimizer = AsyncSamplesOptimizer(self.local_evaluator,
                                            self.remote_evaluators,
                                            config["optimizer"])
     if config["entropy_coeff"] < 0:
         raise DeprecationWarning("entropy_coeff must be >= 0")
Example #4
 def testReplayAndMultiplePasses(self):
     local, remotes = self._make_evs()
     optimizer = AsyncSamplesOptimizer(
         local, remotes, {
             "minibatch_buffer_size": 10,
             "num_sgd_iter": 10,
             "replay_buffer_num_slots": 100,
             "replay_proportion": 10,
             "sample_batch_size": 10,
             "train_batch_size": 10,
         })
     self._wait_for(optimizer, 1000, 1000)
     self.assertLess(optimizer.stats()["num_steps_sampled"], 5000)
     self.assertGreater(optimizer.stats()["num_steps_replayed"], 8000)
     self.assertGreater(optimizer.stats()["num_steps_trained"], 40000)
Example #5
 def _init(self):
     for k in OPTIMIZER_SHARED_CONFIGS:
         if k not in self.config["optimizer"]:
             self.config["optimizer"][k] = self.config[k]
     if self.config["vtrace"]:
         policy_cls = self._policy_graph
     else:
         policy_cls = A3CPolicyGraph
     self.local_evaluator = self.make_local_evaluator(
         self.env_creator, policy_cls)
     self.remote_evaluators = self.make_remote_evaluators(
         self.env_creator, policy_cls, self.config["num_workers"])
     self.optimizer = AsyncSamplesOptimizer(self.local_evaluator,
                                            self.remote_evaluators,
                                            self.config["optimizer"])
Example #6
 def testReplay(self):
     local, remotes = self._make_evs()
     optimizer = AsyncSamplesOptimizer(
         local, remotes, {
             "replay_buffer_num_slots": 100,
             "replay_proportion": 10,
             "sample_batch_size": 10,
             "train_batch_size": 10,
         })
     self._wait_for(optimizer, 1000, 1000)
     stats = optimizer.stats()
     self.assertLess(stats["num_steps_sampled"], 5000)
     replay_ratio = stats["num_steps_replayed"] / stats["num_steps_sampled"]
     self.assertGreater(replay_ratio, 0.7)
     self.assertLess(stats["num_steps_trained"], stats["num_steps_sampled"])
Example #7
 def testReplay(self):
     local, remotes = self._make_evs()
     workers = WorkerSet._from_existing(local, remotes)
     optimizer = AsyncSamplesOptimizer(
         workers,
         replay_buffer_num_slots=100,
         replay_proportion=10,
         sample_batch_size=10,
         train_batch_size=10,
     )
     self._wait_for(optimizer, 1000, 1000)
     stats = optimizer.stats()
     self.assertLess(stats["num_steps_sampled"], 5000)
     replay_ratio = stats["num_steps_replayed"] / stats["num_steps_sampled"]
     self.assertGreater(replay_ratio, 0.7)
     self.assertLess(stats["num_steps_trained"], stats["num_steps_sampled"])
Example #8
 def testMultiTierAggregationBadConf(self):
     local, remotes = self._make_evs()
     workers = WorkerSet._from_existing(local, remotes)
     aggregators = TreeAggregator.precreate_aggregators(4)
     optimizer = AsyncSamplesOptimizer(workers, num_aggregation_workers=4)
     self.assertRaises(ValueError,
                       lambda: optimizer.aggregator.init(aggregators))
Example #9
 def testMultiTierAggregation(self):
     local, remotes = self._make_evs()
     workers = WorkerSet._from_existing(local, remotes)
     aggregators = TreeAggregator.precreate_aggregators(1)
     optimizer = AsyncSamplesOptimizer(workers, num_aggregation_workers=1)
     optimizer.aggregator.init(aggregators)
     self._wait_for(optimizer, 1000, 1000)
Example #10
 def testMultiGPU(self):
     local, remotes = self._make_evs()
     optimizer = AsyncSamplesOptimizer(local,
                                       remotes,
                                       num_gpus=2,
                                       _fake_gpus=True)
     self._wait_for(optimizer, 1000, 1000)
Example #11
 def testMultiGPU(self):
     local, remotes = self._make_evs()
     optimizer = AsyncSamplesOptimizer(local, remotes, {
         "num_gpus": 2,
         "_fake_gpus": True
     })
     self._wait_for(optimizer, 1000, 1000)
Example #12
File: impala.py Project: tmcsantos/ray
def make_aggregators_and_optimizer(workers, config):
    if config["num_aggregation_workers"] > 0:
        # Create co-located aggregator actors first for placement pref
        aggregators = TreeAggregator.precreate_aggregators(
            config["num_aggregation_workers"])
    else:
        aggregators = None
    workers.add_workers(config["num_workers"])

    optimizer = AsyncSamplesOptimizer(
        workers,
        lr=config["lr"],
        num_gpus=config["num_gpus"],
        rollout_fragment_length=config["rollout_fragment_length"],
        train_batch_size=config["train_batch_size"],
        replay_buffer_num_slots=config["replay_buffer_num_slots"],
        replay_proportion=config["replay_proportion"],
        num_data_loader_buffers=config["num_data_loader_buffers"],
        max_sample_requests_in_flight_per_worker=config[
            "max_sample_requests_in_flight_per_worker"],
        broadcast_interval=config["broadcast_interval"],
        num_sgd_iter=config["num_sgd_iter"],
        minibatch_buffer_size=config["minibatch_buffer_size"],
        num_aggregation_workers=config["num_aggregation_workers"],
        learner_queue_size=config["learner_queue_size"],
        learner_queue_timeout=config["learner_queue_timeout"],
        **config["optimizer"])

    if aggregators:
        # Assign the pre-created aggregators to the optimizer
        optimizer.aggregator.init(aggregators)
    return optimizer
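For context, a hypothetical driver loop around the function above might look like the sketch below. The names `workers` (a pre-built WorkerSet) and `config` (an IMPALA config dict containing the keys referenced above), as well as the 100,000-step budget, are illustrative assumptions rather than anything taken from this listing; only `step()`, `num_steps_sampled`, and `stats()` are attributes the other examples actually exercise.

# Hypothetical wiring: `workers` is an existing WorkerSet and `config`
# an IMPALA config dict carrying the keys referenced above.
optimizer = make_aggregators_and_optimizer(workers, config)

# Pull sample batches and apply SGD until enough env steps have been
# sampled, mirroring the _train() loops shown in the later examples.
while optimizer.num_steps_sampled < 100000:
    optimizer.step()
print(optimizer.stats())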
Example #13
 def testMultiTierAggregationBadConf(self):
     local, remotes = self._make_evs()
     aggregators = TreeAggregator.precreate_aggregators(4)
     optimizer = AsyncSamplesOptimizer(local, remotes,
                                       {"num_aggregation_workers": 4})
     self.assertRaises(ValueError,
                       lambda: optimizer.aggregator.init(aggregators))
Example #14
    def test_replay_and_multiple_passes(self):
        local, remotes = self._make_envs()
        workers = WorkerSet._from_existing(local, remotes)
        optimizer = AsyncSamplesOptimizer(workers,
                                          minibatch_buffer_size=10,
                                          num_sgd_iter=10,
                                          replay_buffer_num_slots=100,
                                          replay_proportion=10,
                                          rollout_fragment_length=10,
                                          train_batch_size=10)
        self._wait_for(optimizer, 1000, 1000)

        stats = optimizer.stats()
        print(stats)
        self.assertLess(stats["num_steps_sampled"], 5000)
        replay_ratio = stats["num_steps_replayed"] / stats["num_steps_sampled"]
        self.assertGreater(replay_ratio, 0.7)
Example #15
 def testMultiTierAggregation(self):
     local, remotes = self._make_evs()
     aggregators = TreeAggregator.precreate_aggregators(1)
     optimizer = AsyncSamplesOptimizer(local, remotes, {
         "num_aggregation_workers": 1,
     })
     optimizer.aggregator.init(aggregators)
     self._wait_for(optimizer, 1000, 1000)
Example #16
 def testMultiGPUParallelLoad(self):
     local, remotes = self._make_evs()
     workers = WorkerSet._from_existing(local, remotes)
     optimizer = AsyncSamplesOptimizer(workers,
                                       num_gpus=2,
                                       num_data_loader_buffers=2,
                                       _fake_gpus=True)
     self._wait_for(optimizer, 1000, 1000)
Example #17
 def testMultiGPUParallelLoad(self):
     local, remotes = self._make_evs()
     optimizer = AsyncSamplesOptimizer(local,
                                       remotes,
                                       num_gpus=2,
                                       num_data_loader_buffers=2,
                                       _fake_gpus=True)
     self._wait_for(optimizer, 1000, 1000)
Example #18
 def testLearnerQueueTimeout(self):
     local, remotes = self._make_envs()
     workers = WorkerSet._from_existing(local, remotes)
     optimizer = AsyncSamplesOptimizer(workers,
                                       sample_batch_size=1000,
                                       train_batch_size=1000,
                                       learner_queue_timeout=1)
     self.assertRaises(AssertionError,
                       lambda: self._wait_for(optimizer, 1000, 1000))
Example #19
class ImpalaAgent(Agent):
    """IMPALA implementation using DeepMind's V-trace."""

    _agent_name = "IMPALA"
    _default_config = DEFAULT_CONFIG
    _policy_graph = VTracePolicyGraph

    @classmethod
    def default_resource_request(cls, config):
        cf = dict(cls._default_config, **config)
        return Resources(
            cpu=1,
            gpu=cf["num_gpus"] and cf["num_gpus"] * cf["gpu_fraction"] or 0,
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    def _init(self):
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in self.config["optimizer"]:
                self.config["optimizer"][k] = self.config[k]
        if self.config["vtrace"]:
            policy_cls = self._policy_graph
        else:
            policy_cls = A3CPolicyGraph
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, policy_cls, self.config["num_workers"],
            {"num_cpus": 1})
        self.optimizer = AsyncSamplesOptimizer(self.local_evaluator,
                                               self.remote_evaluators,
                                               self.config["optimizer"])

    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        start = time.time()
        self.optimizer.step()
        while time.time() - start < self.config["min_iter_time_s"]:
            self.optimizer.step()
        result = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result
Example #20
    def testReplayAndMultiplePasses(self):
        local, remotes = self._make_evs()
        optimizer = AsyncSamplesOptimizer(local,
                                          remotes,
                                          minibatch_buffer_size=10,
                                          num_sgd_iter=10,
                                          replay_buffer_num_slots=100,
                                          replay_proportion=10,
                                          sample_batch_size=10,
                                          train_batch_size=10)
        self._wait_for(optimizer, 1000, 1000)

        stats = optimizer.stats()
        print(stats)
        self.assertLess(stats["num_steps_sampled"], 5000)
        replay_ratio = stats["num_steps_replayed"] / stats["num_steps_sampled"]
        train_ratio = stats["num_steps_sampled"] / stats["num_steps_trained"]
        self.assertGreater(replay_ratio, 0.7)
        self.assertLess(train_ratio, 0.4)
Example #21
File: impala.py Project: vladfi1/ray
class ImpalaAgent(Agent):
    """IMPALA implementation using DeepMind's V-trace."""

    _agent_name = "IMPALA"
    _default_config = DEFAULT_CONFIG
    _policy_graph = VTracePolicyGraph

    @override(Agent)
    def _init(self):
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in self.config["optimizer"]:
                self.config["optimizer"][k] = self.config[k]
        policy_cls = self._get_policy_graph()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, policy_cls, self.config["num_workers"])
        self.optimizer = AsyncSamplesOptimizer(self.local_evaluator,
                                               self.remote_evaluators,
                                               self.config["optimizer"])
        if self.config["entropy_coeff"] < 0:
            raise DeprecationWarning("entropy_coeff must be >= 0")

    @override(Agent)
    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        start = time.time()
        self.optimizer.step()
        while (time.time() - start < self.config["min_iter_time_s"]
               or self.optimizer.num_steps_sampled == prev_steps):
            self.optimizer.step()
        result = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result

    def _get_policy_graph(self):
        if self.config["vtrace"]:
            policy_cls = self._policy_graph
        else:
            policy_cls = A3CPolicyGraph
        return policy_cls
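A minimal sketch of how an agent like the one defined above might be driven. The environment name "CartPole-v0", the `env=`/`config=` constructor keywords, and the `episode_reward_mean` result key are assumptions for illustration; none of them appear in this listing.

import ray

# Hypothetical driver for the ImpalaAgent class shown above; env name,
# constructor keywords and result key are illustrative assumptions.
ray.init()
agent = ImpalaAgent(env="CartPole-v0", config={"num_workers": 2})
for i in range(10):
    result = agent.train()
    print(i, result["episode_reward_mean"])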
Example #22
 def _init(self):
     for k in OPTIMIZER_SHARED_CONFIGS:
         if k not in self.config["optimizer"]:
             self.config["optimizer"][k] = self.config[k]
     policy_cls = self._get_policy_graph()
     self.local_evaluator = self.make_local_evaluator(
         self.env_creator, policy_cls)
     self.remote_evaluators = self.make_remote_evaluators(
         self.env_creator, policy_cls, self.config["num_workers"])
     self.optimizer = AsyncSamplesOptimizer(self.local_evaluator,
                                            self.remote_evaluators,
                                            self.config["optimizer"])
Example #23
class ImpalaAgent(Agent):
    """IMPALA implementation using DeepMind's V-trace."""

    _agent_name = "IMPALA"
    _default_config = DEFAULT_CONFIG
    _policy_graph = VTracePolicyGraph

    @override(Agent)
    def _init(self):
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in self.config["optimizer"]:
                self.config["optimizer"][k] = self.config[k]
        policy_cls = self._get_policy_graph()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, policy_cls, self.config["num_workers"])
        self.optimizer = AsyncSamplesOptimizer(self.local_evaluator,
                                               self.remote_evaluators,
                                               self.config["optimizer"])

    @override(Agent)
    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        start = time.time()
        self.optimizer.step()
        while time.time() - start < self.config["min_iter_time_s"]:
            self.optimizer.step()
        result = self.optimizer.collect_metrics(
            self.config["collect_metrics_timeout"])
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result

    def _get_policy_graph(self):
        if self.config["vtrace"]:
            policy_cls = self._policy_graph
        else:
            policy_cls = A3CPolicyGraph
        return policy_cls
Example #24
    def _init(self, config, env_creator):
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in config["optimizer"]:
                config["optimizer"][k] = config[k]
        policy_cls = self._get_policy_graph()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)

        if self.config["num_aggregation_workers"] > 0:
            # Create co-located aggregator actors first for placement pref
            aggregators = TreeAggregator.precreate_aggregators(
                self.config["num_aggregation_workers"])

        self.remote_evaluators = self.make_remote_evaluators(
            env_creator, policy_cls, config["num_workers"])
        self.optimizer = AsyncSamplesOptimizer(self.local_evaluator,
                                               self.remote_evaluators,
                                               config["optimizer"])
        if config["entropy_coeff"] < 0:
            raise DeprecationWarning("entropy_coeff must be >= 0")

        if self.config["num_aggregation_workers"] > 0:
            # Assign the pre-created aggregators to the optimizer
            self.optimizer.aggregator.init(aggregators)
Example #25
class ImpalaTrainer(Trainer):
    """IMPALA implementation using DeepMind's V-trace."""

    _name = "IMPALA"
    _default_config = DEFAULT_CONFIG
    _policy_graph = VTracePolicyGraph

    @override(Trainer)
    def _init(self, config, env_creator):
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in config["optimizer"]:
                config["optimizer"][k] = config[k]
        policy_cls = self._get_policy_graph()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)

        if self.config["num_aggregation_workers"] > 0:
            # Create co-located aggregator actors first for placement pref
            aggregators = TreeAggregator.precreate_aggregators(
                self.config["num_aggregation_workers"])

        self.remote_evaluators = self.make_remote_evaluators(
            env_creator, policy_cls, config["num_workers"])
        self.optimizer = AsyncSamplesOptimizer(self.local_evaluator,
                                               self.remote_evaluators,
                                               **config["optimizer"])
        if config["entropy_coeff"] < 0:
            raise DeprecationWarning("entropy_coeff must be >= 0")

        if self.config["num_aggregation_workers"] > 0:
            # Assign the pre-created aggregators to the optimizer
            self.optimizer.aggregator.init(aggregators)

    @classmethod
    @override(Trainable)
    def default_resource_request(cls, config):
        cf = dict(cls._default_config, **config)
        Trainer._validate_config(cf)
        return Resources(
            cpu=cf["num_cpus_for_driver"],
            gpu=cf["num_gpus"],
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"] +
            cf["num_aggregation_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    @override(Trainer)
    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        start = time.time()
        self.optimizer.step()
        while (time.time() - start < self.config["min_iter_time_s"]
               or self.optimizer.num_steps_sampled == prev_steps):
            self.optimizer.step()
        result = self.collect_metrics()
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result

    def _get_policy_graph(self):
        if self.config["vtrace"]:
            policy_cls = self._policy_graph
        else:
            policy_cls = A3CPolicyGraph
        return policy_cls

    # MURFETD
    def reset_config(self, new_config):
        config = copy.deepcopy(DEFAULT_CONFIG)
        config.update(new_config)
        self.config = config

        # see LearningRateSchedule.__init__(self, self.config["lr"],self.config["lr_schedule"])
        # in vtrace_policy_graph.py
        # see policy_evaluator.py

        ev = self.optimizer.local_evaluator
        p = ev.policy_map[DEFAULT_POLICY_ID]
        p.lr_schedule = ConstantSchedule(self.config["lr"])
        p.cur_lr.load(self.config["lr"], session=ev.tf_sess)

        return True

    @override(Trainer)
    def _try_recover(self):
        """Try to identify and blacklist any unhealthy workers.

        This method is called after an unexpected remote error is encountered
        from a worker. It issues check requests to all current workers and
        blacklists any that respond with error. If no healthy workers remain,
        an error is raised.
    
        MURFETD: some changes from Ray-0.7.0-dev2
        """

        if not self._has_policy_optimizer():
            raise NotImplementedError(
                "Recovery is not supported for this algorithm")

        logger.info("Health checking all workers...")
        checks = []
        for ev in self.optimizer.remote_evaluators:
            _, obj_id = ev.sample_with_count.remote()
            checks.append(obj_id)

        healthy_evaluators = []
        for i, obj_id in enumerate(checks):
            ev = self.optimizer.remote_evaluators[i]
            try:
                ray.get(obj_id)
                healthy_evaluators.append(ev)
                logger.info("Worker {} looks healthy".format(i + 1))
            except RayError:
                logger.exception("Blacklisting worker {}".format(i + 1))
                try:
                    ev.__ray_terminate__.remote()
                except Exception:
                    logger.exception("Error terminating unhealthy worker")

        if len(healthy_evaluators) < 1:
            raise RuntimeError(
                "Not enough healthy workers remain to continue.")

        # MURFETD (add additional new remote_evaluators)
        #num_new_evaluators = len(checks) - len(healthy_evaluators)

        #new_evaluators = self.make_remote_evaluators(
        #        self.env_creator, self._get_policy_graph(), num_new_evaluators)

        #healthy_evaluators.extend(new_evaluators)

        # MURFETD (keep our remote_evaluator list in sync with the optimizer/aggregator)
        #self.remote_evaluators = healthy_evaluators

        self.optimizer.reset(healthy_evaluators)
Example #26
class ImpalaAgent(Agent):
    """IMPALA implementation using DeepMind's V-trace."""

    _agent_name = "IMPALA"
    _default_config = DEFAULT_CONFIG
    _policy_graph = VTracePolicyGraph

    @override(Agent)
    def _init(self, config, env_creator):
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in config["optimizer"]:
                config["optimizer"][k] = config[k]
        policy_cls = self._get_policy_graph()
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)

        if self.config["num_aggregation_workers"] > 0:
            # Create co-located aggregator actors first for placement pref
            aggregators = TreeAggregator.precreate_aggregators(
                self.config["num_aggregation_workers"])

        self.remote_evaluators = self.make_remote_evaluators(
            env_creator, policy_cls, config["num_workers"])
        self.optimizer = AsyncSamplesOptimizer(self.local_evaluator,
                                               self.remote_evaluators,
                                               config["optimizer"])
        if config["entropy_coeff"] < 0:
            raise DeprecationWarning("entropy_coeff must be >= 0")

        if self.config["num_aggregation_workers"] > 0:
            # Assign the pre-created aggregators to the optimizer
            self.optimizer.aggregator.init(aggregators)

    @classmethod
    @override(Trainable)
    def default_resource_request(cls, config):
        cf = dict(cls._default_config, **config)
        Agent._validate_config(cf)
        return Resources(
            cpu=cf["num_cpus_for_driver"],
            gpu=cf["num_gpus"],
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"] +
            cf["num_aggregation_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    @override(Agent)
    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        start = time.time()
        self.optimizer.step()
        while (time.time() - start < self.config["min_iter_time_s"]
               or self.optimizer.num_steps_sampled == prev_steps):
            self.optimizer.step()
        result = self.collect_metrics()
        result.update(timesteps_this_iter=self.optimizer.num_steps_sampled -
                      prev_steps)
        return result

    def _get_policy_graph(self):
        if self.config["vtrace"]:
            policy_cls = self._policy_graph
        else:
            policy_cls = A3CPolicyGraph
        return policy_cls
Example #27
class ImpalaAgent(Agent):
    """IMPALA implementation using DeepMind's V-trace."""

    _agent_name = "IMPALA"
    _default_config = DEFAULT_CONFIG

    @classmethod
    def default_resource_request(cls, config):
        cf = dict(cls._default_config, **config)
        return Resources(
            cpu=1,
            gpu=cf["gpu"] and 1 or 0,
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])

    def _init(self):
        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in self.config["optimizer"]:
                self.config["optimizer"][k] = self.config[k]
        if self.config["vtrace"]:
            policy_cls = VTracePolicyGraph
        else:
            policy_cls = A3CPolicyGraph
        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_cls)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, policy_cls, self.config["num_workers"],
            {"num_cpus": 1})
        self.optimizer = AsyncSamplesOptimizer(self.local_evaluator,
                                               self.remote_evaluators,
                                               self.config["optimizer"])

    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
        start = time.time()
        self.optimizer.step()
        while time.time() - start < self.config["min_iter_time_s"]:
            self.optimizer.step()
        FilterManager.synchronize(self.local_evaluator.filters,
                                  self.remote_evaluators)
        result = self.optimizer.collect_metrics()
        result = result._replace(
            timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps)
        return result

    def _stop(self):
        # workaround for https://github.com/ray-project/ray/issues/1516
        for ev in self.remote_evaluators:
            ev.__ray_terminate__.remote()

    def _save(self, checkpoint_dir):
        checkpoint_path = os.path.join(checkpoint_dir,
                                       "checkpoint-{}".format(self.iteration))
        agent_state = ray.get(
            [a.save.remote() for a in self.remote_evaluators])
        extra_data = {
            "remote_state": agent_state,
            "local_state": self.local_evaluator.save()
        }
        pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb"))
        return checkpoint_path

    def _restore(self, checkpoint_path):
        extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb"))
        ray.get([
            a.restore.remote(o)
            for a, o in zip(self.remote_evaluators, extra_data["remote_state"])
        ])
        self.local_evaluator.restore(extra_data["local_state"])
Example #28
 def testSimple(self):
     local, remotes = self._make_evs()
     optimizer = AsyncSamplesOptimizer(local, remotes)
     self._wait_for(optimizer, 1000, 1000)
Example #29
 def testMultiGPU(self):
     local, remotes = self._make_evs()
     workers = WorkerSet._from_existing(local, remotes)
     optimizer = AsyncSamplesOptimizer(workers, num_gpus=1, _fake_gpus=True)
     self._wait_for(optimizer, 1000, 1000)
Example #30
 def testSimple(self):
     local, remotes = self._make_evs()
     workers = WorkerSet._from_existing(local, remotes)
     optimizer = AsyncSamplesOptimizer(workers)
     self._wait_for(optimizer, 1000, 1000)
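Most of the test snippets above call two harness helpers, `_make_evs` and `_wait_for`, that are not reproduced in this listing. Below is a minimal sketch of what `_wait_for` plausibly does, written as a free function; the name `wait_for`, the 30-second timeout, and the exact progress check are assumptions based only on how the tests call it, not the actual RLlib test-suite implementation. Only attributes exercised elsewhere in these examples (`step()`, `num_steps_sampled`, `num_steps_trained`, `stats()`) are used.

import time

def wait_for(optimizer, min_steps_sampled, min_steps_trained, timeout_s=30):
    # Keep stepping the optimizer until both progress counters pass their
    # thresholds, or fail the test after `timeout_s` seconds.
    start = time.time()
    while time.time() - start < timeout_s:
        optimizer.step()
        if (optimizer.num_steps_sampled > min_steps_sampled
                and optimizer.num_steps_trained > min_steps_trained):
            return optimizer.stats()
    raise AssertionError(
        "Timed out waiting for optimizer progress: {}".format(optimizer.stats()))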