Example #1
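A3C agent initialization: the policy graph class is chosen based on the use_pytorch flag, a single-threaded TF session creator is set up when TensorFlow is used, a local CommonPolicyEvaluator and a set of remote evaluators (optionally reserving one GPU each) are constructed, and everything is wired into an AsyncGradientsOptimizer.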
    def _init(self):
        if self.config["use_pytorch"]:
            from ray.rllib.a3c.a3c_torch_policy import A3CTorchPolicyGraph
            self.policy_cls = A3CTorchPolicyGraph
        else:
            from ray.rllib.a3c.a3c_tf_policy import A3CPolicyGraph
            self.policy_cls = A3CPolicyGraph

        if self.config["use_pytorch"]:
            session_creator = None
        else:
            import tensorflow as tf

            def session_creator():
                return tf.Session(
                    config=tf.ConfigProto(intra_op_parallelism_threads=1,
                                          inter_op_parallelism_threads=1,
                                          gpu_options=tf.GPUOptions(
                                              allow_growth=True)))

        remote_cls = CommonPolicyEvaluator.as_remote(
            num_gpus=1 if self.config["use_gpu_for_workers"] else 0)
        self.local_evaluator = CommonPolicyEvaluator(
            self.env_creator,
            self.config["multiagent"]["policy_graphs"] or self.policy_cls,
            policy_mapping_fn=self.config["multiagent"]["policy_mapping_fn"],
            batch_steps=self.config["batch_size"],
            batch_mode="truncate_episodes",
            tf_session_creator=session_creator,
            env_config=self.config["env_config"],
            model_config=self.config["model"],
            policy_config=self.config,
            num_envs=self.config["num_envs"])
        self.remote_evaluators = [
            remote_cls.remote(
                self.env_creator,
                self.config["multiagent"]["policy_graphs"] or self.policy_cls,
                policy_mapping_fn=(
                    self.config["multiagent"]["policy_mapping_fn"]),
                batch_steps=self.config["batch_size"],
                batch_mode="truncate_episodes",
                sample_async=True,
                tf_session_creator=session_creator,
                env_config=self.config["env_config"],
                model_config=self.config["model"],
                policy_config=self.config,
                num_envs=self.config["num_envs"],
                worker_index=i + 1) for i in range(self.config["num_workers"])
        ]

        self.optimizer = AsyncGradientsOptimizer(self.config["optimizer"],
                                                 self.local_evaluator,
                                                 self.remote_evaluators)
Example #2
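A test that draws one sample batch each from a local and a remote CommonPolicyEvaluator running a mock environment, then verifies the episode count and mean reward aggregated by collect_metrics.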
    def testMetrics(self):
        ev = CommonPolicyEvaluator(
            env_creator=lambda _: MockEnv(episode_length=10),
            policy_graph=MockPolicyGraph, batch_mode="complete_episodes")
        remote_ev = CommonPolicyEvaluator.as_remote().remote(
            env_creator=lambda _: MockEnv(episode_length=10),
            policy_graph=MockPolicyGraph, batch_mode="complete_episodes")
        ev.sample()
        ray.get(remote_ev.sample.remote())
        result = collect_metrics(ev, [remote_ev])
        self.assertEqual(result.episodes_total, 20)
        self.assertEqual(result.episode_reward_mean, 10)
Example #3
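A multi-agent test on MultiCartpole: two policies ("p1" and "p2") are assigned to alternating agents, an optimizer of the given class drives training, and the loop succeeds once the mean episode reward reaches 25 * n.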
    def _testWithOptimizer(self, optimizer_cls):
        n = 3
        env = gym.make("CartPole-v0")
        act_space = env.action_space
        obs_space = env.observation_space
        dqn_config = {"gamma": 0.95, "n_step": 3}
        if optimizer_cls == SyncReplayOptimizer:
            # TODO: support replay with non-DQN graphs. Currently this can't
            # happen since the replay buffer doesn't encode extra fields like
            # "advantages" that PG uses.
            policies = {
                "p1": (DQNPolicyGraph, obs_space, act_space, dqn_config),
                "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
            }
        else:
            policies = {
                "p1": (PGPolicyGraph, obs_space, act_space, {}),
                "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
            }
        ev = CommonPolicyEvaluator(
            env_creator=lambda _: MultiCartpole(n),
            policy_graph=policies,
            policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
            batch_steps=50)
        if optimizer_cls == AsyncGradientsOptimizer:
            remote_evs = [CommonPolicyEvaluator.as_remote().remote(
                env_creator=lambda _: MultiCartpole(n),
                policy_graph=policies,
                policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
                batch_steps=50)]
        else:
            remote_evs = []
        optimizer = optimizer_cls({}, ev, remote_evs)
        for i in range(200):
            ev.foreach_policy(
                lambda p, _: p.set_epsilon(max(0.02, 1 - i * .02))
                if isinstance(p, DQNPolicyGraph) else None)
            optimizer.step()
            result = collect_metrics(ev, remote_evs)
            if i % 20 == 0:
                ev.foreach_policy(
                    lambda p, _: p.update_target()
                    if isinstance(p, DQNPolicyGraph) else None)
                print("Iter {}, rew {}".format(i, result.policy_reward_mean))
                print("Total reward", result.episode_reward_mean)
            if result.episode_reward_mean >= 25 * n:
                return
        print(result)
        raise Exception("failed to improve reward")
Example #4
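Initialization for a trainer with n-step returns and target-network updates: the sample batch size is padded to cover the n-step horizon, local and remote evaluators are built with Deepmind preprocessing and compressed observations, per-worker exploration schedules are created, and the optimizer class named in the config is instantiated with the shared optimizer settings.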
    def _init(self):
        adjusted_batch_size = (self.config["sample_batch_size"] +
                               self.config["n_step"] - 1)
        self.local_evaluator = CommonPolicyEvaluator(
            self.env_creator,
            self.config["multiagent"]["policy_graphs"] or self._policy_graph,
            policy_mapping_fn=self.config["multiagent"]["policy_mapping_fn"],
            batch_steps=adjusted_batch_size,
            batch_mode="truncate_episodes",
            preprocessor_pref="deepmind",
            compress_observations=True,
            env_config=self.config["env_config"],
            model_config=self.config["model"],
            policy_config=self.config,
            num_envs=self.config["num_envs"])
        remote_cls = CommonPolicyEvaluator.as_remote(
            num_cpus=self.config["num_cpus_per_worker"],
            num_gpus=self.config["num_gpus_per_worker"])
        self.remote_evaluators = [
            remote_cls.remote(self.env_creator,
                              self._policy_graph,
                              batch_steps=adjusted_batch_size,
                              batch_mode="truncate_episodes",
                              preprocessor_pref="deepmind",
                              compress_observations=True,
                              env_config=self.config["env_config"],
                              model_config=self.config["model"],
                              policy_config=self.config,
                              num_envs=self.config["num_envs"],
                              worker_index=i + 1)
            for i in range(self.config["num_workers"])
        ]

        self.exploration0 = self._make_exploration_schedule(0)
        self.explorations = [
            self._make_exploration_schedule(i)
            for i in range(self.config["num_workers"])
        ]

        for k in OPTIMIZER_SHARED_CONFIGS:
            if k not in self.config["optimizer_config"]:
                self.config["optimizer_config"][k] = self.config[k]

        self.optimizer = getattr(optimizers, self.config["optimizer_class"])(
            self.config["optimizer_config"], self.local_evaluator,
            self.remote_evaluators)

        self.last_target_update_ts = 0
        self.num_target_updates = 0
Example #5
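A variant of Example #1 in which the policy class is resolved via get_policy_cls, the evaluators receive an explicit registry, and training is driven by an AsyncOptimizer.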
    def _init(self):
        self.policy_cls = get_policy_cls(self.config)

        if self.config["use_pytorch"]:
            session_creator = None
        else:
            import tensorflow as tf

            def session_creator():
                return tf.Session(
                    config=tf.ConfigProto(intra_op_parallelism_threads=1,
                                          inter_op_parallelism_threads=1,
                                          gpu_options=tf.GPUOptions(
                                              allow_growth=True)))

        remote_cls = CommonPolicyEvaluator.as_remote(
            num_gpus=1 if self.config["use_gpu_for_workers"] else 0)
        self.local_evaluator = CommonPolicyEvaluator(
            self.env_creator,
            self.policy_cls,
            batch_steps=self.config["batch_size"],
            batch_mode="truncate_episodes",
            tf_session_creator=session_creator,
            registry=self.registry,
            env_config=self.config["env_config"],
            model_config=self.config["model"],
            policy_config=self.config,
            num_envs=self.config["num_envs"])
        self.remote_evaluators = [
            remote_cls.remote(self.env_creator,
                              self.policy_cls,
                              batch_steps=self.config["batch_size"],
                              batch_mode="truncate_episodes",
                              sample_async=True,
                              tf_session_creator=session_creator,
                              registry=self.registry,
                              env_config=self.config["env_config"],
                              model_config=self.config["model"],
                              policy_config=self.config,
                              num_envs=self.config["num_envs"])
            for _ in range(self.config["num_workers"])
        ]

        self.optimizer = AsyncOptimizer(self.config["optimizer"],
                                        self.local_evaluator,
                                        self.remote_evaluators)
Example #6
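Initialization for a trainer that collects complete episodes with observation filtering and optimizes with LocalMultiGPUOptimizer using the SGD batch size, step size, and iteration count from the config; a tf.train.Saver is created on the local evaluator's graph for checkpointing.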
    def _init(self):
        def session_creator():
            return tf.Session(config=tf.ConfigProto(
                **self.config["tf_session_args"]))

        self.local_evaluator = CommonPolicyEvaluator(
            self.env_creator,
            self._default_policy_graph,
            tf_session_creator=session_creator,
            batch_mode="complete_episodes",
            observation_filter=self.config["observation_filter"],
            env_config=self.config["env_config"],
            model_config=self.config["model"],
            policy_config=self.config)
        RemoteEvaluator = CommonPolicyEvaluator.as_remote(
            num_cpus=self.config["num_cpus_per_worker"],
            num_gpus=self.config["num_gpus_per_worker"])
        self.remote_evaluators = [
            RemoteEvaluator.remote(
                self.env_creator,
                self._default_policy_graph,
                batch_mode="complete_episodes",
                observation_filter=self.config["observation_filter"],
                env_config=self.config["env_config"],
                model_config=self.config["model"],
                policy_config=self.config)
            for _ in range(self.config["num_workers"])
        ]

        self.optimizer = LocalMultiGPUOptimizer(
            {
                "sgd_batch_size": self.config["sgd_batchsize"],
                "sgd_stepsize": self.config["sgd_stepsize"],
                "num_sgd_iter": self.config["num_sgd_iter"],
                "timesteps_per_batch": self.config["timesteps_per_batch"]
            }, self.local_evaluator, self.remote_evaluators)

        # TODO(rliaw): Push into Policy Graph
        with self.local_evaluator.tf_sess.graph.as_default():
            self.saver = tf.train.Saver()