def _init(self):
    # Choose between the PyTorch and TF policy graph implementations.
    if self.config["use_pytorch"]:
        from ray.rllib.a3c.a3c_torch_policy import A3CTorchPolicyGraph
        self.policy_cls = A3CTorchPolicyGraph
    else:
        from ray.rllib.a3c.a3c_tf_policy import A3CPolicyGraph
        self.policy_cls = A3CPolicyGraph

    if self.config["use_pytorch"]:
        session_creator = None
    else:
        import tensorflow as tf

        # Limit TF to single-threaded ops and allow GPU memory growth.
        def session_creator():
            return tf.Session(
                config=tf.ConfigProto(
                    intra_op_parallelism_threads=1,
                    inter_op_parallelism_threads=1,
                    gpu_options=tf.GPUOptions(allow_growth=True)))

    remote_cls = CommonPolicyEvaluator.as_remote(
        num_gpus=1 if self.config["use_gpu_for_workers"] else 0)
    self.local_evaluator = CommonPolicyEvaluator(
        self.env_creator,
        self.config["multiagent"]["policy_graphs"] or self.policy_cls,
        policy_mapping_fn=self.config["multiagent"]["policy_mapping_fn"],
        batch_steps=self.config["batch_size"],
        batch_mode="truncate_episodes",
        tf_session_creator=session_creator,
        env_config=self.config["env_config"],
        model_config=self.config["model"],
        policy_config=self.config,
        num_envs=self.config["num_envs"])
    self.remote_evaluators = [
        remote_cls.remote(
            self.env_creator,
            self.config["multiagent"]["policy_graphs"] or self.policy_cls,
            policy_mapping_fn=(
                self.config["multiagent"]["policy_mapping_fn"]),
            batch_steps=self.config["batch_size"],
            batch_mode="truncate_episodes",
            sample_async=True,
            tf_session_creator=session_creator,
            env_config=self.config["env_config"],
            model_config=self.config["model"],
            policy_config=self.config,
            num_envs=self.config["num_envs"],
            worker_index=i + 1)
        for i in range(self.config["num_workers"])
    ]
    self.optimizer = AsyncGradientsOptimizer(
        self.config["optimizer"], self.local_evaluator,
        self.remote_evaluators)
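# Illustrative sketch (an assumption, not part of the original source): the
# "multiagent" config entries consumed by _init above are expected to follow
# the (policy_graph_cls, obs_space, act_space, config) tuple format used in
# the optimizer test further below.
def make_example_multiagent_config(policy_graph_cls):
    import gym
    env = gym.make("CartPole-v0")
    obs_space = env.observation_space
    act_space = env.action_space
    return {
        "policy_graphs": {
            "p1": (policy_graph_cls, obs_space, act_space, {}),
            "p2": (policy_graph_cls, obs_space, act_space, {}),
        },
        # Route even-numbered agents to "p1", odd-numbered agents to "p2".
        "policy_mapping_fn": lambda agent_id: ["p1", "p2"][agent_id % 2],
    }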
def testMetrics(self):
    # Sample complete episodes of length 10 from both a local and a remote
    # evaluator, then verify that collect_metrics aggregates across the two.
    ev = CommonPolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_graph=MockPolicyGraph,
        batch_mode="complete_episodes")
    remote_ev = CommonPolicyEvaluator.as_remote().remote(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_graph=MockPolicyGraph,
        batch_mode="complete_episodes")
    ev.sample()
    ray.get(remote_ev.sample.remote())
    result = collect_metrics(ev, [remote_ev])
    self.assertEqual(result.episodes_total, 20)
    self.assertEqual(result.episode_reward_mean, 10)
def _testWithOptimizer(self, optimizer_cls):
    n = 3
    env = gym.make("CartPole-v0")
    act_space = env.action_space
    obs_space = env.observation_space
    dqn_config = {"gamma": 0.95, "n_step": 3}
    if optimizer_cls == SyncReplayOptimizer:
        # TODO: support replay with non-DQN graphs. Currently this can't
        # happen since the replay buffer doesn't encode extra fields like
        # "advantages" that PG uses.
        policies = {
            "p1": (DQNPolicyGraph, obs_space, act_space, dqn_config),
            "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
        }
    else:
        policies = {
            "p1": (PGPolicyGraph, obs_space, act_space, {}),
            "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
        }
    # Alternate agents between the two policies by agent id.
    ev = CommonPolicyEvaluator(
        env_creator=lambda _: MultiCartpole(n),
        policy_graph=policies,
        policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
        batch_steps=50)
    if optimizer_cls == AsyncGradientsOptimizer:
        remote_evs = [CommonPolicyEvaluator.as_remote().remote(
            env_creator=lambda _: MultiCartpole(n),
            policy_graph=policies,
            policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
            batch_steps=50)]
    else:
        remote_evs = []
    optimizer = optimizer_cls({}, ev, remote_evs)
    for i in range(200):
        # Anneal the DQN exploration epsilon from 1.0 down to 0.02 over
        # the first 50 iterations; PG policies are left untouched.
        ev.foreach_policy(
            lambda p, _: p.set_epsilon(max(0.02, 1 - i * .02))
            if isinstance(p, DQNPolicyGraph) else None)
        optimizer.step()
        result = collect_metrics(ev, remote_evs)
        if i % 20 == 0:
            ev.foreach_policy(
                lambda p, _: p.update_target()
                if isinstance(p, DQNPolicyGraph) else None)
            print("Iter {}, rew {}".format(i, result.policy_reward_mean))
            print("Total reward", result.episode_reward_mean)
        if result.episode_reward_mean >= 25 * n:
            return
    print(result)
    raise Exception("failed to improve reward")
def _init(self):
    # Collect n_step - 1 extra transitions per batch so that n-step
    # returns can be computed for every sampled step.
    adjusted_batch_size = (
        self.config["sample_batch_size"] + self.config["n_step"] - 1)
    self.local_evaluator = CommonPolicyEvaluator(
        self.env_creator,
        self.config["multiagent"]["policy_graphs"] or self._policy_graph,
        policy_mapping_fn=self.config["multiagent"]["policy_mapping_fn"],
        batch_steps=adjusted_batch_size,
        batch_mode="truncate_episodes",
        preprocessor_pref="deepmind",
        compress_observations=True,
        env_config=self.config["env_config"],
        model_config=self.config["model"],
        policy_config=self.config,
        num_envs=self.config["num_envs"])
    remote_cls = CommonPolicyEvaluator.as_remote(
        num_cpus=self.config["num_cpus_per_worker"],
        num_gpus=self.config["num_gpus_per_worker"])
    self.remote_evaluators = [
        remote_cls.remote(
            self.env_creator,
            self._policy_graph,
            batch_steps=adjusted_batch_size,
            batch_mode="truncate_episodes",
            preprocessor_pref="deepmind",
            compress_observations=True,
            env_config=self.config["env_config"],
            model_config=self.config["model"],
            policy_config=self.config,
            num_envs=self.config["num_envs"],
            worker_index=i + 1)
        for i in range(self.config["num_workers"])
    ]
    self.exploration0 = self._make_exploration_schedule(0)
    self.explorations = [
        self._make_exploration_schedule(i)
        for i in range(self.config["num_workers"])
    ]
    # Fill in any shared optimizer settings not explicitly overridden
    # in the optimizer config.
    for k in OPTIMIZER_SHARED_CONFIGS:
        if k not in self.config["optimizer_config"]:
            self.config["optimizer_config"][k] = self.config[k]
    self.optimizer = getattr(optimizers, self.config["optimizer_class"])(
        self.config["optimizer_config"], self.local_evaluator,
        self.remote_evaluators)
    self.last_target_update_ts = 0
    self.num_target_updates = 0
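# Quick sanity check (illustrative numbers, not defaults from the source):
# with sample_batch_size=50 and n_step=3, each evaluator collects 52 env
# steps per batch, so the final transitions still have n_step - 1 successor
# steps available when n-step returns are computed.
assert 50 + 3 - 1 == 52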
def _init(self):
    self.policy_cls = get_policy_cls(self.config)
    if self.config["use_pytorch"]:
        session_creator = None
    else:
        import tensorflow as tf

        def session_creator():
            return tf.Session(
                config=tf.ConfigProto(
                    intra_op_parallelism_threads=1,
                    inter_op_parallelism_threads=1,
                    gpu_options=tf.GPUOptions(allow_growth=True)))

    remote_cls = CommonPolicyEvaluator.as_remote(
        num_gpus=1 if self.config["use_gpu_for_workers"] else 0)
    self.local_evaluator = CommonPolicyEvaluator(
        self.env_creator,
        self.policy_cls,
        batch_steps=self.config["batch_size"],
        batch_mode="truncate_episodes",
        tf_session_creator=session_creator,
        registry=self.registry,
        env_config=self.config["env_config"],
        model_config=self.config["model"],
        policy_config=self.config,
        num_envs=self.config["num_envs"])
    self.remote_evaluators = [
        remote_cls.remote(
            self.env_creator,
            self.policy_cls,
            batch_steps=self.config["batch_size"],
            batch_mode="truncate_episodes",
            sample_async=True,
            tf_session_creator=session_creator,
            registry=self.registry,
            env_config=self.config["env_config"],
            model_config=self.config["model"],
            policy_config=self.config,
            num_envs=self.config["num_envs"])
        for _ in range(self.config["num_workers"])
    ]
    self.optimizer = AsyncOptimizer(
        self.config["optimizer"], self.local_evaluator,
        self.remote_evaluators)
def _init(self):
    def session_creator():
        return tf.Session(
            config=tf.ConfigProto(**self.config["tf_session_args"]))

    self.local_evaluator = CommonPolicyEvaluator(
        self.env_creator,
        self._default_policy_graph,
        tf_session_creator=session_creator,
        batch_mode="complete_episodes",
        observation_filter=self.config["observation_filter"],
        env_config=self.config["env_config"],
        model_config=self.config["model"],
        policy_config=self.config)
    RemoteEvaluator = CommonPolicyEvaluator.as_remote(
        num_cpus=self.config["num_cpus_per_worker"],
        num_gpus=self.config["num_gpus_per_worker"])
    self.remote_evaluators = [
        RemoteEvaluator.remote(
            self.env_creator,
            self._default_policy_graph,
            batch_mode="complete_episodes",
            observation_filter=self.config["observation_filter"],
            env_config=self.config["env_config"],
            model_config=self.config["model"],
            policy_config=self.config)
        for _ in range(self.config["num_workers"])
    ]
    self.optimizer = LocalMultiGPUOptimizer(
        {
            "sgd_batch_size": self.config["sgd_batchsize"],
            "sgd_stepsize": self.config["sgd_stepsize"],
            "num_sgd_iter": self.config["num_sgd_iter"],
            "timesteps_per_batch": self.config["timesteps_per_batch"]
        }, self.local_evaluator, self.remote_evaluators)

    # TODO(rliaw): Push into Policy Graph
    with self.local_evaluator.tf_sess.graph.as_default():
        self.saver = tf.train.Saver()
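# A plausible shape (an assumption; the real defaults live in the agent's
# config) for the "tf_session_args" dict unpacked into tf.ConfigProto by
# session_creator above, mirroring the session options used in the other
# _init methods in this file. Nested protobuf fields such as gpu_options
# accept plain dicts.
EXAMPLE_TF_SESSION_ARGS = {
    "intra_op_parallelism_threads": 1,
    "inter_op_parallelism_threads": 1,
    "gpu_options": {"allow_growth": True},
}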