def pong_model_free():
  """HParams for model-free PPO on Pong, using real gym environments."""
  hparams = tf.contrib.training.HParams(
      epochs_num=4,
      eval_every_epochs=2,
      num_agents=2,
      optimization_epochs=3,
      epoch_length=30,
      entropy_loss_coef=0.003,
      learning_rate=8e-05,
      optimizer="Adam",
      policy_network=feed_forward_cnn_small_categorical_fun,
      gae_lambda=0.985,
      num_eval_agents=2,
      max_gradients_norm=0.5,
      gae_gamma=0.985,
      optimization_batch_size=4,
      clipping_coef=0.2,
      value_loss_coef=1,
      save_models_every_epochs=False,
      frame_stack_size=4,
      force_beginning_resets=False,
  )
  env = gym_env.T2TGymEnv("PongNoFrameskip-v4", batch_size=2)
  env.start_new_epoch(0)
  hparams.add_hparam("env_fn", make_real_env_fn(env))
  eval_env = gym_env.T2TGymEnv("PongNoFrameskip-v4", batch_size=2)
  eval_env.start_new_epoch(0)
  hparams.add_hparam("eval_env_fn", make_real_env_fn(eval_env))
  return hparams

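# Hypothetical smoke check for the function above. HParams added via
# add_hparam are readable as attributes; make_real_env_fn is assumed to
# return a callable env factory (its exact signature is not shown in this
# listing).
hparams = pong_model_free()
assert hparams.epochs_num == 4 and hparams.learning_rate == 8e-05
assert callable(hparams.env_fn) and callable(hparams.eval_env_fn)
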
def test_generating_and_loading_preserves_rollouts(self):
  env_name = TEST_ENV_NAME
  from_env = gym_env.T2TGymEnv(env_name, batch_size=1)
  from_env.start_new_epoch(0, self.out_dir)
  self.play(from_env, n_steps=20)
  from_env.generate_data(self.out_dir)

  to_env = gym_env.T2TGymEnv(env_name, batch_size=1)
  to_env.start_new_epoch(0, self.out_dir)

  self.assertEqual(
      from_env.current_epoch_rollouts(), to_env.current_epoch_rollouts())

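# The tests in this listing call a `play` helper that is defined elsewhere
# in the test module. A minimal sketch, inferred from the step loop in
# test_generates below and from the four-value unpacking in
# init_batch_and_play; returning the env as the first element and reading a
# `batch_size` attribute off T2TGymEnv are assumptions.
def play(self, env, n_steps):
  obs = [env.reset()]
  rewards = []
  num_dones = 0
  for _ in range(n_steps):
    # No-op actions for every environment in the batch.
    step_obs, step_rewards, dones = env.step(actions=[0] * env.batch_size)
    obs.append(step_obs)
    rewards.append(step_rewards)
    for (i, done) in enumerate(dones):
      if done:
        num_dones += 1
        env.reset([i])
  return env, obs, rewards, num_dones
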
def initialize_env_specs(hparams):
  """Initializes env_specs using T2TGymEnvs."""
  if getattr(hparams, "game", None):
    game_name = gym_env.camel_case_name(hparams.game)
    env = gym_env.T2TGymEnv(
        "{}Deterministic-v4".format(game_name), batch_size=hparams.batch_size)
    env.start_new_epoch(0)
    hparams.add_hparam("env_fn", rl.make_real_env_fn(env))
    eval_env = gym_env.T2TGymEnv(
        "{}Deterministic-v4".format(game_name),
        batch_size=hparams.eval_batch_size)
    eval_env.start_new_epoch(0)
    hparams.add_hparam("eval_env_fn", rl.make_real_env_fn(eval_env))
  return hparams

def initialize_env_specs(hparams):
  """Initializes env_specs using T2TGymEnvs."""
  if getattr(hparams, "game", None):
    game_name = gym_env.camel_case_name(hparams.game)
    env = gym_env.T2TGymEnv(
        "{}Deterministic-v4".format(game_name), batch_size=hparams.num_agents)
    env.start_new_epoch(0)
    hparams.add_hparam("environment_spec", rl.standard_atari_env_spec(env))
    eval_env = gym_env.T2TGymEnv(
        "{}Deterministic-v4".format(game_name),
        batch_size=hparams.num_eval_agents)
    eval_env.start_new_epoch(0)
    hparams.add_hparam(
        "environment_eval_spec", rl.standard_atari_env_eval_spec(eval_env))
  return hparams

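# Hypothetical usage sketch for the variant above: the helper is a no-op
# unless `hparams.game` is set, and it reads `num_agents`/`num_eval_agents`
# for the train/eval batch sizes. The concrete values below are
# illustrative, not from the original source.
hparams = tf.contrib.training.HParams(game="pong", num_agents=2,
                                      num_eval_agents=1)
hparams = initialize_env_specs(hparams)
# hparams now also carries "environment_spec" and "environment_eval_spec".
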
def _define_batch_env(environment_spec, num_agents):
  """Create environments and apply all desired wrappers."""
  with tf.variable_scope("environments"):
    envs = [environment_spec.env_lambda() for _ in range(num_agents)]
    env = gym_env.T2TGymEnv(envs)
    return env

def pong_model_free():
  """HParams for model-free PPO on Pong, built on top of mfrl_base."""
  hparams = mfrl_base()
  hparams.batch_size = 2
  hparams.ppo_eval_every_epochs = 2
  hparams.ppo_epochs_num = 4
  hparams.add_hparam("ppo_optimization_epochs", 3)
  hparams.add_hparam("ppo_epoch_length", 30)
  hparams.add_hparam("ppo_learning_rate", 8e-05)
  hparams.add_hparam("ppo_optimizer", "Adam")
  hparams.add_hparam("ppo_optimization_batch_size", 4)
  hparams.add_hparam("ppo_save_models_every_epochs", 1000000)
  env = gym_env.T2TGymEnv("PongNoFrameskip-v4", batch_size=2)
  env.start_new_epoch(0)
  hparams.add_hparam("env_fn", make_real_env_fn(env))
  eval_env = gym_env.T2TGymEnv("PongNoFrameskip-v4", batch_size=2)
  eval_env.start_new_epoch(0)
  hparams.add_hparam("eval_env_fn", make_real_env_fn(eval_env))
  return hparams

def _define_batch_env(environment_spec, num_agents, xvfb=False):
  """Create environments and apply all desired wrappers."""
  with tf.variable_scope("environments"):
    envs = [
        ExternalProcessEnv(environment_spec.env_lambda, xvfb)
        for _ in range(num_agents)
    ]
    env = gym_env.T2TGymEnv(envs)
    env = py_func_batch_env.PyFuncBatchEnv(env)
    return env

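# Hypothetical usage sketch for _define_batch_env: any object exposing an
# `env_lambda` attribute works as `environment_spec`. The namedtuple and
# the Pong factory below are illustrative stand-ins, not the spec type the
# original code uses.
import collections
import gym

EnvSpec = collections.namedtuple("EnvSpec", ["env_lambda"])
spec = EnvSpec(env_lambda=lambda: gym.make("PongNoFrameskip-v4"))
batch_env = _define_batch_env(spec, num_agents=2, xvfb=False)
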
def init_batch_and_play(self, env_lambda, n_steps=1, **kwargs):
  """Builds a batch of two raw envs and plays n_steps with no-op actions."""
  raw_envs = [env_lambda(), env_lambda()]
  env = gym_env.T2TGymEnv(raw_envs, **kwargs)
  obs = list()
  rewards = list()
  obs.append(env.reset())
  for _ in range(n_steps):
    step_obs, step_rewards, dones = env.step(actions=[0, 0])
    obs.append(step_obs)
    rewards.append(step_rewards)
    for (i, done) in enumerate(dones):
      if done:
        env.reset([i])
  return env, obs, rewards

def test_generates(self):
  env = gym_env.T2TGymEnv([TestEnv(), TestEnv()])
  env.reset()
  for _ in range(20):
    (_, _, dones) = env.step([0, 0])
    for (i, done) in enumerate(dones):
      if done:
        env.reset([i])
  env.generate_data(self.out_dir, tmp_dir=None)

  filenames = os.listdir(self.out_dir)
  self.assertTrue(filenames)
  path = os.path.join(self.out_dir, filenames[0])
  records = list(tf.python_io.tf_record_iterator(path))
  self.assertTrue(records)

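# Hypothetical sketch of the TestEnv referenced above; the real class is
# defined elsewhere in the test module. Any small gym.Env with discrete
# actions and image-like observations would satisfy the calls made here;
# the observation shape and episode length below are made up.
import gym
import numpy as np
from gym import spaces

class TestEnv(gym.Env):
  action_space = spaces.Discrete(2)
  observation_space = spaces.Box(
      low=0, high=255, shape=(4, 4, 3), dtype=np.uint8)

  def _obs(self):
    return np.zeros(self.observation_space.shape, dtype=np.uint8)

  def reset(self):
    self._step_count = 0
    return self._obs()

  def step(self, action):
    self._step_count += 1
    # Short episodes so the `done` branch in the tests gets exercised.
    done = self._step_count >= 5
    return self._obs(), 0.0, done, {}
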
def init_batch_and_play(self, env_name, steps_per_epoch=1, epochs=(0,),
                        generate_data=False, **kwargs):
  """Plays through the given epochs in a two-env batch, collecting stats."""
  env = gym_env.T2TGymEnv(env_name, batch_size=2, **kwargs)
  obs = list()
  rewards = list()
  num_dones = 0
  for epoch in epochs:
    env.start_new_epoch(epoch, self.out_dir)
    _, epoch_obs, epoch_rewards, epoch_num_dones = self.play(
        env, steps_per_epoch)
    if generate_data:
      env.generate_data(self.out_dir)
    obs.extend(epoch_obs)
    rewards.extend(epoch_rewards)
    num_dones += epoch_num_dones
  return env, obs, rewards, num_dones

def test_shards_per_epoch(self):
  def num_ending_with(filenames, suffix):
    return sum(
        1 for filename in filenames if filename.endswith(suffix)
    )

  env = gym_env.T2TGymEnv(TEST_ENV_NAME, batch_size=2)
  env.start_new_epoch(0, self.out_dir)
  self.play(env, n_steps=20)
  env.generate_data(self.out_dir)
  filenames = os.listdir(self.out_dir)
  num_shards_per_epoch = len(filenames)
  self.assertEqual(num_ending_with(filenames, ".0"), num_shards_per_epoch)

  env.start_new_epoch(1, self.out_dir)
  self.play(env, n_steps=20)
  env.generate_data(self.out_dir)
  filenames = os.listdir(self.out_dir)
  self.assertEqual(len(filenames), 2 * num_shards_per_epoch)
  for suffix in (".0", ".1"):
    self.assertEqual(
        num_ending_with(filenames, suffix), num_shards_per_epoch)

def init_batch_and_play(self, env_lambda, n_steps=1, **kwargs):
  """Builds a batch of two raw envs and delegates stepping to play()."""
  raw_envs = [env_lambda(), env_lambda()]
  env = gym_env.T2TGymEnv(raw_envs, **kwargs)
  env.start_new_epoch(0)
  return self.play(env, n_steps)

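# Hypothetical call site (inside the test class) for the variant above,
# reusing the TestEnv sketched earlier and assuming the four-value `play`
# sketch: two raw environments, five steps of no-op actions.
env, obs, rewards, num_dones = self.init_batch_and_play(TestEnv, n_steps=5)
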