def test_ppo_param_order_non_delayed_vs_delayed(self):
    """Variable ordering must match between eager setup and delayed setup."""
    num_envs = 3
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(num_envs)])
    vec_env.seed(1)
    ub_utils.set_seed(1)
    obs_space = vec_env.observation_space
    act_space = vec_env.action_space
    model = ppo_model.PPO(vec_env)
    sample_steps = 10
    batch_size = 10
    model.run(sample_steps)
    # take one minibatch and perform a single update on the eager model
    batch = next(iter(model.sampler(batch_size)))
    ub_utils.set_seed(1)
    model._train_model(batch)
    # delayed construction: no env, spaces supplied explicitly
    model2 = ppo_model.PPO(
        None,
        observation_space=obs_space,
        action_space=act_space
    )
    model2.setup()
    ub_utils.set_seed(1)
    model2._train_model(batch)
    # both models must expose trainable and optimizer variables in the same order
    self.assertVariables(model.trainable_variables, model2.trainable_variables)
    self.assertVariables(model.optimizer.variables(), model2.optimizer.variables())
def test_ppo_setup_non_image_obs(self):
    """Trainable-variable counts for vector observations, shared net or not."""
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(3)])
    # separate policy/value backbones, each a 3-layer MLP
    model = ppo_model.PPO(vec_env, mlp_units=[64, 64, 64])
    self.assertEqual(3, model.n_envs)
    self.assertTrue(model.observation_space is not None)
    self.assertTrue(model.action_space is not None)
    # mlp(3) + mlp(3) + policy + value
    self.assertEqual(6+6+3+2, len(model.trainable_variables))
    # shared backbone: a single MLP feeds both heads
    model = ppo_model.PPO(
        vec_env, share_net=True, force_mlp=False, mlp_units=[64, 64, 64]
    )
    # mlp(3) + policy + value
    self.assertEqual(6+3+2, len(model.trainable_variables))
def test_ppo_run(self):
    """Collecting rollouts fills the buffer with correctly-shaped fields."""
    num_envs = 3
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(num_envs)])
    model = ppo_model.PPO(vec_env)
    obs_shape = vec_env.observation_space.shape
    act_shape = vec_env.action_space.shape
    steps = 100
    model.run(steps)
    buf = model.buffer
    self.assertEqual(steps * num_envs, len(buf))
    self.assertTrue(buf.ready_for_sample)
    self.assertFalse(buf.isfull)
    # per-field layout is (time, env, *feature)
    self.assertArrayEqual((steps, num_envs, *obs_shape), buf.data['obs'].shape)
    self.assertArrayEqual((steps, num_envs, *act_shape), buf.data['act'].shape)
    for key in ('done', 'rew', 'val', 'logp'):
        self.assertArrayEqual((steps, num_envs), buf.data[key].shape)
def test_ppo_setup_image_obs(self):
    """Trainable-variable counts for image observations across net configs."""
    vec_env = ub_vec.VecEnv([FakeImageEnv() for _ in range(3)])
    model = ppo_model.PPO(vec_env)
    self.assertEqual(3, model.n_envs)
    self.assertTrue(model.observation_space is not None)
    self.assertTrue(model.action_space is not None)
    # nature_cnn + nature_cnn + policy + value
    self.assertEqual(8+8+2+2, len(model.trainable_variables))
    # shared CNN backbone
    model = ppo_model.PPO(vec_env, share_net=True)
    # nature_cnn + policy + value
    self.assertEqual(8+2+2, len(model.trainable_variables))
    # force an MLP even though observations are images
    model = ppo_model.PPO(
        vec_env, share_net=False, force_mlp=True, mlp_units=[64, 64, 64]
    )
    # mlp(3) + mlp(3) + policy + value
    self.assertEqual(6+6+2+2, len(model.trainable_variables))
def test_ppo_reset_spaces_conflict(self):
    """Swapping in an env with different spaces must raise RuntimeError."""
    num_envs = 4
    cont_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(num_envs)])
    model = ppo_model.PPO(cont_env)
    image_env = ub_vec.VecEnv([FakeImageEnv() for _ in range(num_envs)])
    with self.assertRaises(RuntimeError):
        # observation/action spaces conflict with the model's original env
        model.set_env(image_env)
def test_ppo_dual_clip_valu_clip(self):
    """Smoke test: training runs with value_clip and dual_clip both enabled.

    No assertions — this only checks that the clipped objectives do not
    raise during a short run/train cycle.

    NOTE(review): the 'valu' typo in the method name is kept so test
    discovery and any external references remain stable.
    """
    n_envs = 4
    envs = [FakeContinuousEnv() for _ in range(n_envs)]
    env = ub_vec.VecEnv(envs)
    model = ppo_model.PPO(env, value_clip=0.1, dual_clip=0.1)
    n_samples = 10
    batch_size = 10
    n_subepochs = 4
    # (removed an unused `exp_gradsteps` local that was computed with a
    # stray `* 1` factor and never asserted on)
    model.run(n_samples)
    model.train(batch_size, n_subepochs)
def test_ppo_delayed_setup(self):
    """A model built without an env has no spaces/agent until setup()."""
    model = ppo_model.PPO(None)
    self.assertTrue(model.observation_space is None)
    self.assertTrue(model.action_space is None)
    self.assertTrue(model.agent is None)
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(3)])
    model.set_env(vec_env)
    model.setup()
    self.assertTrue(model.observation_space is not None)
    self.assertTrue(model.action_space is not None)
    # default-network variable count after setup
    self.assertEqual(4+4+3+2, len(model.trainable_variables))
def test_ppo_save_load(self):
    """Save/load must restore config, network weights and optimizer state."""
    num_envs = 3
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(num_envs)])
    vec_env.seed(1)
    ub_utils.set_seed(1)
    model = ppo_model.PPO(vec_env)
    sample_steps = 10
    batch_size = 10
    model.run(sample_steps)
    # take one gradient step so optimizer slot variables exist before saving
    ub_utils.set_seed(2)
    batch = next(iter(model.sampler(batch_size)))
    model._train_model(batch)
    with tempfile.TemporaryDirectory() as save_path:
        # round-trip the model through disk
        model.save(save_path)
        loaded_model = ppo_model.PPO.load(save_path)
        # the loaded model comes back fully set up
        self.assertTrue(loaded_model.agent is not None)
        self.assertTrue(loaded_model.buffer is not None)
        self.assertTrue(loaded_model.optimizer is not None)
        # configs match key-by-key
        model_config = model.get_config()
        loaded_config = loaded_model.get_config()
        self.assertEqual(set(model_config.keys()), set(loaded_config.keys()))
        for key in model_config:
            self.assertEqual(model_config[key], loaded_config[key])
        # network weights restored exactly
        self.assertVariables(
            model.trainable_variables, loaded_model.trainable_variables
        )
        # run identical update sequences on both models and compare
        batches = list(model.sampler(batch_size))
        ub_utils.set_seed(1)
        for batch in batches:
            losses1, kl1 = model._train_step(batch)
        ub_utils.set_seed(1)
        for batch in batches:
            losses2, kl2 = loaded_model._train_step(batch)
        # losses/KL from the final step must agree
        self.assertEqual(set(losses1.keys()), set(losses2.keys()))
        for key in losses1.keys():
            self.assertEqual(losses1[key], losses2[key])
        self.assertAllClose(kl1, kl2)
        # weights and optimizer slots remain identical after training
        self.assertVariables(
            model.trainable_variables, loaded_model.trainable_variables
        )
        self.assertVariables(
            model.optimizer.variables(), loaded_model.optimizer.variables()
        )
def test_ppo_train(self):
    """train() performs the expected number of gradient steps and subepochs."""
    num_envs = 3
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(num_envs)])
    model = ppo_model.PPO(vec_env)
    sample_steps = 10
    batch_size = 10
    subepochs = 4
    # total samples / batch size, repeated per subepoch
    expected_gradsteps = sample_steps * num_envs * subepochs / batch_size
    model.run(sample_steps)
    model.train(batch_size, subepochs)
    self.assertEqual(expected_gradsteps, model.num_gradsteps)
    self.assertEqual(subepochs, model.num_subepochs)
def test_ppo_predict_batch(self):
    """predict() returns one action per observation in a batched input."""
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(3)])
    model = ppo_model.PPO(vec_env)
    batch_size = 3
    obs_space = vec_env.observation_space
    act_space = vec_env.action_space
    act_dim = act_space.shape[0]
    obs = np.asarray([obs_space.sample() for _ in range(batch_size)])
    # deterministic and stochastic predictions share the same output shape
    for deterministic in (True, False):
        act = model.predict(obs, det=deterministic)
        self.assertArrayEqual((batch_size, act_dim), act.shape)
def test_ppo_train_with_target_kl(self):
    """Training stops early once the KL budget (target_kl) is exceeded."""
    num_envs = 3
    target_kl = 0.1
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(num_envs)])
    vec_env.seed(0)
    ub_utils.set_seed(0)
    model = ppo_model.PPO(vec_env, target_kl=target_kl)
    sample_steps = 10
    batch_size = 10
    subepochs = 4
    # upper bound on gradient steps if no early stop occurred
    max_gradsteps = (sample_steps * num_envs * subepochs) // batch_size
    model.run(sample_steps)
    model.train(batch_size, subepochs)
    # early stopping means strictly fewer steps than the full budget
    self.assertTrue(max_gradsteps > model.num_gradsteps, model.num_gradsteps)
    self.assertTrue(subepochs > model.num_subepochs, model.num_subepochs)
def test_ppo_learn(self):
    """End-to-end learn() with evaluation, checkpointing and TB logging."""
    num_envs = 4
    n_steps = 125
    n_subepochs = 2
    n_epochs = 2
    batch_size = 50
    total_steps = num_envs * n_steps * n_epochs
    # rounded minibatches per epoch, times subepochs, times epochs
    total_gradsteps = (
        int((num_envs * n_steps) / batch_size + 0.5) * n_subepochs * n_epochs
    )
    vec_env = ub_vec.VecEnv([FakeContinuousEnv() for _ in range(num_envs)])
    eval_env = FakeContinuousEnv()
    vec_env.seed(1)
    ub_utils.set_seed(1)
    model = ppo_model.PPO(
        vec_env,
        batch_size=batch_size,
        n_steps=n_steps,
        n_subepochs=n_subepochs,
    )
    with tempfile.TemporaryDirectory() as save_path:
        model.learn(
            total_steps,
            log_interval=1,
            eval_env=eval_env,
            eval_interval=1,
            eval_episodes=1,
            eval_max_steps=10,
            save_path=save_path,
            save_interval=1,
            tb_logdir=save_path,
            reset_timesteps=True,
            verbose=3
        )
        # the final checkpoint must be loadable
        ppo_model.PPO.load(save_path)
        # counters reflect the completed run
        self.assertEqual(total_steps, model.num_timesteps)
        self.assertEqual(n_epochs, model.num_epochs)
        self.assertEqual(n_subepochs * n_epochs, model.num_subepochs)
        self.assertEqual(total_gradsteps, model.num_gradsteps)
        self.assertEqual(1.0, model.progress)
def test_ppo_gae(self):
    """Model-computed GAE matches the legacy reference implementation."""
    num_envs = 2
    gamma = 0.99
    lam = 0.95
    vec_env = ub_vec.VecEnv(
        [FakeImageEnv(max_steps=10) for _ in range(num_envs)]
    )
    vec_env.seed(1)
    ub_utils.set_seed(1)
    sample_steps = 20
    model = ppo_model.PPO(vec_env, gamma=gamma, gae_lambda=lam)
    # collect a trajectory, then compute the reference advantages from
    # the raw buffer contents
    model.collect(sample_steps)
    exp_gae = legacy_gae(
        rew = model.buffer.data['rew'],
        val = model.buffer.data['val'],
        done = model.buffer.data['done'],
        gamma = gamma,
        lam = lam
    )
    # re-seed and replay the same trajectory via run(), which
    # presumably fills buffer['adv'] — compare against the reference
    vec_env.seed(1)
    model.run(sample_steps)
    gae = model.buffer.data['adv']
    self.assertAllClose(exp_gae, gae)
def test_ppo_not_vec_env(self):
    """PPO requires a vectorized env; a bare env raises RuntimeError."""
    with self.assertRaises(RuntimeError):
        ppo_model.PPO(FakeContinuousEnv())