def test_dist_info_sym_include_action(self, obs_dim, action_dim,
                                      hidden_dim):
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    obs_ph = tf.compat.v1.placeholder(
        tf.float32, shape=(None, None, env.observation_space.flat_dim))

    with mock.patch(('garage.tf.policies.'
                     'gaussian_gru_policy.GaussianGRUModel'),
                    new=SimpleGaussianGRUModel):
        policy = GaussianGRUPolicy(env_spec=env.spec,
                                   state_include_action=True)

    policy.reset()
    obs = env.reset()

    dist_sym = policy.dist_info_sym(
        obs_var=obs_ph,
        state_info_vars={'prev_action': np.zeros((2, 1) + action_dim)},
        name='p2_sym')
    dist = self.sess.run(
        dist_sym, feed_dict={obs_ph: [[obs.flatten()], [obs.flatten()]]})
    assert np.array_equal(dist['mean'], np.full((2, 1) + action_dim, 0.5))
    assert np.array_equal(dist['log_std'],
                          np.full((2, 1) + action_dim, 0.5))

def test_get_action_state_include_action(self, obs_dim, action_dim,
                                         hidden_dim):
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    obs_var = tf.compat.v1.placeholder(
        tf.float32,
        shape=[
            None, None,
            env.observation_space.flat_dim + np.prod(action_dim)
        ],
        name='obs')
    policy = GaussianGRUPolicy(env_spec=env.spec,
                               hidden_dim=hidden_dim,
                               state_include_action=True)
    policy.build(obs_var)

    policy.reset()
    obs = env.reset()

    action, _ = policy.get_action(obs.flatten())
    assert env.action_space.contains(action)

    policy.reset()

    actions, _ = policy.get_actions([obs.flatten()])
    for action in actions:
        assert env.action_space.contains(action)

def test_clone(self):
    env = GymEnv(DummyBoxEnv(obs_dim=(4, ), action_dim=(4, )))
    policy = GaussianGRUPolicy(env_spec=env.spec)
    policy_clone = policy.clone('GaussianGRUPolicyClone')
    assert policy_clone.env_spec == policy.env_spec
    for cloned_param, param in zip(policy_clone.parameters.values(),
                                   policy.parameters.values()):
        assert np.array_equal(cloned_param, param)

def test_gaussian_gru_policy(self):
    gaussian_gru_policy = GaussianGRUPolicy(env_spec=self.env,
                                            hidden_dim=1,
                                            state_include_action=False)
    gaussian_gru_policy.reset()

    obs = self.env.observation_space.high
    assert gaussian_gru_policy.get_action(obs)

def test_gaussian_gru_policy(self):
    gaussian_gru_policy = GaussianGRUPolicy(env_spec=self.env, hidden_dim=1)
    self.sess.run(tf.global_variables_initializer())
    gaussian_gru_policy.reset()

    obs = self.env.observation_space.high
    assert gaussian_gru_policy.get_action(obs)

def test_get_action_state_include_action(self, mock_normal, obs_dim,
                                         action_dim, hidden_dim):
    mock_normal.return_value = 0.5
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('garage.tf.policies.'
                     'gaussian_gru_policy.GaussianGRUModel'),
                    new=SimpleGaussianGRUModel):
        policy = GaussianGRUPolicy(env_spec=env.spec,
                                   state_include_action=True)

    policy.reset()
    obs = env.reset()

    expected_action = np.full(action_dim, 0.5 * np.exp(0.5) + 0.5)

    action, agent_info = policy.get_action(obs)
    assert env.action_space.contains(action)
    assert np.allclose(action, expected_action)

    expected_mean = np.full(action_dim, 0.5)
    assert np.array_equal(agent_info['mean'], expected_mean)
    expected_log_std = np.full(action_dim, 0.5)
    assert np.array_equal(agent_info['log_std'], expected_log_std)
    expected_prev_action = np.full(action_dim, 0)
    assert np.array_equal(agent_info['prev_action'], expected_prev_action)

    policy.reset()

    actions, agent_infos = policy.get_actions([obs])
    for action, mean, log_std, prev_action in zip(
            actions, agent_infos['mean'], agent_infos['log_std'],
            agent_infos['prev_action']):
        assert env.action_space.contains(action)
        assert np.allclose(action, expected_action)
        assert np.array_equal(mean, expected_mean)
        assert np.array_equal(log_std, expected_log_std)
        assert np.array_equal(prev_action, expected_prev_action)

def rl2_ppo_halfcheetah(ctxt, seed, max_episode_length, meta_batch_size,
                        n_epochs, episode_per_task):
    """Train PPO with HalfCheetah environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_episode_length (int): Maximum length of a single episode.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        tasks = task_sampler.SetTaskSampler(
            HalfCheetahVelEnv,
            wrapper=lambda env, _: RL2Env(
                GymEnv(env, max_episode_length=max_episode_length)))

        env_spec = RL2Env(
            GymEnv(HalfCheetahVelEnv(),
                   max_episode_length=max_episode_length)).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2PPO(meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      episodes_per_trial=episode_per_task,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_optimization_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False)

        trainer.setup(algo,
                      tasks.sample(meta_batch_size),
                      sampler_cls=LocalSampler,
                      n_workers=meta_batch_size,
                      worker_class=RL2Worker,
                      worker_args=dict(
                          n_episodes_per_trial=episode_per_task))

        trainer.train(n_epochs=n_epochs,
                      batch_size=episode_per_task * max_episode_length *
                      meta_batch_size)

def test_is_pickleable(self):
    env = TfEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, )))
    with mock.patch(('garage.tf.policies.'
                     'gaussian_gru_policy.GaussianGRUModel'),
                    new=SimpleGaussianGRUModel):
        policy = GaussianGRUPolicy(env_spec=env.spec,
                                   state_include_action=False)
    env.reset()
    obs = env.reset()

    with tf.compat.v1.variable_scope('GaussianGRUPolicy/GaussianGRUModel',
                                     reuse=True):
        return_var = tf.compat.v1.get_variable('return_var')
    # assign it to all ones
    return_var.load(tf.ones_like(return_var).eval())

    output1 = self.sess.run(
        policy.model.networks['default'].mean,
        feed_dict={policy.model.input: [[obs.flatten()], [obs.flatten()]]})

    p = pickle.dumps(policy)

    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        policy_pickled = pickle.loads(p)
        # yapf: disable
        output2 = sess.run(
            policy_pickled.model.networks['default'].mean,
            feed_dict={
                policy_pickled.model.input: [[obs.flatten()],
                                             [obs.flatten()]]
            })
        assert np.array_equal(output1, output2)

def test_ppo_pendulum_gru(self):
    """Test PPO with Pendulum environment and recurrent policy."""
    with TFTrainer(snapshot_config) as trainer:
        env = normalize(
            GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
        gru_policy = GaussianGRUPolicy(env_spec=env.spec)

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )

        sampler = LocalSampler(
            agents=gru_policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)

        algo = PPO(
            env_spec=env.spec,
            policy=gru_policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )

        trainer.setup(algo, env)

        last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 80

def setup_method(self):
    super().setup_method()
    self.meta_batch_size = 10
    self.episode_per_task = 4
    self.max_episode_length = 100
    # Use a local copy so the lambda below does not capture (and pickle) self
    max_episode_length = 100
    self.tasks = task_sampler.SetTaskSampler(
        HalfCheetahDirEnv,
        wrapper=lambda env, _: RL2Env(
            normalize(GymEnv(env, max_episode_length=max_episode_length))))
    self.env_spec = RL2Env(
        normalize(
            GymEnv(HalfCheetahDirEnv(),
                   max_episode_length=max_episode_length))).spec
    self.policy = GaussianGRUPolicy(env_spec=self.env_spec,
                                    hidden_dim=64,
                                    state_include_action=False)
    self.baseline = LinearFeatureBaseline(env_spec=self.env_spec)
    self.sampler = LocalSampler(
        agents=self.policy,
        envs=self.tasks.sample(self.meta_batch_size),
        max_episode_length=self.env_spec.max_episode_length,
        is_tf_worker=True,
        n_workers=self.meta_batch_size,
        worker_class=RL2Worker)

def test_ppo_pendulum_gru(self):
    """Test PPO with Pendulum environment and recurrent policy."""
    with LocalTFRunner(snapshot_config) as runner:
        env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        gru_policy = GaussianGRUPolicy(env_spec=env.spec)

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=gru_policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )

        runner.setup(algo, env, sampler_cls=LocalSampler)

        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 80

def test_is_pickleable(self):
    env = GarageEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, )))
    policy = GaussianGRUPolicy(env_spec=env.spec,
                               state_include_action=False)
    env.reset()
    obs = env.reset()

    with tf.compat.v1.variable_scope('GaussianGRUPolicy/GaussianGRUModel',
                                     reuse=True):
        param = tf.compat.v1.get_variable(
            'dist_params/log_std_param/parameter')
    # assign it to all ones
    param.load(tf.ones_like(param).eval())

    output1 = self.sess.run(
        [policy.distribution.loc,
         policy.distribution.stddev()],
        feed_dict={policy.model.input: [[obs.flatten()], [obs.flatten()]]})

    p = pickle.dumps(policy)

    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        policy_pickled = pickle.loads(p)
        # yapf: disable
        output2 = sess.run(
            [
                policy_pickled.distribution.loc,
                policy_pickled.distribution.stddev()
            ],
            feed_dict={
                policy_pickled.model.input: [[obs.flatten()],
                                             [obs.flatten()]]
            })
        assert np.array_equal(output1, output2)

def rl2_ppo_metaworld_ml1_push(ctxt, seed, meta_batch_size, n_epochs,
                               episode_per_task):
    """Train PPO with ML1 environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~LocalRunner` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        max_episode_length = 150
        inner_max_episode_length = max_episode_length * episode_per_task

        tasks = task_sampler.SetTaskSampler(
            lambda: RL2Env(GymEnv(mwb.ML1.get_train_tasks('push-v1'))))

        env_spec = RL2Env(
            GymEnv(mwb.ML1.get_train_tasks('push-v1'),
                   max_episode_length=inner_max_episode_length)).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2PPO(meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_optimization_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      episodes_per_trial=episode_per_task)

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(
                         n_episodes_per_trial=episode_per_task))

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_episode_length *
                     meta_batch_size)

def gaussian_gru_policy(ctxt, env_id, seed):
    """Create Gaussian GRU Policy on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = GaussianGRUPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            use_trust_region=False,
            optimizer=FirstOrderOptimizer,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )

        trainer.setup(algo, env)
        trainer.train(n_epochs=5, batch_size=2048)

def rl2_ppo_halfcheetah(ctxt=None, seed=1):
    """Train PPO with HalfCheetah environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        max_path_length = 100
        meta_batch_size = 10
        n_epochs = 50
        episode_per_task = 4

        tasks = task_sampler.SetTaskSampler(lambda: RL2Env(
            env=HalfCheetahVelEnv()))

        env_spec = RL2Env(env=HalfCheetahVelEnv()).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2PPO(rl2_max_path_length=max_path_length,
                      meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      pg_loss='surrogate_clip',
                      optimizer_args=dict(
                          batch_size=32,
                          max_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      max_path_length=max_path_length * episode_per_task)

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker)

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)

def rl2_trpo_halfcheetah(ctxt, seed, max_episode_length, meta_batch_size,
                         n_epochs, episode_per_task):
    """Train TRPO with HalfCheetah environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_episode_length (int): Maximum length of a single episode.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        tasks = task_sampler.SetTaskSampler(
            HalfCheetahVelEnv,
            wrapper=lambda env, _: RL2Env(
                GymEnv(env, max_episode_length=max_episode_length)))

        env_spec = RL2Env(
            GymEnv(HalfCheetahVelEnv(),
                   max_episode_length=max_episode_length)).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2TRPO(meta_batch_size=meta_batch_size,
                       task_sampler=tasks,
                       env_spec=env_spec,
                       policy=policy,
                       baseline=baseline,
                       episodes_per_trial=episode_per_task,
                       discount=0.99,
                       max_kl_step=0.01,
                       optimizer=ConjugateGradientOptimizer,
                       optimizer_args=dict(hvp_approach=FiniteDifferenceHVP(
                           base_eps=1e-5)))

        trainer.setup(algo,
                      tasks.sample(meta_batch_size),
                      sampler_cls=LocalSampler,
                      n_workers=meta_batch_size,
                      worker_class=RL2Worker,
                      worker_args=dict(
                          n_episodes_per_trial=episode_per_task))

        trainer.train(n_epochs=n_epochs,
                      batch_size=episode_per_task * max_episode_length *
                      meta_batch_size)

def gaussian_gru_policy(ctxt, env_id, seed):
    """Create Gaussian GRU Policy on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = TfEnv(normalize(gym.make(env_id)))

        policy = GaussianGRUPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                optimizer=FirstOrderOptimizer,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    tf_optimizer_args=dict(learning_rate=1e-3),
                ),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=5, batch_size=2048)

def test_build_state_include_action(self, obs_dim, action_dim, hidden_dim):
    env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = GaussianGRUPolicy(env_spec=env.spec,
                               hidden_dim=hidden_dim,
                               state_include_action=True)
    policy.reset(do_resets=None)
    obs = env.reset()

    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, None,
                                                  policy.input_dim))
    dist_sym = policy.build(state_input, name='dist_sym').dist

    concat_obs = np.concatenate([obs.flatten(), np.zeros(action_dim)])
    output1 = self.sess.run(
        [policy.distribution.loc],
        feed_dict={policy.model.input: [[concat_obs], [concat_obs]]})
    output2 = self.sess.run(
        [dist_sym.loc],
        feed_dict={state_input: [[concat_obs], [concat_obs]]})
    assert np.array_equal(output1, output2)

def setup_method(self):
    super().setup_method()
    self.max_episode_length = 100
    self.meta_batch_size = 10
    self.episode_per_task = 4
    self.tasks = task_sampler.SetTaskSampler(
        lambda: RL2Env(env=normalize(HalfCheetahDirEnv())))
    self.env_spec = RL2Env(env=normalize(HalfCheetahDirEnv())).spec
    self.policy = GaussianGRUPolicy(env_spec=self.env_spec,
                                    hidden_dim=64,
                                    state_include_action=False)
    self.baseline = LinearFeatureBaseline(env_spec=self.env_spec)

def test_gaussian_gru_policy(self):
    gaussian_gru_policy = GaussianGRUPolicy(env_spec=self.env,
                                            hidden_dim=1,
                                            state_include_action=False)
    self.sess.run(tf.compat.v1.global_variables_initializer())
    gaussian_gru_policy.build(self.obs_var)
    gaussian_gru_policy.reset()

    obs = self.env.observation_space.high
    assert gaussian_gru_policy.get_action(obs)

def test_is_pickleable(self):
    env = GymEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, )))
    policy = GaussianGRUPolicy(env_spec=env.spec,
                               state_include_action=False)
    env.reset()
    obs = env.reset()[0]

    with tf.compat.v1.variable_scope('GaussianGRUPolicy', reuse=True):
        param = tf.compat.v1.get_variable(
            'dist_params/log_std_param/parameter')
    # assign it to all ones
    param.load(tf.ones_like(param).eval())

    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, None,
                                                  policy.input_dim))
    dist_sym = policy.build(state_input, name='dist_sym').dist
    output1 = self.sess.run(
        [dist_sym.loc, dist_sym.stddev()],
        feed_dict={state_input: [[obs.flatten()], [obs.flatten()]]})

    p = pickle.dumps(policy)

    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        policy_pickled = pickle.loads(p)
        # yapf: disable
        state_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, None,
                                                      policy.input_dim))
        dist_sym = policy_pickled.build(state_input, name='dist_sym').dist
        output2 = sess.run(
            [
                dist_sym.loc,
                dist_sym.stddev()
            ],
            feed_dict={
                state_input: [[obs.flatten()], [obs.flatten()]]
            })
        assert np.array_equal(output1, output2)

def rl2_trpo_halfcheetah(ctxt=None, seed=1):
    """Train TRPO with HalfCheetah environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        max_path_length = 100
        meta_batch_size = 10
        n_epochs = 50
        episode_per_task = 4

        tasks = task_sampler.SetTaskSampler(lambda: RL2Env(
            env=HalfCheetahVelEnv()))

        env_spec = RL2Env(env=HalfCheetahVelEnv()).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2TRPO(rl2_max_path_length=max_path_length,
                       meta_batch_size=meta_batch_size,
                       task_sampler=tasks,
                       env_spec=env_spec,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=max_path_length * episode_per_task,
                       discount=0.99,
                       max_kl_step=0.01,
                       optimizer=ConjugateGradientOptimizer,
                       optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                           base_eps=1e-5)))

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker)

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)

def run_task(self, snapshot_config, *_):
    config = tf.ConfigProto(device_count={'GPU': 0},
                            allow_soft_placement=True,
                            intra_op_parallelism_threads=12,
                            inter_op_parallelism_threads=12)
    sess = tf.Session(config=config)
    with LocalTFRunner(snapshot_config=snapshot_config,
                       sess=sess) as runner:
        env = gym.make(self._env)
        env = TfEnv(normalize(env))
        env.reset()
        policy = GaussianGRUPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                optimizer=FirstOrderOptimizer,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    tf_optimizer_args=dict(learning_rate=1e-3),
                ),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=5, batch_size=2048)

def setup_method(self):
    super().setup_method()
    self.env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
    self.policy = GaussianMLPPolicy(
        env_spec=self.env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    self.lstm_policy = GaussianLSTMPolicy(env_spec=self.env.spec)
    self.gru_policy = GaussianGRUPolicy(env_spec=self.env.spec)
    self.baseline = GaussianMLPBaseline(
        env_spec=self.env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )

def test_get_action_state_include_action(self, obs_dim, action_dim,
                                         hidden_dim):
    env = GymEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = GaussianGRUPolicy(env_spec=env.spec,
                               hidden_dim=hidden_dim,
                               state_include_action=True)
    policy.reset()
    obs = env.reset()[0]

    action, _ = policy.get_action(obs.flatten())
    assert env.action_space.contains(action)

    policy.reset()

    actions, _ = policy.get_actions([obs.flatten()])
    for action in actions:
        assert env.action_space.contains(action)

def setup_method(self):
    super().setup_method()
    self.env = normalize(
        GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
    self.policy = GaussianMLPPolicy(
        env_spec=self.env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    self.lstm_policy = GaussianLSTMPolicy(env_spec=self.env.spec)
    self.gru_policy = GaussianGRUPolicy(env_spec=self.env.spec)
    self.baseline = GaussianMLPBaseline(
        env_spec=self.env.spec,
        hidden_sizes=(32, 32),
    )

def test_rl2_ppo_ml10(self):
    # pylint: disable=import-outside-toplevel
    from metaworld.benchmarks import ML10
    ML_train_envs = [
        RL2Env(ML10.from_task(task_name))
        for task_name in ML10.get_train_tasks().all_task_names
    ]
    tasks = task_sampler.EnvPoolSampler(ML_train_envs)
    tasks.grow_pool(self.meta_batch_size)

    env_spec = ML_train_envs[0].spec
    policy = GaussianGRUPolicy(env_spec=env_spec,
                               hidden_dim=64,
                               state_include_action=False,
                               name='policy')
    baseline = LinearFeatureBaseline(env_spec=env_spec)
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        algo = RL2PPO(rl2_max_path_length=self.max_path_length,
                      meta_batch_size=self.meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      max_path_length=self.max_path_length *
                      self.episode_per_task)

        runner.setup(
            algo,
            self.tasks.sample(self.meta_batch_size),
            sampler_cls=LocalSampler,
            n_workers=self.meta_batch_size,
            worker_class=RL2Worker,
            worker_args=dict(n_paths_per_trial=self.episode_per_task))

        runner.train(n_epochs=1,
                     batch_size=self.episode_per_task *
                     self.max_path_length * self.meta_batch_size)

def test_policies(self):
    """Test the policies initialization."""
    box_env = TfEnv(DummyBoxEnv())
    discrete_env = TfEnv(DummyDiscreteEnv())
    categorical_gru_policy = CategoricalGRUPolicy(env_spec=discrete_env,
                                                  hidden_dim=1)
    categorical_lstm_policy = CategoricalLSTMPolicy(env_spec=discrete_env,
                                                    hidden_dim=1)
    categorical_mlp_policy = CategoricalMLPPolicy(env_spec=discrete_env,
                                                  hidden_sizes=(1, ))
    continuous_mlp_policy = ContinuousMLPPolicy(env_spec=box_env,
                                                hidden_sizes=(1, ))
    deterministic_mlp_policy = DeterministicMLPPolicy(env_spec=box_env,
                                                      hidden_sizes=(1, ))
    gaussian_gru_policy = GaussianGRUPolicy(env_spec=box_env, hidden_dim=1)
    gaussian_lstm_policy = GaussianLSTMPolicy(env_spec=box_env,
                                              hidden_dim=1)
    gaussian_mlp_policy = GaussianMLPPolicy(env_spec=box_env,
                                            hidden_sizes=(1, ))

def test_get_action_dict_space(self):
    env = GymEnv(DummyDictEnv(obs_space_type='box', act_space_type='box'))
    policy = GaussianGRUPolicy(env_spec=env.spec,
                               hidden_dim=4,
                               state_include_action=False)
    policy.reset(do_resets=None)
    obs = env.reset()[0]

    action, _ = policy.get_action(obs)
    assert env.action_space.contains(action)

    actions, _ = policy.get_actions([obs, obs])
    for action in actions:
        assert env.action_space.contains(action)

def test_build_state_not_include_action(self, obs_dim, action_dim,
                                        hidden_dim):
    env = GymEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = GaussianGRUPolicy(env_spec=env.spec,
                               hidden_dim=hidden_dim,
                               state_include_action=False)
    policy.reset(do_resets=None)
    obs = env.reset()[0]

    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, None,
                                                  policy.input_dim))
    dist_sym = policy.build(state_input, name='dist_sym').dist
    dist_sym2 = policy.build(state_input, name='dist_sym2').dist

    output1 = self.sess.run(
        [dist_sym.loc],
        feed_dict={state_input: [[obs.flatten()], [obs.flatten()]]})
    output2 = self.sess.run(
        [dist_sym2.loc],
        feed_dict={state_input: [[obs.flatten()], [obs.flatten()]]})
    assert np.array_equal(output1, output2)