class TestPPO:
    """Test class for PPO."""

    def setup_method(self):
        """Setup method which is called before every test."""
        self.env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        self.policy = GaussianMLPPolicy(
            env_spec=self.env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=torch.tanh,
            output_nonlinearity=None,
        )
        self.baseline = LinearFeatureBaseline(env_spec=self.env.spec)

    def teardown_method(self):
        """Teardown method which is called after every test."""
        self.env.close()

    def test_ppo_pendulum(self):
        """Test PPO with Pendulum environment."""
        deterministic.set_seed(0)
        runner = LocalRunner(snapshot_config)
        algo = PPO(env_spec=self.env.spec,
                   policy=self.policy,
                   baseline=self.baseline,
                   optimizer=torch.optim.Adam,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.97,
                   lr_clip_range=2e-1,
                   policy_lr=3e-4)
        runner.setup(algo, self.env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 0
class TestTRPO:
    """Test class for TRPO."""

    def setup_method(self):
        """Setup method which is called before every test."""
        self.env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        self.policy = GaussianMLPPolicy(
            env_spec=self.env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=torch.tanh,
            output_nonlinearity=None,
        )
        self.value_function = LinearFeatureBaseline(env_spec=self.env.spec)

    def teardown_method(self):
        """Teardown method which is called after every test."""
        self.env.close()

    @pytest.mark.mujoco
    def test_trpo_pendulum(self):
        """Test TRPO with Pendulum environment."""
        deterministic.set_seed(0)
        runner = LocalRunner(snapshot_config)
        algo = TRPO(env_spec=self.env.spec,
                    policy=self.policy,
                    value_function=self.value_function,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.98)
        runner.setup(algo, self.env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 50
def test_maml_trpo_pendulum():
    """Test MAML-TRPO with HalfCheetahDir environment."""
    env = GarageEnv(normalize(HalfCheetahDirEnv(), expected_action_scale=10.))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    rollouts_per_task = 5
    max_path_length = 100

    runner = LocalRunner(snapshot_config)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=max_path_length,
                    meta_batch_size=5,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1)

    runner.setup(algo, env)
    last_avg_ret = runner.train(n_epochs=5,
                                batch_size=rollouts_per_task * max_path_length)
    assert last_avg_ret > -5

    env.close()
def maml_trpo(ctxt, seed, epochs, rollouts_per_task, meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    env = GarageEnv(
        normalize(ML10.get_train_tasks(), expected_action_scale=10.))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    max_path_length = 100

    test_task_names = ML10.get_test_tasks().all_task_names
    test_tasks = [
        GarageEnv(normalize(ML10.from_task(task), expected_action_scale=10.))
        for task in test_task_names
    ]
    test_sampler = EnvPoolSampler(test_tasks)

    meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler,
                                   max_path_length=max_path_length,
                                   n_test_tasks=len(test_task_names))

    runner = LocalRunner(ctxt)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    value_function=value_function,
                    max_path_length=max_path_length,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    runner.setup(algo, env)
    runner.train(n_epochs=epochs,
                 batch_size=rollouts_per_task * max_path_length)
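# --- Hypothetical launcher for the maml_trpo experiment function above ---
# A minimal sketch, assuming garage's wrap_experiment decorator supplies the
# ExperimentContext (ctxt) argument, as in the library's example scripts, and
# that maml_trpo is not already decorated. The seed, epoch count, rollout
# count, and meta-batch size below are illustrative, not values from the
# source.
from garage import wrap_experiment

if __name__ == '__main__':
    launch_maml_trpo = wrap_experiment(maml_trpo)
    launch_maml_trpo(seed=1,
                     epochs=300,
                     rollouts_per_task=10,
                     meta_batch_size=20)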
def setup_method(self):
    """Setup method which is called before every test."""
    self.env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
    self.policy = GaussianMLPPolicy(
        env_spec=self.env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    self.baseline = LinearFeatureBaseline(env_spec=self.env.spec)
def setup_method(self): """Setup method which is called before every test.""" self.env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2'))) self.policy = GaussianMLPPolicy( env_spec=self.env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) self.value_function = LinearFeatureBaseline(env_spec=self.env.spec)
def setup_method(self): """Setup method which is called before every test.""" self.env = GarageEnv( normalize(HalfCheetahDirEnv(), expected_action_scale=10.)) self.policy = GaussianMLPPolicy( env_spec=self.env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) self.baseline = LinearFeatureBaseline(env_spec=self.env.spec)
class TestMAMLVPG:
    """Test class for MAML-VPG."""

    def setup_method(self):
        """Setup method which is called before every test."""
        self.env = GarageEnv(
            normalize(HalfCheetahDirEnv(), expected_action_scale=10.))
        self.policy = GaussianMLPPolicy(
            env_spec=self.env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=torch.tanh,
            output_nonlinearity=None,
        )
        self.value_function = LinearFeatureBaseline(env_spec=self.env.spec)

    def teardown_method(self):
        """Teardown method which is called after every test."""
        self.env.close()

    def test_ppo_pendulum(self):
        """Test MAML-VPG with HalfCheetahDir environment."""
        deterministic.set_seed(0)
        rollouts_per_task = 5
        max_path_length = 100

        task_sampler = SetTaskSampler(lambda: GarageEnv(
            normalize(HalfCheetahDirEnv(), expected_action_scale=10.)))

        meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler,
                                       max_path_length=max_path_length,
                                       n_test_tasks=1,
                                       n_test_rollouts=10)

        runner = LocalRunner(snapshot_config)
        algo = MAMLVPG(env=self.env,
                       policy=self.policy,
                       value_function=self.value_function,
                       max_path_length=max_path_length,
                       meta_batch_size=5,
                       discount=0.99,
                       gae_lambda=1.,
                       inner_lr=0.1,
                       num_grad_updates=1,
                       meta_evaluator=meta_evaluator)

        runner.setup(algo, self.env)
        last_avg_ret = runner.train(n_epochs=10,
                                    batch_size=rollouts_per_task *
                                    max_path_length)
        assert last_avg_ret > -5
def maml_ppo(ctxt, seed, epochs, rollouts_per_task, meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    env = GarageEnv(normalize(HalfCheetahDirEnv(), expected_action_scale=10.))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    max_path_length = 100

    task_sampler = SetTaskSampler(lambda: GarageEnv(
        normalize(HalfCheetahDirEnv(), expected_action_scale=10.)))

    meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler,
                                   max_path_length=max_path_length,
                                   n_test_tasks=1,
                                   n_test_rollouts=10)

    runner = LocalRunner(ctxt)
    algo = MAMLPPO(env=env,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=max_path_length,
                   meta_batch_size=meta_batch_size,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1,
                   meta_evaluator=meta_evaluator)

    runner.setup(algo, env)
    runner.train(n_epochs=epochs,
                 batch_size=rollouts_per_task * max_path_length)
def test_maml_trpo_dummy_named_env():
    """Test with dummy environment that has env_name."""
    env = GarageEnv(
        normalize(DummyMultiTaskBoxEnv(), expected_action_scale=10.))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    rollouts_per_task = 2
    max_path_length = 100

    runner = LocalRunner(snapshot_config)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=max_path_length,
                    meta_batch_size=5,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1)

    runner.setup(algo, env)
    runner.train(n_epochs=2, batch_size=rollouts_per_task * max_path_length)
def test_ppo_pendulum(self):
    """Test MAML-VPG with HalfCheetahDir environment."""
    deterministic.set_seed(0)
    rollouts_per_task = 5
    max_path_length = 100

    task_sampler = SetTaskSampler(lambda: GarageEnv(
        normalize(HalfCheetahDirEnv(), expected_action_scale=10.)))

    meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler,
                                   max_path_length=max_path_length,
                                   n_test_tasks=1,
                                   n_test_rollouts=10)

    runner = LocalRunner(snapshot_config)
    algo = MAMLVPG(env=self.env,
                   policy=self.policy,
                   value_function=self.value_function,
                   max_path_length=max_path_length,
                   meta_batch_size=5,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1,
                   meta_evaluator=meta_evaluator)

    runner.setup(algo, self.env)
    last_avg_ret = runner.train(n_epochs=10,
                                batch_size=rollouts_per_task *
                                max_path_length)
    assert last_avg_ret > -5
def setup_method(self): """Setup method which is called before every test.""" self._env = GarageEnv(gym.make('InvertedDoublePendulum-v2')) self._runner = LocalRunner(snapshot_config) self._policy = GaussianMLPPolicy(env_spec=self._env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=torch.tanh, output_nonlinearity=None) self._params = { 'env_spec': self._env.spec, 'policy': self._policy, 'baseline': LinearFeatureBaseline(env_spec=self._env.spec), 'max_path_length': 100, 'discount': 0.99, }
def setup_method(self):
    """Setup method which is called before every test."""
    self._env = GarageEnv(gym.make('InvertedDoublePendulum-v2'))
    self._runner = LocalRunner(snapshot_config)

    policy = GaussianMLPPolicy(env_spec=self._env.spec,
                               hidden_sizes=[64, 64],
                               hidden_nonlinearity=torch.tanh,
                               output_nonlinearity=None)
    self._params = {
        'env_spec': self._env.spec,
        'policy': policy,
        'optimizer': torch.optim.Adam,
        'baseline': LinearFeatureBaseline(env_spec=self._env.spec),
        'max_path_length': 100,
        'discount': 0.99,
        'policy_lr': 1e-2
    }
def torch_sac_half_cheetah(ctxt=None, seed=1):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(snapshot_config=ctxt)
    env = GarageEnv(normalize(gym.make('HalfCheetah-v2')))

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[256, 256],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=1)

    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              gradient_steps_per_itr=1000,
              max_path_length=500,
              use_automatic_entropy_tuning=True,
              replay_buffer=replay_buffer,
              min_buffer_size=1e4,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=256,
              reward_scale=1.,
              steps_per_epoch=1)

    if torch.cuda.is_available():
        tu.set_gpu_mode(True)
    else:
        tu.set_gpu_mode(False)
    sac.to()
    runner.setup(algo=sac, env=env, sampler_cls=LocalSampler)
    runner.train(n_epochs=1000, batch_size=1000)
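# --- Hypothetical launcher for torch_sac_half_cheetah above ---
# A minimal sketch, assuming the same wrap_experiment pattern used by garage's
# example scripts (the decorator builds the ExperimentContext and passes it in
# as ctxt) and that the function is not already decorated. The random-seed
# choice is illustrative.
import numpy as np

from garage import wrap_experiment

if __name__ == '__main__':
    launch_sac = wrap_experiment(torch_sac_half_cheetah)
    launch_sac(seed=np.random.randint(0, 1000))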
def test_ddpg_pendulum(self):
    """Test DDPG with Pendulum environment.

    This environment has a [-3, 3] action_space bound.
    """
    runner = LocalRunner()
    env = GarageEnv(normalize(gym.make('InvertedPendulum-v2')))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    algo = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                target_update_tau=1e-2,
                policy_lr=1e-4,
                qf_lr=1e-3,
                discount=0.9)

    runner.setup(algo, env)
    last_avg_ret = runner.train(n_epochs=10,
                                n_epoch_cycles=20,
                                batch_size=100)
    assert last_avg_ret > 10

    env.close()
def setup_method(self): """Setup method which is called before every test.""" self.env = GarageEnv( normalize(HalfCheetahDirEnv(), expected_action_scale=10.)) self.policy = GaussianMLPPolicy( env_spec=self.env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) self.value_function = LinearFeatureBaseline(env_spec=self.env.spec) self.algo = MAMLPPO(env=self.env, policy=self.policy, value_function=self.value_function, max_path_length=100, meta_batch_size=5, discount=0.99, gae_lambda=1., inner_lr=0.1, num_grad_updates=1)
class TestLocalRunner:
    """Test class for LocalRunner."""

    def setup_method(self):
        """Setup method which is called before every test."""
        self.env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        self.policy = GaussianMLPPolicy(
            env_spec=self.env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=torch.tanh,
            output_nonlinearity=None,
        )
        self.baseline = LinearFeatureBaseline(env_spec=self.env.spec)

    def teardown_method(self):
        """Teardown method which is called after every test."""
        self.env.close()

    @pytest.mark.mujoco
    def test_set_plot(self):
        """Test that training with plot=True creates a Plotter."""
        deterministic.set_seed(0)
        runner = LocalRunner(snapshot_config)
        algo = PPO(env_spec=self.env.spec,
                   policy=self.policy,
                   baseline=self.baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.97,
                   lr_clip_range=2e-1)
        runner.setup(algo, self.env)

        runner.train(n_epochs=1, batch_size=100, plot=True)
        assert isinstance(runner._plotter, Plotter), (
            'LocalRunner._plotter should be set to a Plotter when plot=True.')
def test_ddpg_double_pendulum(self):
    """Test DDPG with InvertedDoublePendulum environment."""
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config)
    env = GarageEnv(gym.make('InvertedDoublePendulum-v2'))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    algo = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                target_update_tau=1e-2,
                discount=0.9)

    runner.setup(algo, env)
    last_avg_ret = runner.train(n_epochs=10, batch_size=100)
    assert last_avg_ret > 45

    env.close()
def test_sac_inverted_pendulum():
    """Test SAC performance on the inverted double pendulum."""
    # pylint: disable=unexpected-keyword-arg
    env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=1)

    runner = LocalRunner(snapshot_config=snapshot_config)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              gradient_steps_per_itr=100,
              max_path_length=100,
              use_automatic_entropy_tuning=True,
              replay_buffer=replay_buffer,
              min_buffer_size=1e3,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=2)

    runner.setup(sac, env, sampler_cls=LocalSampler)
    if torch.cuda.is_available():
        tu.set_gpu_mode(True)
    else:
        tu.set_gpu_mode(False)
    sac.to()
    ret = runner.train(n_epochs=12, batch_size=200, plot=False)
    assert ret > 85
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    runner = LocalRunner(snapshot_config)
    env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    policy_optimizer = (torch.optim.Adagrad, {'lr': 1e-4, 'lr_decay': 0.99})

    ddpg = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                target_update_tau=1e-2,
                discount=0.9,
                policy_optimizer=policy_optimizer,
                qf_optimizer=torch.optim.Adam)

    runner.setup(algo=ddpg, env=env)
    runner.train(n_epochs=500, batch_size=100)
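# --- Hypothetical launcher for the DDPG run_task function above ---
# A minimal sketch, assuming the older garage API in which run_experiment
# constructs the SnapshotConfig and invokes run_task with it. The
# snapshot_mode and seed values are illustrative.
from garage.experiment import run_experiment

if __name__ == '__main__':
    run_experiment(run_task, snapshot_mode='last', seed=1)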
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    env = GarageEnv(normalize(HalfCheetahDirEnv(), expected_action_scale=10.))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    rollouts_per_task = 40
    max_path_length = 100

    runner = LocalRunner(snapshot_config)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=max_path_length,
                    meta_batch_size=20,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1)

    runner.setup(algo, env)
    runner.train(n_epochs=300,
                 batch_size=rollouts_per_task * max_path_length)
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task."""
    runner = LocalRunner(snapshot_config)
    env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    ddpg = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
                target_update_tau=1e-2,
                policy_lr=1e-4,
                qf_lr=1e-3,
                discount=0.9,
                optimizer=torch.optim.Adam)

    runner.setup(algo=ddpg, env=env)
    runner.train(n_epochs=500, n_epoch_cycles=20, batch_size=100)
class TestVPG:
    """Test class for VPG."""

    @classmethod
    def setup_class(cls):
        """Setup method which is called once before all tests in this class."""
        deterministic.set_seed(0)

    def setup_method(self):
        """Setup method which is called before every test."""
        self._env = GarageEnv(gym.make('InvertedDoublePendulum-v2'))
        self._runner = LocalRunner(snapshot_config)

        policy = GaussianMLPPolicy(env_spec=self._env.spec,
                                   hidden_sizes=[64, 64],
                                   hidden_nonlinearity=torch.tanh,
                                   output_nonlinearity=None)
        self._params = {
            'env_spec': self._env.spec,
            'policy': policy,
            'optimizer': torch.optim.Adam,
            'baseline': LinearFeatureBaseline(env_spec=self._env.spec),
            'max_path_length': 100,
            'discount': 0.99,
            'policy_lr': 1e-2
        }

    def teardown_method(self):
        """Teardown method which is called after every test."""
        self._env.close()

    def test_vpg_no_entropy(self):
        """Test VPG with no_entropy."""
        self._params['positive_adv'] = True
        self._params['use_softplus_entropy'] = True

        algo = VPG(**self._params)
        self._runner.setup(algo, self._env)
        last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 0

    def test_vpg_max(self):
        """Test VPG with maximum entropy."""
        self._params['center_adv'] = False
        self._params['stop_entropy_gradient'] = True
        self._params['entropy_method'] = 'max'

        algo = VPG(**self._params)
        self._runner.setup(algo, self._env)
        last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 0

    def test_vpg_regularized(self):
        """Test VPG with entropy_regularized."""
        self._params['entropy_method'] = 'regularized'

        algo = VPG(**self._params)
        self._runner.setup(algo, self._env)
        last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 30

    @pytest.mark.parametrize('algo_param, error, msg', INVALID_ENTROPY_CONFIG)
    def test_invalid_entropy_config(self, algo_param, error, msg):
        """Test VPG with invalid entropy config."""
        self._params.update(algo_param)
        with pytest.raises(error, match=msg):
            VPG(**self._params)
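# --- Hypothetical shape of INVALID_ENTROPY_CONFIG ---
# The parametrized test above references a module-level INVALID_ENTROPY_CONFIG
# that is not shown in this excerpt. One plausible shape for it is sketched
# below; the entries and error-message patterns are illustrative assumptions,
# not the source's exact values.
INVALID_ENTROPY_CONFIG = [
    # (algo_param overrides, expected exception, regex the message must match)
    ({'entropy_method': 'INVALID_ENTROPY_METHOD'}, ValueError,
     'Invalid entropy_method'),
    ({'entropy_method': 'max', 'center_adv': True}, ValueError,
     'center_adv should be False when entropy_method is max'),
]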
def run_task(snapshot_config,
             *_,
             env_params,
             algo='TRPO',
             algo_params={},
             epochs=1000,
             batch_size=4000,
             policy_hidden_sizes=(32, 32),
             embed_state=False,
             model_dir='../models/reacher_limited/train_6',
             augment_embedded_state=False):
    """Set up the Reacher environment and the requested algorithm, then train."""
    embed_config_file = os.path.join(model_dir, 'config.yaml')
    ckpt_path = os.path.join(model_dir, 'model_latest.ckpt')

    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        if embed_state:
            reacher_env = ReacherEmbeddedEnv(
                embed_config_file,
                ckpt_path,
                augment_embedded_state=augment_embedded_state,
                **env_params)
        else:
            reacher_env = ReacherEnv(**env_params)
        env = GarageEnv(reacher_env)

        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=policy_hidden_sizes)

        if algo == 'TRPO':
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            # max_path_length, discount, max_kl_step, etc. are supplied
            # through algo_params.
            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        **algo_params)
        elif algo == 'PPO':
            baseline = GaussianMLPBaseline(
                env_spec=env.spec,
                regressor_args=dict(
                    hidden_sizes=(32, 32),
                    use_trust_region=True,
                ),
            )
            # NOTE: when setting entropy_method to 'max', set center_adv to
            # False and turn off the policy gradient. See tf.algos.NPO for
            # detailed documentation.
            algo = PPO(env_spec=env.spec,
                       policy=policy,
                       baseline=baseline,
                       gae_lambda=0.95,
                       lr_clip_range=0.2,
                       optimizer_args=dict(
                           batch_size=32,
                           max_epochs=10,
                       ),
                       stop_entropy_gradient=True,
                       entropy_method='max',
                       policy_ent_coeff=0.02,
                       center_adv=False,
                       **algo_params)
        else:
            raise NotImplementedError(
                'Requested unrecognized algorithm: {}'.format(algo))

        runner.setup(algo, env)
        runner.train(n_epochs=epochs, batch_size=batch_size)
class TestVPG:
    """Test class for VPG."""

    @classmethod
    def setup_class(cls):
        """Setup method which is called once before all tests in this class."""
        deterministic.set_seed(0)

    def setup_method(self):
        """Setup method which is called before every test."""
        self._env = GarageEnv(gym.make('InvertedDoublePendulum-v2'))
        self._runner = LocalRunner(snapshot_config)

        self._policy = GaussianMLPPolicy(env_spec=self._env.spec,
                                         hidden_sizes=[64, 64],
                                         hidden_nonlinearity=torch.tanh,
                                         output_nonlinearity=None)
        self._params = {
            'env_spec': self._env.spec,
            'policy': self._policy,
            'value_function':
            GaussianMLPValueFunction(env_spec=self._env.spec),
            'max_path_length': 100,
            'discount': 0.99,
        }

    def teardown_method(self):
        """Teardown method which is called after every test."""
        self._env.close()

    @pytest.mark.mujoco
    def test_vpg_no_entropy(self):
        """Test VPG with no_entropy."""
        self._params['positive_adv'] = True
        self._params['use_softplus_entropy'] = True

        algo = VPG(**self._params)
        self._runner.setup(algo, self._env)
        last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 0

    @pytest.mark.mujoco
    def test_vpg_max(self):
        """Test VPG with maximum entropy."""
        self._params['center_adv'] = False
        self._params['stop_entropy_gradient'] = True
        self._params['entropy_method'] = 'max'

        algo = VPG(**self._params)
        self._runner.setup(algo, self._env)
        last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 0

    @pytest.mark.mujoco
    def test_vpg_regularized(self):
        """Test VPG with entropy_regularized."""
        self._params['entropy_method'] = 'regularized'

        algo = VPG(**self._params)
        self._runner.setup(algo, self._env)
        last_avg_ret = self._runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 0

    @pytest.mark.mujoco
    @pytest.mark.parametrize('algo_param, error, msg', INVALID_ENTROPY_CONFIG)
    def test_invalid_entropy_config(self, algo_param, error, msg):
        """Test VPG with invalid entropy config."""
        self._params.update(algo_param)
        with pytest.raises(error, match=msg):
            VPG(**self._params)
class TestMAML:
    """Test class for MAML."""

    def setup_method(self):
        """Setup method which is called before every test."""
        self.env = GarageEnv(
            normalize(HalfCheetahDirEnv(), expected_action_scale=10.))
        self.policy = GaussianMLPPolicy(
            env_spec=self.env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=torch.tanh,
            output_nonlinearity=None,
        )
        self.value_function = GaussianMLPValueFunction(env_spec=self.env.spec,
                                                       hidden_sizes=(32, 32))
        self.algo = MAMLPPO(env=self.env,
                            policy=self.policy,
                            value_function=self.value_function,
                            max_path_length=100,
                            meta_batch_size=5,
                            discount=0.99,
                            gae_lambda=1.,
                            inner_lr=0.1,
                            num_grad_updates=1)

    def teardown_method(self):
        """Teardown method which is called after every test."""
        self.env.close()

    @staticmethod
    def _set_params(v, m):
        """Set the parameters of a module to a value."""
        if isinstance(m, torch.nn.Linear):
            m.weight.data.fill_(v)
            m.bias.data.fill_(v)

    @staticmethod
    def _test_params(v, m):
        """Test whether all parameters of a module equal a value."""
        if isinstance(m, torch.nn.Linear):
            assert torch.all(torch.eq(m.weight.data, v))
            assert torch.all(torch.eq(m.bias.data, v))

    def test_get_exploration_policy(self):
        """Test that an independent copy of the policy is returned."""
        self.policy.apply(partial(self._set_params, 0.1))
        adapt_policy = self.algo.get_exploration_policy()
        adapt_policy.apply(partial(self._set_params, 0.2))

        # The old policy should remain untouched.
        self.policy.apply(partial(self._test_params, 0.1))
        adapt_policy.apply(partial(self._test_params, 0.2))

    def test_adapt_policy(self):
        """Test that the policy can adapt to samples."""
        worker = WorkerFactory(seed=100, max_path_length=100)
        sampler = LocalSampler.from_worker_factory(worker, self.policy,
                                                   self.env)

        self.policy.apply(partial(self._set_params, 0.1))
        adapt_policy = self.algo.get_exploration_policy()
        trajs = sampler.obtain_samples(0, 100, adapt_policy)
        self.algo.adapt_policy(adapt_policy, trajs)

        # The old policy should remain untouched.
        self.policy.apply(partial(self._test_params, 0.1))

        # The adapted policy should not be identical to the old policy.
        for v1, v2 in zip(adapt_policy.parameters(),
                          self.policy.parameters()):
            if v1.data.ne(v2.data).sum() > 0:
                break
        else:
            pytest.fail('Parameters of adapted policy should not be '
                        'identical to the old policy.')