def setup_method(self): """Setup method which is called before every test.""" self.env = normalize(GymEnv(HalfCheetahDirEnv(), max_episode_length=100), expected_action_scale=10.) task_sampler = SetTaskSampler(lambda: normalize( GymEnv(HalfCheetahDirEnv()), expected_action_scale=10.)) self.policy = GaussianMLPPolicy( env_spec=self.env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) self.value_function = GaussianMLPValueFunction(env_spec=self.env.spec, hidden_sizes=(32, 32)) self.algo = MAMLPPO(env=self.env, policy=self.policy, sampler=None, task_sampler=task_sampler, value_function=self.value_function, meta_batch_size=5, discount=0.99, gae_lambda=1., inner_lr=0.1, num_grad_updates=1)
def test_ppo_pendulum(self): """Test PPO with Pendulum environment.""" deterministic.set_seed(0) episodes_per_task = 5 max_episode_length = self.env.spec.max_episode_length trainer = Trainer(snapshot_config) algo = MAMLPPO(env=self.env, policy=self.policy, sampler=self.sampler, task_sampler=self.task_sampler, value_function=self.value_function, meta_batch_size=5, discount=0.99, gae_lambda=1., inner_lr=0.1, num_grad_updates=1) trainer.setup(algo, self.env) last_avg_ret = trainer.train(n_epochs=10, batch_size=episodes_per_task * max_episode_length) assert last_avg_ret > -5
def load_mamlppo(env_name='MountainCarContinuous-v0'):
    """Return an instance of the MAML-PPO algorithm."""
    env = GarageEnv(env_name=env_name)
    # MAML-PPO needs a stochastic policy to compute PPO's likelihood
    # ratios, so a Gaussian policy is used rather than a deterministic one.
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=[64, 64])
    vfunc = GaussianMLPValueFunction(env_spec=env.spec)
    task_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(env, expected_action_scale=10.)))
    max_path_length = 100
    meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler,
                                   max_path_length=max_path_length,
                                   n_test_tasks=1,
                                   n_test_rollouts=10)
    algo = MAMLPPO(env=env,
                   policy=policy,
                   value_function=vfunc,
                   max_path_length=max_path_length,
                   meta_batch_size=20,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1,
                   meta_evaluator=meta_evaluator)
    return algo
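# A minimal, hypothetical driver for load_mamlppo above, assuming the
# legacy LocalRunner API used by the surrounding snippets. The environment
# is rebuilt here because load_mamlppo returns only the algorithm; the
# function name and batch size are illustrative, not from the source.
def run_loaded_mamlppo(snapshot_config, env_name='MountainCarContinuous-v0'):
    """Train a MAML-PPO instance produced by load_mamlppo."""
    env = GarageEnv(env_name=env_name)
    algo = load_mamlppo(env_name=env_name)
    runner = LocalRunner(snapshot_config)
    runner.setup(algo, env)
    # 40 rollouts of length 100 per epoch, matching the other examples.
    runner.train(n_epochs=10, batch_size=40 * 100)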
def maml_ppo_half_cheetah_dir(ctxt, seed, epochs, episodes_per_task,
                              meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        episodes_per_task (int): Number of episodes per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    max_episode_length = 100
    env = normalize(GymEnv(HalfCheetahDirEnv(),
                           max_episode_length=max_episode_length),
                    expected_action_scale=10.)
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)
    task_sampler = SetTaskSampler(
        HalfCheetahDirEnv,
        wrapper=lambda env, _: normalize(GymEnv(
            env, max_episode_length=max_episode_length),
                                         expected_action_scale=10.))
    meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler,
                                   n_test_tasks=2,
                                   n_test_episodes=10)
    trainer = Trainer(ctxt)
    algo = MAMLPPO(env=env,
                   policy=policy,
                   task_sampler=task_sampler,
                   value_function=value_function,
                   meta_batch_size=meta_batch_size,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1,
                   meta_evaluator=meta_evaluator)
    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs,
                  batch_size=episodes_per_task * env.spec.max_episode_length)
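# Sketch of a launcher for the example above, following the pattern of
# garage's example scripts: wrap_experiment supplies the ctxt argument
# automatically. The argument values here are illustrative, not tuned.
from garage import wrap_experiment

if __name__ == '__main__':
    experiment = wrap_experiment(maml_ppo_half_cheetah_dir)
    experiment(seed=1, epochs=300, episodes_per_task=40, meta_batch_size=20)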
def maml_ppo_half_cheetah_dir(ctxt, seed, epochs, rollouts_per_task,
                              meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    env = GarageEnv(normalize(HalfCheetahDirEnv(), expected_action_scale=10.))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)
    max_path_length = 100
    task_sampler = SetTaskSampler(lambda: GarageEnv(
        normalize(HalfCheetahDirEnv(), expected_action_scale=10.)))
    meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler,
                                   max_path_length=max_path_length,
                                   n_test_tasks=1,
                                   n_test_rollouts=10)
    runner = LocalRunner(ctxt)
    algo = MAMLPPO(env=env,
                   policy=policy,
                   value_function=value_function,
                   max_path_length=max_path_length,
                   meta_batch_size=meta_batch_size,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1,
                   meta_evaluator=meta_evaluator)
    runner.setup(algo, env)
    runner.train(n_epochs=epochs,
                 batch_size=rollouts_per_task * max_path_length)
def setup_method(self): """Setup method which is called before every test.""" self.env = GarageEnv( normalize(HalfCheetahDirEnv(), expected_action_scale=10.)) self.policy = GaussianMLPPolicy( env_spec=self.env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) self.value_function = LinearFeatureBaseline(env_spec=self.env.spec) self.algo = MAMLPPO(env=self.env, policy=self.policy, value_function=self.value_function, max_path_length=100, meta_batch_size=5, discount=0.99, gae_lambda=1., inner_lr=0.1, num_grad_updates=1)
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters.

    """
    env = GarageEnv(normalize(HalfCheetahDirEnv(), expected_action_scale=10.))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    rollouts_per_task = 40
    max_path_length = 100
    runner = LocalRunner(snapshot_config)
    algo = MAMLPPO(env=env,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=max_path_length,
                   meta_batch_size=20,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1)
    runner.setup(algo, env)
    runner.train(n_epochs=300, batch_size=rollouts_per_task * max_path_length)
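# Hypothetical launcher for run_task above, using the legacy
# run_experiment helper from the same garage API generation; the
# snapshot_mode and seed values are illustrative.
from garage.experiment import run_experiment

run_experiment(run_task, snapshot_mode='last', seed=1)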
def test_ppo_pendulum(self):
    """Test MAML-PPO with the HalfCheetahDir environment."""
    deterministic.set_seed(0)
    rollouts_per_task = 5
    max_path_length = 100
    runner = LocalRunner(snapshot_config)
    algo = MAMLPPO(env=self.env,
                   policy=self.policy,
                   value_function=self.value_function,
                   max_path_length=max_path_length,
                   meta_batch_size=5,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1)
    runner.setup(algo, self.env)
    last_avg_ret = runner.train(n_epochs=10,
                                batch_size=rollouts_per_task *
                                max_path_length)
    assert last_avg_ret > -5
class TestMAML:
    """Test class for MAML."""

    def setup_method(self):
        """Setup method which is called before every test."""
        self.env = normalize(GymEnv(HalfCheetahDirEnv()),
                             expected_action_scale=10.)
        self.policy = GaussianMLPPolicy(
            env_spec=self.env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=torch.tanh,
            output_nonlinearity=None,
        )
        self.value_function = GaussianMLPValueFunction(env_spec=self.env.spec,
                                                       hidden_sizes=(32, 32))
        self.algo = MAMLPPO(env=self.env,
                            policy=self.policy,
                            value_function=self.value_function,
                            max_episode_length=100,
                            meta_batch_size=5,
                            discount=0.99,
                            gae_lambda=1.,
                            inner_lr=0.1,
                            num_grad_updates=1)

    def teardown_method(self):
        """Teardown method which is called after every test."""
        self.env.close()

    @staticmethod
    def _set_params(v, m):
        """Set all parameters of a Linear module to a given value."""
        if isinstance(m, torch.nn.Linear):
            m.weight.data.fill_(v)
            m.bias.data.fill_(v)

    @staticmethod
    def _test_params(v, m):
        """Assert that all parameters of a Linear module equal a value."""
        if isinstance(m, torch.nn.Linear):
            assert torch.all(torch.eq(m.weight.data, v))
            assert torch.all(torch.eq(m.bias.data, v))

    def test_get_exploration_policy(self):
        """Test that an independent copy of the policy is returned."""
        self.policy.apply(partial(self._set_params, 0.1))
        adapt_policy = self.algo.get_exploration_policy()
        adapt_policy.apply(partial(self._set_params, 0.2))

        # The old policy should remain untouched.
        self.policy.apply(partial(self._test_params, 0.1))
        adapt_policy.apply(partial(self._test_params, 0.2))

    def test_adapt_policy(self):
        """Test that the policy can adapt to samples."""
        worker = WorkerFactory(seed=100, max_episode_length=100)
        sampler = LocalSampler.from_worker_factory(worker, self.policy,
                                                   self.env)
        self.policy.apply(partial(self._set_params, 0.1))
        adapt_policy = self.algo.get_exploration_policy()
        eps = sampler.obtain_samples(0, 100, adapt_policy)
        self.algo.adapt_policy(adapt_policy, eps)

        # The old policy should remain untouched.
        self.policy.apply(partial(self._test_params, 0.1))

        # The adapted policy should not be identical to the old policy.
        for v1, v2 in zip(adapt_policy.parameters(),
                          self.policy.parameters()):
            if v1.data.ne(v2.data).sum() > 0:
                break
        else:
            pytest.fail('Parameters of the adapted policy should not be '
                        'identical to the old policy.')