def setup_method(self): """Setup method which is called before every test.""" self.env = normalize(GymEnv(HalfCheetahDirEnv(), max_episode_length=100), expected_action_scale=10.) task_sampler = SetTaskSampler(lambda: normalize( GymEnv(HalfCheetahDirEnv()), expected_action_scale=10.)) self.policy = GaussianMLPPolicy( env_spec=self.env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) self.value_function = GaussianMLPValueFunction(env_spec=self.env.spec, hidden_sizes=(32, 32)) self.algo = MAMLPPO(env=self.env, policy=self.policy, sampler=None, task_sampler=task_sampler, value_function=self.value_function, meta_batch_size=5, discount=0.99, gae_lambda=1., inner_lr=0.1, num_grad_updates=1)
def maml_ppo_half_cheetah_dir(ctxt, seed, epochs, episodes_per_task, meta_batch_size): """Set up environment and algorithm and run the task. Args: ctxt (ExperimentContext): The experiment configuration used by :class:`~LocalRunner` to create the :class:`~Snapshotter`. seed (int): Used to seed the random number generator to produce determinism. epochs (int): Number of training epochs. episodes_per_task (int): Number of episodes per epoch per task for training. meta_batch_size (int): Number of tasks sampled per batch. """ set_seed(seed) env = normalize(GymEnv(HalfCheetahDirEnv()), expected_action_scale=10.) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) value_function = GaussianMLPValueFunction(env_spec=env.spec, hidden_sizes=(32, 32), hidden_nonlinearity=torch.tanh, output_nonlinearity=None) max_episode_length = 100 task_sampler = SetTaskSampler(lambda: normalize( GymEnv(HalfCheetahDirEnv()), expected_action_scale=10.)) meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler, max_episode_length=max_episode_length, n_test_tasks=1, n_test_episodes=10) runner = LocalRunner(ctxt) algo = MAMLPPO(env=env, policy=policy, value_function=value_function, max_episode_length=max_episode_length, meta_batch_size=meta_batch_size, discount=0.99, gae_lambda=1., inner_lr=0.1, num_grad_updates=1, meta_evaluator=meta_evaluator) runner.setup(algo, env) runner.train(n_epochs=epochs, batch_size=episodes_per_task * max_episode_length)
def maml_trpo(ctxt, seed, epochs, rollouts_per_task, meta_batch_size): """Set up environment and algorithm and run the task. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. epochs (int): Number of training epochs. rollouts_per_task (int): Number of rollouts per epoch per task for training. meta_batch_size (int): Number of tasks sampled per batch. """ set_seed(seed) # @TODO blowing up here... env = GarageEnv(normalize(HalfCheetahDirEnv(), expected_action_scale=10.)) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[64, 64], hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) value_function = GaussianMLPValueFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=torch.tanh, output_nonlinearity=None) max_path_length = 100 task_sampler = SetTaskSampler(lambda: GarageEnv( normalize(HalfCheetahDirEnv(), expected_action_scale=10.))) meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler, max_path_length=max_path_length, n_test_tasks=1, n_test_rollouts=10) runner = LocalRunner(ctxt) algo = MAMLTRPO(env=env, policy=policy, value_function=value_function, max_path_length=max_path_length, meta_batch_size=meta_batch_size, discount=0.99, gae_lambda=1., inner_lr=0.1, num_grad_updates=1, meta_evaluator=meta_evaluator) runner.setup(algo, env) runner.train(n_epochs=epochs, batch_size=rollouts_per_task * max_path_length)
def setup_method(self): super().setup_method() self.max_episode_length = 100 self.meta_batch_size = 10 self.episode_per_task = 4 self.tasks = task_sampler.SetTaskSampler( lambda: RL2Env(env=normalize(HalfCheetahDirEnv()))) self.env_spec = RL2Env(env=normalize(HalfCheetahDirEnv())).spec self.policy = GaussianGRUPolicy(env_spec=self.env_spec, hidden_dim=64, state_include_action=False) self.baseline = LinearFeatureBaseline(env_spec=self.env_spec)
def test_maml_trpo_pendulum(): """Test PPO with Pendulum environment.""" env = GarageEnv(normalize(HalfCheetahDirEnv(), expected_action_scale=10.)) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) value_function = GaussianMLPValueFunction(env_spec=env.spec, hidden_sizes=(32, 32)) rollouts_per_task = 5 max_episode_length = 100 runner = LocalRunner(snapshot_config) algo = MAMLTRPO(env=env, policy=policy, value_function=value_function, max_episode_length=max_episode_length, meta_batch_size=5, discount=0.99, gae_lambda=1., inner_lr=0.1, num_grad_updates=1) runner.setup(algo, env, sampler_cls=LocalSampler) last_avg_ret = runner.train(n_epochs=5, batch_size=rollouts_per_task * max_episode_length) assert last_avg_ret > -5 env.close()
def test_ppo_pendulum(self): """Test PPO with Pendulum environment.""" deterministic.set_seed(0) rollouts_per_task = 5 max_path_length = 100 task_sampler = SetTaskSampler(lambda: GarageEnv( normalize(HalfCheetahDirEnv(), expected_action_scale=10.))) meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler, max_path_length=max_path_length, n_test_tasks=1, n_test_rollouts=10) runner = LocalRunner(snapshot_config) algo = MAMLVPG(env=self.env, policy=self.policy, value_function=self.value_function, max_path_length=max_path_length, meta_batch_size=5, discount=0.99, gae_lambda=1., inner_lr=0.1, num_grad_updates=1, meta_evaluator=meta_evaluator) runner.setup(algo, self.env) last_avg_ret = runner.train(n_epochs=10, batch_size=rollouts_per_task * max_path_length) assert last_avg_ret > -5
def setup_method(self): super().setup_method() self.meta_batch_size = 10 self.episode_per_task = 4 self.max_episode_length = 100 # Avoid pickling self max_episode_length = 100 self.tasks = task_sampler.SetTaskSampler( HalfCheetahDirEnv, wrapper=lambda env, _: RL2Env( normalize(GymEnv(env, max_episode_length=max_episode_length)))) self.env_spec = RL2Env( normalize( GymEnv(HalfCheetahDirEnv(), max_episode_length=max_episode_length))).spec self.policy = GaussianGRUPolicy(env_spec=self.env_spec, hidden_dim=64, state_include_action=False) self.baseline = LinearFeatureBaseline(env_spec=self.env_spec) self.sampler = LocalSampler( agents=self.policy, envs=self.tasks.sample(self.meta_batch_size), max_episode_length=self.env_spec.max_episode_length, is_tf_worker=True, n_workers=self.meta_batch_size, worker_class=RL2Worker)
def setup_method(self): """Setup method which is called before every test.""" self.env = GarageEnv( normalize(HalfCheetahDirEnv(), expected_action_scale=10.)) self.policy = GaussianMLPPolicy( env_spec=self.env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) self.value_function = LinearFeatureBaseline(env_spec=self.env.spec)
def setup_method(self): """Setup method which is called before every test.""" self.env = normalize(GymEnv(HalfCheetahDirEnv()), expected_action_scale=10.) self.policy = GaussianMLPPolicy( env_spec=self.env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) self.value_function = GaussianMLPValueFunction(env_spec=self.env.spec, hidden_sizes=(32, 32))
def test_maml_trpo_pendulum(): """Test PPO with Pendulum environment.""" episodes_per_task = 5 max_episode_length = 100 env = normalize(GymEnv(HalfCheetahDirEnv(), max_episode_length=max_episode_length), expected_action_scale=10.) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) value_function = GaussianMLPValueFunction(env_spec=env.spec, hidden_sizes=(32, 32)) task_sampler = SetTaskSampler( HalfCheetahDirEnv, wrapper=lambda env, _: normalize(GymEnv( env, max_episode_length=max_episode_length), expected_action_scale=10.)) sampler = LocalSampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length) trainer = Trainer(snapshot_config) algo = MAMLTRPO(env=env, policy=policy, sampler=sampler, task_sampler=task_sampler, value_function=value_function, meta_batch_size=5, discount=0.99, gae_lambda=1., inner_lr=0.1, num_grad_updates=1) trainer.setup(algo, env) last_avg_ret = trainer.train(n_epochs=5, batch_size=episodes_per_task * max_episode_length) assert last_avg_ret > -5 env.close()
def setup_method(self): """Setup method which is called before every test.""" self.env = normalize(GymEnv(HalfCheetahDirEnv(), max_episode_length=100), expected_action_scale=10.) self.task_sampler = SetTaskSampler( HalfCheetahDirEnv, wrapper=lambda env, _: normalize(GymEnv(env, max_episode_length=100), expected_action_scale=10.)) self.policy = GaussianMLPPolicy( env_spec=self.env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) self.value_function = GaussianMLPValueFunction(env_spec=self.env.spec, hidden_sizes=(32, 32))
def maml_trpo(ctxt, seed, epochs, rollouts_per_task, meta_batch_size): """Set up environment and algorithm and run the task. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. epochs (int): Number of training epochs. rollouts_per_task (int): Number of rollouts per epoch per task for training. meta_batch_size (int): Number of tasks sampled per batch. """ set_seed(seed) env = GarageEnv(normalize(HalfCheetahDirEnv(), expected_action_scale=10.)) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) baseline = LinearFeatureBaseline(env_spec=env.spec) max_path_length = 100 runner = LocalRunner(ctxt) algo = MAMLTRPO(env=env, policy=policy, baseline=baseline, max_path_length=max_path_length, meta_batch_size=meta_batch_size, discount=0.99, gae_lambda=1., inner_lr=0.1, num_grad_updates=1) runner.setup(algo, env) runner.train(n_epochs=epochs, batch_size=rollouts_per_task * max_path_length)
def setup_method(self): """Setup method which is called before every test.""" self.env = GarageEnv( normalize(HalfCheetahDirEnv(), expected_action_scale=10.)) self.policy = GaussianMLPPolicy( env_spec=self.env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) self.value_function = LinearFeatureBaseline(env_spec=self.env.spec) self.algo = MAMLPPO(env=self.env, policy=self.policy, value_function=self.value_function, max_path_length=100, meta_batch_size=5, discount=0.99, gae_lambda=1., inner_lr=0.1, num_grad_updates=1)
def run_task(snapshot_config, *_): """Set up environment and algorithm and run the task. Args: snapshot_config (garage.experiment.SnapshotConfig): The snapshot configuration used by LocalRunner to create the snapshotter. If None, it will create one with default settings. _ : Unused parameters """ env = GarageEnv(normalize(HalfCheetahDirEnv(), expected_action_scale=10.)) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), hidden_nonlinearity=torch.tanh, output_nonlinearity=None, ) baseline = LinearFeatureBaseline(env_spec=env.spec) rollouts_per_task = 20 max_path_length = 100 runner = LocalRunner(snapshot_config) algo = MAMLVPG(env=env, policy=policy, baseline=baseline, max_path_length=max_path_length, meta_batch_size=40, discount=0.99, gae_lambda=1., inner_lr=0.1, num_grad_updates=1) runner.setup(algo, env) runner.train(n_epochs=300, batch_size=rollouts_per_task * max_path_length)