def setup_class(self):
    """Init Wrapper with MT10."""
    # pylint: disable=import-outside-toplevel
    from metaworld.benchmarks import MT10
    tasks = MT10.get_train_tasks().all_task_names
    envs = []
    for task in tasks:
        envs.append(MT10.from_task(task))
    self.env = MultiEnvWrapper(envs,
                               sample_strategy=round_robin_strategy,
                               metaworld_mt=True)
def setup_class(self):
    """Init Wrapper with MT10."""
    # pylint: disable=import-outside-toplevel
    from metaworld.benchmarks import MT10
    tasks = MT10.get_train_tasks().all_task_names
    envs = []
    for task in tasks:
        envs.append(GymEnv(MT10.from_task(task)))
    self.task_names = tasks
    self.env = MultiEnvWrapper(envs,
                               sample_strategy=round_robin_strategy,
                               mode='vanilla',
                               env_names=tasks)
    self.env_no_onehot = MultiEnvWrapper(
        envs, sample_strategy=round_robin_strategy, mode='del-onehot')
def multi_env_trpo(ctxt=None, seed=1):
    """Train TRPO on two different PointEnv instances.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env1 = TfEnv(normalize(PointEnv(goal=(-1., 0.))))
        env2 = TfEnv(normalize(PointEnv(goal=(1., 0.))))
        env = MultiEnvWrapper([env1, env2])

        policy = GaussianMLPPolicy(env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=2048, plot=False)
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        _ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env1 = TfEnv(normalize(PointEnv(goal=(-1., 0.))))
        env2 = TfEnv(normalize(PointEnv(goal=(1., 0.))))
        env = MultiEnvWrapper([env1, env2])

        policy = GaussianMLPPolicy(env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=2048, plot=False)
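# In the older garage API used by `run_task` functions like the one above,
# the entry point is `run_experiment`. A minimal launch sketch; the snapshot
# settings below are illustrative assumptions, not taken from this file.
from garage.experiment import run_experiment

run_experiment(
    run_task,
    snapshot_mode='last',
    seed=1,
)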
def mtppo_metaworld_mt10(ctxt, seed, epochs, batch_size, n_workers, n_tasks):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.
        n_workers (int): The number of workers the sampler should use.
        n_tasks (int): Number of tasks to use. Should be a multiple of 10.

    """
    set_seed(seed)
    mt10 = metaworld.MT10()
    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              'train',
                                              lambda env, _: normalize(env),
                                              add_env_onehot=True)
    assert n_tasks % 10 == 0
    assert n_tasks <= 500
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length,
                         n_workers=n_workers)

    algo = PPO(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               sampler=sampler,
               discount=0.99,
               gae_lambda=0.95,
               center_adv=True,
               lr_clip_range=0.2)

    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
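# A sketch of how garage example scripts typically expose a function like the
# one above on the command line, via click options and @wrap_experiment. The
# option names mirror the function's parameters; the default values and
# snapshot_mode here are assumptions for illustration only.
import click

from garage import wrap_experiment


@click.command()
@click.option('--seed', default=1)
@click.option('--epochs', default=500)
@click.option('--batch_size', default=1024)
@click.option('--n_workers', default=10)
@click.option('--n_tasks', default=10)
@wrap_experiment(snapshot_mode='all')
def mtppo_metaworld_mt10(ctxt, seed, epochs, batch_size, n_workers, n_tasks):
    ...  # body as defined above


mtppo_metaworld_mt10()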
def _init_multi_env_wrapper(self,
                            env_names,
                            sample_strategy=uniform_random_strategy):
    """Initialize a MultiEnvWrapper from a list of environment names.

    Args:
        env_names (list(str)): List of gym.Env names.
        sample_strategy (func): A sampling strategy.

    Returns:
        garage.envs.multi_env_wrapper: Multi env wrapper.

    """
    task_envs = [TfEnv(env_name=name) for name in env_names]
    return MultiEnvWrapper(task_envs, sample_strategy=sample_strategy)
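# For reference, a sample strategy is just a callable that maps
# (num_tasks, last_task) to the next task index. The two strategies used
# throughout this file look roughly like the following; treat the exact
# bodies as a paraphrased sketch of the garage implementations, not the
# verbatim source.
import random


def uniform_random_strategy(num_tasks, _):
    """Pick a task index uniformly at random, ignoring the last task."""
    return random.randint(0, num_tasks - 1)


def round_robin_strategy(num_tasks, last_task=None):
    """Cycle through the task indices in order, starting from 0."""
    if last_task is None:
        return 0
    return (last_task + 1) % num_tasks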
def mttrpo_metaworld_mt1_push(ctxt, seed, epochs, batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.

    """
    set_seed(seed)
    n_tasks = 50
    mt1 = metaworld.MT1('push-v1')
    train_task_sampler = MetaWorldTaskSampler(mt1, 'train',
                                              lambda env, _: normalize(env))
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length)

    algo = TRPO(env_spec=env.spec,
                policy=policy,
                value_function=value_function,
                sampler=sampler,
                discount=0.99,
                gae_lambda=0.95)

    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
def multi_env_ppo(ctxt=None, seed=1):
    """Train PPO on two Atari environments simultaneously.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env1 = normalize(GymEnv('Adventure-ram-v4'))
        env2 = normalize(GymEnv('Alien-ram-v4'))
        env = MultiEnvWrapper([env1, env2])

        policy = CategoricalMLPPolicy(
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_optimization_epochs=10,
                       learning_rate=1e-3,
                   ))

        trainer.setup(algo, env)
        trainer.train(n_epochs=120, batch_size=2048, plot=False)
def multi_env_ppo(ctxt=None, seed=1):
    """Train PPO on two Atari environments simultaneously.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env1 = TfEnv(normalize(gym.make('Adventure-ram-v4')))
        env2 = TfEnv(normalize(gym.make('Alien-ram-v4')))
        env = MultiEnvWrapper([env1, env2])

        policy = CategoricalMLPPolicy(
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_epochs=10,
                       tf_optimizer_args=dict(learning_rate=1e-3),
                   ))

        runner.setup(algo, env)
        runner.train(n_epochs=120, batch_size=2048, plot=False)
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        _ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env1 = TfEnv(normalize(gym.make('Adventure-ram-v4')))
        env2 = TfEnv(normalize(gym.make('Alien-ram-v4')))
        env = MultiEnvWrapper([env1, env2])

        policy = CategoricalMLPPolicy(
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_epochs=10,
                       tf_optimizer_args=dict(learning_rate=1e-3),
                   ))

        runner.setup(algo, env)
        runner.train(n_epochs=120, batch_size=2048, plot=False)
def te_ppo_mt1_push(ctxt, seed, n_epochs, batch_size_per_task):
    """Train Task Embedding PPO with the Meta-World MT1 push environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.

    """
    set_seed(seed)
    n_tasks = 50
    mt1 = metaworld.MT1('push-v1')
    task_sampler = MetaWorldTaskSampler(mt1,
                                        'train',
                                        lambda env, _: normalize(env),
                                        add_env_onehot=False)
    envs = [env_up() for env_up in task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    latent_length = 2
    inference_window = 6
    batch_size = batch_size_per_task * n_tasks
    policy_ent_coeff = 2e-2
    encoder_ent_coeff = 2e-4
    inference_ce_coeff = 5e-2
    embedding_init_std = 0.1
    embedding_max_std = 0.2
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    with TFTrainer(snapshot_config=ctxt) as trainer:
        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)

        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)

        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 10),
            std_share_network=True,
            init_std=2.0,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec, features=['observations', 'tasks', 'latents'])

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     inference=inference,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        trainer.setup(algo,
                      env,
                      sampler_cls=LocalSampler,
                      sampler_args=None,
                      worker_class=TaskEmbeddingWorker)
        trainer.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)
def te_ppo_mt50(ctxt, seed, n_epochs, batch_size_per_task):
    """Train Task Embedding PPO with the Meta-World MT50 benchmark.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.

    """
    set_seed(seed)
    tasks = MT50.get_train_tasks().all_task_names
    envs = [
        normalize(GymEnv(MT50.from_task(task), max_episode_length=150))
        for task in tasks
    ]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='del-onehot')

    latent_length = 6
    inference_window = 6
    batch_size = batch_size_per_task * len(tasks)
    policy_ent_coeff = 2e-2
    encoder_ent_coeff = 2e-4
    inference_ce_coeff = 5e-2
    embedding_init_std = 0.1
    embedding_max_std = 0.2
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    with LocalTFRunner(snapshot_config=ctxt) as runner:
        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)

        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)

        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 10),
            std_share_network=True,
            init_std=2.0,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec, features=['observations', 'tasks', 'latents'])

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     inference=inference,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        runner.setup(algo,
                     env,
                     sampler_cls=LocalSampler,
                     sampler_args=None,
                     worker_class=TaskEmbeddingWorker)
        runner.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)
def te_ppo_pointenv(ctxt, seed, n_epochs, batch_size_per_task):
    """Train Task Embedding PPO with PointEnv.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.

    """
    set_seed(seed)
    tasks = TASKS
    latent_length = 2
    inference_window = 6
    batch_size = batch_size_per_task * len(TASKS)
    policy_ent_coeff = 1e-3
    encoder_ent_coeff = 1e-3
    inference_ce_coeff = 5e-2
    embedding_init_std = 0.1
    embedding_max_std = 0.2
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = 2.0
    policy_min_std = None

    task_names = sorted(tasks.keys())
    task_args = [tasks[t]['args'] for t in task_names]
    task_kwargs = [tasks[t]['kwargs'] for t in task_names]

    with TFTrainer(snapshot_config=ctxt) as trainer:
        task_envs = [
            PointEnv(*t_args, **t_kwargs, max_episode_length=100)
            for t_args, t_kwargs in zip(task_args, task_kwargs)
        ]
        env = MultiEnvWrapper(task_envs, round_robin_strategy, mode='vanilla')

        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)

        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            std_output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)

        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=0.1,
            output_nonlinearity=tf.nn.tanh,
            std_output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec, features=['observations', 'tasks', 'latents'])

        sampler = LocalSampler(agents=policy,
                               envs=env,
                               max_episode_length=env.spec.max_episode_length,
                               is_tf_worker=True,
                               worker_class=TaskEmbeddingWorker)

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     sampler=sampler,
                     inference=inference,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        trainer.setup(algo, env)
        trainer.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)
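# `TASKS` is assumed to be defined elsewhere in this example file as a
# mapping from task name to the constructor arguments unpacked above
# (`tasks[t]['args']`, `tasks[t]['kwargs']`). A minimal compatible sketch
# with four hypothetical goals; only the `goal` kwarg is taken from the
# PointEnv usage shown earlier in this file.
TASKS = {
    '(-1, 0)': {'args': [], 'kwargs': {'goal': (-1., 0.)}},
    '(1, 0)': {'args': [], 'kwargs': {'goal': (1., 0.)}},
    '(0, 1)': {'args': [], 'kwargs': {'goal': (0., 1.)}},
    '(0, -1)': {'args': [], 'kwargs': {'goal': (0., -1.)}},
}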
class TestMetaWorldMultiEnvWrapper:
    """Tests for garage.envs.multi_env_wrapper using Meta-World envs."""

    def setup_class(self):
        """Init Wrapper with MT10."""
        # pylint: disable=import-outside-toplevel
        from metaworld.benchmarks import MT10
        tasks = MT10.get_train_tasks().all_task_names
        envs = []
        for task in tasks:
            envs.append(MT10.from_task(task))
        self.env = MultiEnvWrapper(envs,
                                   sample_strategy=round_robin_strategy,
                                   mode='vanilla')
        self.env_no_onehot = MultiEnvWrapper(
            envs, sample_strategy=round_robin_strategy, mode='del-onehot')

    def teardown_class(self):
        """Close both wrappers."""
        self.env.close()
        self.env_no_onehot.close()

    def test_num_tasks(self):
        """Assert num_tasks returns 10, because MT10 is being tested."""
        assert self.env.num_tasks == 10
        assert self.env_no_onehot.num_tasks == 10

    def test_observation_space(self):
        """Assert the one-hot task id appears only in 'vanilla' mode."""
        assert self.env.observation_space.shape == (9 + self.env.num_tasks, )
        assert self.env_no_onehot.observation_space.shape == (9, )

    def test_step(self):
        """Test that env_infos includes extra infos and obs has onehot."""
        self.env.reset()
        self.env_no_onehot.reset()
        action0 = self.env.spec.action_space.sample()
        action1 = self.env_no_onehot.spec.action_space.sample()
        obs0, _, _, info0 = self.env.step(action0)
        obs1, _, _, info1 = self.env_no_onehot.step(action1)
        assert info0['task_id'] == self.env.active_task_index
        assert info1['task_id'] == self.env_no_onehot.active_task_index
        assert (self.env.active_task_one_hot == obs0[9:]).all()
        assert obs0.shape[0] == obs1.shape[0] + self.env.num_tasks

    def test_reset(self):
        """Test round robin switching of environments during call to reset."""
        self.env.reset()
        active_task_id = self.env.active_task_index
        for _ in range(self.env.num_tasks):
            self.env.reset()
            action = self.env.spec.action_space.sample()
            _, _, _, info = self.env.step(action)
            assert info['task_id'] != active_task_id
            active_task_id = self.env.active_task_index

    def test_active_task_name(self):
        """Assert the active task's name matches its metaworld name."""
        for _ in range(self.env.num_tasks):
            self.env.reset()
            assert self.env.active_task_name == self.env.env.all_task_names[0]
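# A minimal standalone sketch of what these tests exercise, substituting two
# PointEnv goals for Meta-World so it runs without the metaworld package.
# Import paths and the 4-tuple step return assume the same garage version as
# the tests above; treat this as illustrative, not canonical.
from garage.envs import PointEnv, normalize
from garage.envs.multi_env_wrapper import (MultiEnvWrapper,
                                           round_robin_strategy)

env = MultiEnvWrapper(
    [normalize(PointEnv(goal=(-1., 0.))),
     normalize(PointEnv(goal=(1., 0.)))],
    sample_strategy=round_robin_strategy)

env.reset()
_, _, _, info = env.step(env.action_space.sample())
first = info['task_id']
env.reset()  # round robin: each reset advances to the next task
_, _, _, info = env.step(env.action_space.sample())
assert info['task_id'] == (first + 1) % env.num_tasks
env.close()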