def test_identical_environments():

    def helper(env, env_2):
        for i in range(len(env.train_tasks)):
            rand_vec_1 = pickle.loads(env.train_tasks[i].data)['rand_vec']
            rand_vec_2 = pickle.loads(env_2.train_tasks[i].data)['rand_vec']
            np.testing.assert_equal(rand_vec_1, rand_vec_2)

    def helper_neq(env, env_2):
        for i in range(len(env.train_tasks)):
            rand_vec_1 = pickle.loads(env.train_tasks[i].data)['rand_vec']
            rand_vec_2 = pickle.loads(env_2.train_tasks[i].data)['rand_vec']
            assert not (rand_vec_1 == rand_vec_2).all()

    # testing MT1
    mt1_1 = metaworld.MT1('sweep-into-v2', seed=10)
    mt1_2 = metaworld.MT1('sweep-into-v2', seed=10)
    helper(mt1_1, mt1_2)

    # testing ML1
    ml1_1 = metaworld.ML1('sweep-into-v2', seed=10)
    ml1_2 = metaworld.ML1('sweep-into-v2', seed=10)
    helper(ml1_1, ml1_2)

    # testing MT10
    mt10_1 = metaworld.MT10(seed=10)
    mt10_2 = metaworld.MT10(seed=10)
    helper(mt10_1, mt10_2)

    # testing ML10
    ml10_1 = metaworld.ML10(seed=10)
    ml10_2 = metaworld.ML10(seed=10)
    helper(ml10_1, ml10_2)

    # testing ML45
    ml45_1 = metaworld.ML45(seed=10)
    ml45_2 = metaworld.ML45(seed=10)
    helper(ml45_1, ml45_2)

    # testing MT50
    mt50_1 = metaworld.MT50(seed=10)
    mt50_2 = metaworld.MT50(seed=10)
    helper(mt50_1, mt50_2)

    # test that 2 benchmarks with different seeds have different goals
    mt50_3 = metaworld.MT50(seed=50)
    helper_neq(mt50_1, mt50_3)
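# Standalone sketch of the reproducibility property checked above: two benchmarks
# constructed with the same seed store identical 'rand_vec' goal vectors in their
# pickled task data. This uses only the metaworld MT1 API already used in the test;
# the helper name and default arguments below are illustrative, not from the source.
import pickle

import numpy as np

import metaworld


def _check_seeded_goals_match(env_name='sweep-into-v2', seed=10):
    mt1_a = metaworld.MT1(env_name, seed=seed)
    mt1_b = metaworld.MT1(env_name, seed=seed)
    vec_a = pickle.loads(mt1_a.train_tasks[0].data)['rand_vec']
    vec_b = pickle.loads(mt1_b.train_tasks[0].data)['rand_vec']
    np.testing.assert_equal(vec_a, vec_b)  # same seed -> same goal vector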
def test_metaworld_support(pass_env_id_instead_of_env_instance: bool):
    """ Test using metaworld environments as the dataset of a Setting.

    NOTE: Uses either a MetaWorldEnv instance as the `dataset`, or the env id.
    """
    import metaworld
    from metaworld import MetaWorldEnv

    benchmark = metaworld.ML10()  # Construct the benchmark, sampling tasks

    env_name = "reach-v1"
    env_type: Type[MetaWorldEnv] = benchmark.train_classes[env_name]
    env = env_type()

    training_tasks = [
        task for task in benchmark.train_tasks if task.env_name == env_name
    ]
    setting = TaskIncrementalRLSetting(
        dataset=env_name if pass_env_id_instead_of_env_instance else env,
        train_task_schedule={
            i: operator.methodcaller("set_task", task)
            for i, task in enumerate(training_tasks)
        },
        steps_per_task=1000,
        transforms=[],
    )
    assert setting.nb_tasks == 50
    assert setting.steps_per_task == 1000
    assert sorted(setting.train_task_schedule.keys()) == list(
        range(0, 50_000, 1000))

    # TODO: Clear the transforms by default, and add it back if needed?
    assert setting.train_transforms == []
    assert setting.val_transforms == []
    assert setting.test_transforms == []

    assert setting.observation_space.x == env.observation_space

    # Only test out the first 3 tasks for now.
    # TODO: Also try out the valid and test environments.
    for task_id in range(3):
        setting.current_task_id = task_id

        train_env = setting.train_dataloader()
        assert train_env.observation_space.x == env.observation_space
        assert train_env.observation_space.task_labels == spaces.Discrete(
            setting.nb_tasks)

        n_episodes = 1
        for episode in range(n_episodes):
            obs = train_env.reset()
            done = False
            steps = 0
            while not done and steps < env.max_path_length:
                obs, reward, done, info = train_env.step(
                    train_env.action_space.sample())
                # train_env.render()
                steps += 1
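# The boolean argument above is presumably injected by pytest (e.g. through a
# parametrize decorator on the test); a hypothetical way to drive both cases:
@pytest.mark.parametrize("pass_env_id_instead_of_env_instance", [False, True])
def test_metaworld_support_both_ways(pass_env_id_instead_of_env_instance: bool):
    test_metaworld_support(pass_env_id_instead_of_env_instance)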
def maml_trpo_metaworld_ml10(ctxt, seed, epochs, episodes_per_task,
                             meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        episodes_per_task (int): Number of episodes per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)

    ml10 = metaworld.ML10()
    tasks = MetaWorldTaskSampler(ml10, 'train')
    env = tasks.sample(10)[0]()
    test_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                  env=MetaWorldSetTaskEnv(ml10, 'test'))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler)

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length,
                         n_workers=meta_batch_size)

    trainer = Trainer(ctxt)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    sampler=sampler,
                    task_sampler=tasks,
                    value_function=value_function,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs,
                  batch_size=episodes_per_task * env.spec.max_episode_length)
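# Hedged launch sketch: garage's bundled examples typically expose a function like
# the one above through click options plus garage's `wrap_experiment`, which supplies
# the ExperimentContext `ctxt`. The wrapper name and the default values below are
# assumptions for illustration, not taken from the original script.
import click
from garage import wrap_experiment


@click.command()
@click.option('--seed', default=1)
@click.option('--epochs', default=300)
@click.option('--episodes_per_task', default=10)
@click.option('--meta_batch_size', default=20)
@wrap_experiment(snapshot_mode='all')
def launch_maml_trpo_metaworld_ml10(ctxt, seed, epochs, episodes_per_task,
                                    meta_batch_size):
    maml_trpo_metaworld_ml10(ctxt, seed, epochs, episodes_per_task,
                             meta_batch_size)


if __name__ == '__main__':
    launch_maml_trpo_metaworld_ml10()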
def test_metaworld_auto_task_schedule(pass_env_id_instead_of_env_instance: bool):
    """ Test that when passing just an env id from metaworld and a number of
    tasks, the task schedule is created automatically.
    """
    import metaworld
    from metaworld import MetaWorldEnv

    benchmark = metaworld.ML10()  # Construct the benchmark, sampling tasks

    env_name = "reach-v2"
    env_type: Type[MetaWorldEnv] = benchmark.train_classes[env_name]
    env = env_type()

    # TODO: When not passing a nb_tasks, the number of available tasks for that env
    # is used.
    # setting = TaskIncrementalRLSetting(
    #     dataset=env_name if pass_env_id_instead_of_env_instance else env,
    #     steps_per_task=1000,
    # )
    # assert setting.nb_tasks == 50
    # assert setting.steps_per_task == 1000
    # assert sorted(setting.train_task_schedule.keys()) == list(range(0, 50_000, 1000))

    # Test passing a number of tasks:
    with pytest.warns(RuntimeWarning):
        setting = TaskIncrementalRLSetting(
            dataset=env_name if pass_env_id_instead_of_env_instance else env,
            train_max_steps=2000,
            nb_tasks=2,
            test_max_steps=2000,
            transforms=[],
        )
    assert setting.nb_tasks == 2
    assert setting.steps_per_task == 1000
    assert sorted(setting.train_task_schedule.keys()) == list(
        range(0, 2000, 1000))

    from sequoia.common.metrics.rl_metrics import EpisodeMetrics

    method = DummyMethod()
    with pytest.warns(RuntimeWarning):
        results: IncrementalRLSetting.Results[EpisodeMetrics] = setting.apply(
            method)
def test_set_task_task_sampler_ml10():
    # Import and construct environments here to avoid using up too many
    # resources if this test isn't run.
    # pylint: disable=import-outside-toplevel
    import metaworld

    ml10 = metaworld.ML10()
    tasks = task_sampler.MetaWorldTaskSampler(ml10, 'test')
    assert tasks.n_tasks == 5 * 50
    with pytest.raises(ValueError):
        tasks.sample(1)
    updates = tasks.sample(10)
    envs = [update() for update in updates]
    for env in envs:
        env.reset()
    action = envs[0].action_space.sample()
    rewards = [env.step(action).reward for env in envs]
    assert np.var(rewards) > 0
    env = envs[0]
    env.close = unittest.mock.MagicMock(name='env.close')
    updates[1](env)
    env.close.assert_not_called()
    updates[2](env)
    env.close.assert_called()
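# Companion sketch for the ML10 *train* split, using the same garage `task_sampler`
# API as the test above; the 10 x 50 task count is an assumption based on the ML10
# benchmark layout (10 training classes, 50 tasks each), and the helper name is
# illustrative only.
def check_ml10_train_task_sampler():
    import metaworld

    ml10 = metaworld.ML10()
    train_tasks = task_sampler.MetaWorldTaskSampler(ml10, 'train')
    assert train_tasks.n_tasks == 10 * 50
    # Sample one environment update per training class and construct the envs.
    envs = [update() for update in train_tasks.sample(10)]
    for env in envs:
        env.close()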
def pearl_metaworld_ml10(ctxt=None,
                         seed=1,
                         num_epochs=1000,
                         num_train_tasks=10,
                         latent_size=7,
                         encoder_hidden_size=200,
                         net_size=300,
                         meta_batch_size=16,
                         num_steps_per_epoch=4000,
                         num_initial_steps=4000,
                         num_tasks_sample=15,
                         num_steps_prior=750,
                         num_extra_rl_steps_posterior=750,
                         batch_size=256,
                         embedding_batch_size=64,
                         embedding_mini_batch_size=64,
                         reward_scale=10.,
                         use_gpu=False):
    """Train PEARL with ML10 environments.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        num_epochs (int): Number of training epochs.
        num_train_tasks (int): Number of tasks for training.
        latent_size (int): Size of latent context vector.
        encoder_hidden_size (int): Output dimension of dense layer of the
            context encoder.
        net_size (int): Output dimension of a dense layer of Q-function and
            value function.
        meta_batch_size (int): Meta batch size.
        num_steps_per_epoch (int): Number of iterations per epoch.
        num_initial_steps (int): Number of transitions obtained per task
            before training.
        num_tasks_sample (int): Number of random tasks to obtain data for
            each iteration.
        num_steps_prior (int): Number of transitions to obtain per task with
            z ~ prior.
        num_extra_rl_steps_posterior (int): Number of additional transitions
            to obtain per task with z ~ posterior that are only used to train
            the policy and NOT the encoder.
        batch_size (int): Number of transitions in RL batch.
        embedding_batch_size (int): Number of transitions in context batch.
        embedding_mini_batch_size (int): Number of transitions in mini context
            batch; should be same as embedding_batch_size for non-recurrent
            encoder.
        reward_scale (int): Reward scale.
        use_gpu (bool): Whether or not to use GPU for training.

    """
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)
    ml10 = metaworld.ML10()
    train_env = MetaWorldSetTaskEnv(ml10, 'train')
    env_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                 env=train_env,
                                 wrapper=lambda env, _: normalize(env))
    env = env_sampler.sample(num_train_tasks)
    test_env = MetaWorldSetTaskEnv(ml10, 'test')
    test_env_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                      env=test_env,
                                      wrapper=lambda env, _: normalize(env))

    trainer = Trainer(ctxt)

    # instantiate networks
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        reward_scale=reward_scale,
    )

    set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    trainer.setup(algo=pearl,
                  env=env[0](),
                  sampler_cls=LocalSampler,
                  n_workers=1,
                  worker_class=PEARLWorker)

    trainer.train(n_epochs=num_epochs, batch_size=batch_size)
import metaworld
import random

bench_name = 'ML1'

if bench_name == 'ML1':
    print(metaworld.ML1.ENV_NAMES)
    bench = metaworld.ML1('pick-place-v1')
elif bench_name == 'ML10':
    bench = metaworld.ML10()

training_envs = []
for name, env_cls in bench.train_classes.items():
    env = env_cls()
    task = random.choice(
        [task for task in bench.train_tasks if task.env_name == name])
    env.set_task(task)
    training_envs.append(env)

print(training_envs)

for env in training_envs:
    obs = env.reset()
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
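# The held-out split of the meta-learning benchmarks is exposed the same way:
# both ML1 and ML10 provide `test_classes` and `test_tasks` alongside the train
# split used above. This sketch simply continues the script above with no new
# imports; variable names here are illustrative.
testing_envs = []
for name, env_cls in bench.test_classes.items():
    env = env_cls()
    task = random.choice(
        [task for task in bench.test_tasks if task.env_name == name])
    env.set_task(task)
    testing_envs.append(env)

for env in testing_envs:
    obs = env.reset()
    obs, reward, done, info = env.step(env.action_space.sample())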
def rl2_ppo_metaworld_ml10(ctxt, seed, meta_batch_size, n_epochs,
                           episode_per_task):
    """Train RL2 PPO with ML10 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        ml10 = metaworld.ML10()
        tasks = MetaWorldTaskSampler(ml10, 'train', lambda env, _: RL2Env(env))
        test_task_sampler = SetTaskSampler(
            MetaWorldSetTaskEnv,
            env=MetaWorldSetTaskEnv(ml10, 'test'),
            wrapper=lambda env, _: RL2Env(env))
        meta_evaluator = MetaEvaluator(test_task_sampler=test_task_sampler)

        env_updates = tasks.sample(10)
        env = env_updates[0]()

        env_spec = env.spec

        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2PPO(meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(batch_size=32,
                                          max_optimization_epochs=10),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      meta_evaluator=meta_evaluator,
                      episodes_per_trial=episode_per_task)

        trainer.setup(algo,
                      tasks.sample(meta_batch_size),
                      sampler_cls=LocalSampler,
                      n_workers=meta_batch_size,
                      worker_class=RL2Worker,
                      worker_args=dict(n_episodes_per_trial=episode_per_task))

        trainer.train(n_epochs=n_epochs,
                      batch_size=episode_per_task *
                      env_spec.max_episode_length * meta_batch_size)
except ImportError:
    # Create a 'dummy' class so we can safely use MTEnv in the type hints below.
    # Additionally, isinstance(some_env, MTEnv) will always fail when mtenv isn't
    # installed, which is good.
    class MTEnv(gym.Env):
        pass


metaworld_installed = False
metaworld_envs = []
try:
    import metaworld
    from metaworld import MetaWorldEnv
    from metaworld.envs.mujoco.mujoco_env import MujocoEnv

    metaworld_envs = list(metaworld.ML10().train_classes.keys())
    metaworld_installed = True
except ImportError:
    # Create a 'dummy' class so we can safely use MetaWorldEnv in the type hints
    # below. Additionally, isinstance(some_env, MetaWorldEnv) will always fail
    # when metaworld isn't installed, which is good.
    class MetaWorldEnv(gym.Env):
        pass

    class MujocoEnv(gym.Env):
        pass


@dataclass
class IncrementalRLSetting(ContinualRLSetting):
    """ Continual RL setting where the data is divided into 'tasks' with clear
    boundaries.
    """
def __init__(
    self,
    benchmark_name: str,
    save_memory: bool = False,
    add_observability: bool = False,
) -> None:
    """ Init function for environment wrapper. """

    # We import here so that we avoid importing metaworld if possible, since it is
    # dependent on mujoco.
    import metaworld
    from metaworld import Task

    # Set config for each benchmark.
    if benchmark_name.startswith("MT1_"):
        env_name = benchmark_name[4:]
        benchmark = metaworld.MT1(env_name)
        env_dict = {env_name: benchmark.train_classes[env_name]}
        tasks = benchmark.train_tasks
        resample_tasks = False
        self.augment_obs = False
    elif benchmark_name == "MT10":
        benchmark = metaworld.MT10()
        env_dict = benchmark.train_classes
        tasks = benchmark.train_tasks
        resample_tasks = False
        self.augment_obs = True
    elif benchmark_name == "MT50":
        benchmark = metaworld.MT50()
        env_dict = benchmark.train_classes
        tasks = benchmark.train_tasks
        resample_tasks = False
        self.augment_obs = True
    elif benchmark_name.startswith("ML1_train_"):
        env_name = benchmark_name[10:]
        benchmark = metaworld.ML1(env_name)
        env_dict = {env_name: benchmark.train_classes[env_name]}
        tasks = benchmark.train_tasks
        resample_tasks = True
        self.augment_obs = False
    elif benchmark_name == "ML10_train":
        benchmark = metaworld.ML10()
        env_dict = benchmark.train_classes
        tasks = benchmark.train_tasks
        resample_tasks = True
        self.augment_obs = True
    elif benchmark_name == "ML45_train":
        benchmark = metaworld.ML45()
        env_dict = benchmark.train_classes
        tasks = benchmark.train_tasks
        resample_tasks = True
        self.augment_obs = True
    elif benchmark_name.startswith("ML1_test_"):
        env_name = benchmark_name[9:]
        benchmark = metaworld.ML1(env_name)
        env_dict = {env_name: benchmark.test_classes[env_name]}
        tasks = benchmark.test_tasks
        resample_tasks = True
        self.augment_obs = False
    elif benchmark_name == "ML10_test":
        benchmark = metaworld.ML10()
        env_dict = benchmark.test_classes
        tasks = benchmark.test_tasks
        resample_tasks = True
        self.augment_obs = True
    elif benchmark_name == "ML45_test":
        benchmark = metaworld.ML45()
        env_dict = benchmark.test_classes
        tasks = benchmark.test_tasks
        resample_tasks = True
        self.augment_obs = True
    else:
        raise NotImplementedError

    # Construct list of tasks for each environment, adding observability to tasks
    # if necessary.
    env_tasks = {}
    for task in tasks:
        if add_observability:
            task_data = dict(pickle.loads(task.data))
            task_data["partially_observable"] = False
            task = Task(env_name=task.env_name, data=pickle.dumps(task_data))

        if task.env_name in env_tasks:
            if resample_tasks:
                env_tasks[task.env_name].append(task)
        else:
            env_tasks[task.env_name] = [task]

    # Construct list of environment classes or class instances.
    self.save_memory = save_memory
    if self.save_memory:
        self.envs_info = [{
            "env_name": env_name,
            "env_cls": env_cls,
            "tasks": env_tasks[env_name]
        } for (env_name, env_cls) in env_dict.items()]
    else:
        self.envs_info = [{
            "env_name": env_name,
            "env": env_cls(),
            "tasks": env_tasks[env_name]
        } for (env_name, env_cls) in env_dict.items()]

    self.num_tasks = len(self.envs_info)

    # Sample environment.
    self._sample_environment()