def __init__(self, task_name):
    # Construct the benchmark once and keep a reference to it; constructing
    # MT1 a second time would sample a different set of tasks.
    self.mt1 = metaworld.MT1(task_name)
    Env = self.mt1.train_classes[task_name]
    env = Env()  # Create an environment for `task_name`
    env = RenderEnv(env)
    super().__init__(env)
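# A minimal usage sketch for the constructor above; the enclosing class name
# (MT1RenderWrapper) is hypothetical, since only the __init__ is shown.
# Metaworld environments require a task to be set with set_task() before the
# first reset().
wrapper = MT1RenderWrapper('pick-place-v2')
wrapper.set_task(wrapper.mt1.train_tasks[0])  # assumes gym.Wrapper-style attribute forwarding
obs = wrapper.reset()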
import pickle

import metaworld
import numpy as np


def test_identical_environments():

    def helper(env, env_2):
        for i in range(len(env.train_tasks)):
            rand_vec_1 = pickle.loads(env.train_tasks[i].data)['rand_vec']
            rand_vec_2 = pickle.loads(env_2.train_tasks[i].data)['rand_vec']
            np.testing.assert_equal(rand_vec_1, rand_vec_2)

    def helper_neq(env, env_2):
        for i in range(len(env.train_tasks)):
            rand_vec_1 = pickle.loads(env.train_tasks[i].data)['rand_vec']
            rand_vec_2 = pickle.loads(env_2.train_tasks[i].data)['rand_vec']
            assert not (rand_vec_1 == rand_vec_2).all()

    # Testing MT1
    mt1_1 = metaworld.MT1('sweep-into-v2', seed=10)
    mt1_2 = metaworld.MT1('sweep-into-v2', seed=10)
    helper(mt1_1, mt1_2)

    # Testing ML1
    ml1_1 = metaworld.ML1('sweep-into-v2', seed=10)
    ml1_2 = metaworld.ML1('sweep-into-v2', seed=10)
    helper(ml1_1, ml1_2)

    # Testing MT10
    mt10_1 = metaworld.MT10(seed=10)
    mt10_2 = metaworld.MT10(seed=10)
    helper(mt10_1, mt10_2)

    # Testing ML10
    ml10_1 = metaworld.ML10(seed=10)
    ml10_2 = metaworld.ML10(seed=10)
    helper(ml10_1, ml10_2)

    # Testing ML45
    ml45_1 = metaworld.ML45(seed=10)
    ml45_2 = metaworld.ML45(seed=10)
    helper(ml45_1, ml45_2)

    # Testing MT50
    mt50_1 = metaworld.MT50(seed=10)
    mt50_2 = metaworld.MT50(seed=10)
    helper(mt50_1, mt50_2)

    # Two benchmarks constructed with different seeds should have different goals.
    mt50_3 = metaworld.MT50(seed=50)
    helper_neq(mt50_1, mt50_3)
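# A minimal sketch of inspecting a sampled task's pickled payload, which is
# what the helpers above compare. 'rand_vec' holds the randomized object/goal
# positions that distinguish one sampled task from another.
import pickle

import metaworld

mt1 = metaworld.MT1('sweep-into-v2', seed=10)
data = pickle.loads(mt1.train_tasks[0].data)
print('rand_vec' in data)   # True; the determinism checks key on this field
print(data['rand_vec'])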
def mttrpo_metaworld_mt1_push(ctxt, seed, epochs, batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.

    """
    set_seed(seed)
    n_tasks = 50
    mt1 = metaworld.MT1('push-v1')
    train_task_sampler = MetaWorldTaskSampler(mt1, 'train',
                                              lambda env, _: normalize(env))
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length)

    algo = TRPO(env_spec=env.spec,
                policy=policy,
                value_function=value_function,
                sampler=sampler,
                discount=0.99,
                gae_lambda=0.95)

    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
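# In garage, launcher functions like the one above are normally decorated with
# @wrap_experiment so the framework constructs `ctxt` automatically. A minimal
# invocation sketch; the argument values are illustrative, not prescribed.
from garage import wrap_experiment

run = wrap_experiment(mttrpo_metaworld_mt1_push)
run(seed=1, epochs=500, batch_size=1024)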
def sparse_mlp_mtsac_metaworld_mt1_pick_place(ctxt=None, *, seed, timesteps,
                                              _gpu):
    """Train MTSAC with the MT1 pick-place-v1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        timesteps (int): Number of timesteps to run.
        _gpu (int): The ID of the GPU to be used (used on multi-GPU machines).

    """
    deterministic.set_seed(seed)
    mt1 = metaworld.MT1('pick-place-v1')
    mt1_test = metaworld.MT1('pick-place-v1')
    train_task_sampler = MetaWorldTaskSampler(mt1, 'train',
                                              lambda env, _: normalize(env))
    test_task_sampler = MetaWorldTaskSampler(mt1_test, 'train',
                                             lambda env, _: normalize(env))
    n_tasks = 50
    train_envs = train_task_sampler.sample(n_tasks)
    env = train_envs[0]()
    test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)]

    trainer = Trainer(ctxt)

    policy = TanhGaussianSparseMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        linear_activity_percent_on=(0.1, 0.1, 0.1),
        linear_weight_percent_on=(0.4, 0.4, 0.4),
        mean_nonlinearity=None,
        std_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousSparseMLPQFunction(
        env_spec=env.spec,
        hidden_sizes=(400, 400, 400),
        linear_activity_percent_on=(0.1, 0.1, 0.1),
        linear_weight_percent_on=(0.4, 0.4, 0.4),
    )

    qf2 = ContinuousSparseMLPQFunction(
        env_spec=env.spec,
        hidden_sizes=(400, 400, 400),
        linear_activity_percent_on=(0.1, 0.1, 0.1),
        linear_weight_percent_on=(0.4, 0.4, 0.4),
    )

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    sampler = LocalSampler(agents=policy,
                           envs=train_envs,
                           max_episode_length=env.spec.max_episode_length,
                           n_workers=n_tasks,
                           worker_class=FragmentWorker)

    batch_size = int(env.spec.max_episode_length * n_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles

    mtsac = SparseWeightsMTSAC(policy=policy,
                               qf1=qf1,
                               qf2=qf2,
                               sampler=sampler,
                               gradient_steps_per_itr=150,
                               eval_env=test_envs,
                               env_spec=env.spec,
                               num_tasks=1,
                               steps_per_epoch=epoch_cycles,
                               replay_buffer=replay_buffer,
                               min_buffer_size=1500,
                               target_update_tau=5e-3,
                               discount=0.99,
                               buffer_batch_size=1280)

    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    trainer.setup(algo=mtsac, env=train_envs)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
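# A worked example of the epoch arithmetic above, assuming
# env.spec.max_episode_length == 500 (an assumption about the Metaworld
# horizon, not a value fixed by the launcher) and timesteps = 100_000_000:
batch_size = 500 * 50                # 25_000 env steps gathered per batch
epochs = 100_000_000 // batch_size   # 4000 training iterations in total
epoch_cycles = epochs // 500         # 8 iterations folded into each epoch
epochs = epochs // epoch_cycles      # 500 epochs, i.e. one evaluation each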
def te_ppo_mt1_push(ctxt, seed, n_epochs, batch_size_per_task):
    """Train Task Embedding PPO on the MT1 push environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.

    """
    set_seed(seed)
    n_tasks = 50
    mt1 = metaworld.MT1('push-v1')
    task_sampler = MetaWorldTaskSampler(mt1,
                                        'train',
                                        lambda env, _: normalize(env),
                                        add_env_onehot=False)
    envs = [env_up() for env_up in task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    latent_length = 2
    inference_window = 6
    batch_size = batch_size_per_task * n_tasks
    policy_ent_coeff = 2e-2
    encoder_ent_coeff = 2e-4
    inference_ce_coeff = 5e-2
    embedding_init_std = 0.1
    embedding_max_std = 0.2
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    with TFTrainer(snapshot_config=ctxt) as trainer:
        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)

        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)

        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 10),
            std_share_network=True,
            init_std=2.0,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec,
            features=['observations', 'tasks', 'latents'])

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     inference=inference,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        trainer.setup(algo,
                      env,
                      sampler_cls=LocalSampler,
                      sampler_args=None,
                      worker_class=TaskEmbeddingWorker)
        trainer.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)
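# A worked example of the batch-size bookkeeping above, with an illustrative
# batch_size_per_task of 1024 (not a value prescribed by the launcher):
n_tasks = 50
batch_size_per_task = 1024
batch_size = batch_size_per_task * n_tasks  # 51_200 env steps per epoch,
# spread across the 50 push task variants by round_robin_strategy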
def tasks():
    while True:
        # Construct the benchmark, sampling a fresh set of tasks on each pass.
        mt1 = metaworld.MT1(task_name)
        yield from mt1.train_tasks
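# A minimal usage sketch for the generator above. `task_name` is assumed to be
# defined in the enclosing scope; here it is pinned for illustration.
import metaworld

task_name = 'pick-place-v2'
env = metaworld.MT1(task_name).train_classes[task_name]()
task_iter = tasks()
for _ in range(3):
    env.set_task(next(task_iter))  # set_task must precede reset()
    obs = env.reset()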
def __init__(
    self,
    benchmark_name: str,
    save_memory: bool = False,
    add_observability: bool = False,
) -> None:
    """Init function for environment wrapper."""

    # We import here so that we avoid importing metaworld if possible, since
    # it is dependent on mujoco.
    import metaworld
    from metaworld import Task

    # Set config for each benchmark.
    if benchmark_name.startswith("MT1_"):
        env_name = benchmark_name[4:]
        benchmark = metaworld.MT1(env_name)
        env_dict = {env_name: benchmark.train_classes[env_name]}
        tasks = benchmark.train_tasks
        resample_tasks = False
        self.augment_obs = False

    elif benchmark_name == "MT10":
        benchmark = metaworld.MT10()
        env_dict = benchmark.train_classes
        tasks = benchmark.train_tasks
        resample_tasks = False
        self.augment_obs = True

    elif benchmark_name == "MT50":
        benchmark = metaworld.MT50()
        env_dict = benchmark.train_classes
        tasks = benchmark.train_tasks
        resample_tasks = False
        self.augment_obs = True

    elif benchmark_name.startswith("ML1_train_"):
        env_name = benchmark_name[10:]
        benchmark = metaworld.ML1(env_name)
        env_dict = {env_name: benchmark.train_classes[env_name]}
        tasks = benchmark.train_tasks
        resample_tasks = True
        self.augment_obs = False

    elif benchmark_name == "ML10_train":
        benchmark = metaworld.ML10()
        env_dict = benchmark.train_classes
        tasks = benchmark.train_tasks
        resample_tasks = True
        self.augment_obs = True

    elif benchmark_name == "ML45_train":
        benchmark = metaworld.ML45()
        env_dict = benchmark.train_classes
        tasks = benchmark.train_tasks
        resample_tasks = True
        self.augment_obs = True

    elif benchmark_name.startswith("ML1_test_"):
        env_name = benchmark_name[9:]
        benchmark = metaworld.ML1(env_name)
        env_dict = {env_name: benchmark.test_classes[env_name]}
        tasks = benchmark.test_tasks
        resample_tasks = True
        self.augment_obs = False

    elif benchmark_name == "ML10_test":
        benchmark = metaworld.ML10()
        env_dict = benchmark.test_classes
        tasks = benchmark.test_tasks
        resample_tasks = True
        self.augment_obs = True

    elif benchmark_name == "ML45_test":
        benchmark = metaworld.ML45()
        env_dict = benchmark.test_classes
        tasks = benchmark.test_tasks
        resample_tasks = True
        self.augment_obs = True

    else:
        raise NotImplementedError

    # Construct list of tasks for each environment, adding observability to
    # tasks if necessary.
    env_tasks = {}
    for task in tasks:
        if add_observability:
            task_data = dict(pickle.loads(task.data))
            task_data["partially_observable"] = False
            task = Task(env_name=task.env_name, data=pickle.dumps(task_data))

        if task.env_name in env_tasks:
            if resample_tasks:
                env_tasks[task.env_name].append(task)
        else:
            env_tasks[task.env_name] = [task]

    # Construct list of environment classes or class instances.
    self.save_memory = save_memory
    if self.save_memory:
        self.envs_info = [{
            "env_name": env_name,
            "env_cls": env_cls,
            "tasks": env_tasks[env_name]
        } for (env_name, env_cls) in env_dict.items()]
    else:
        self.envs_info = [{
            "env_name": env_name,
            "env": env_cls(),
            "tasks": env_tasks[env_name]
        } for (env_name, env_cls) in env_dict.items()]
    self.num_tasks = len(self.envs_info)

    # Sample environment.
    self._sample_environment()
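# A minimal usage sketch; the wrapper's class name (MetaWorldWrapper here) is
# an assumption, since the snippet above only shows its constructor.
wrapper = MetaWorldWrapper("MT10", save_memory=True, add_observability=False)
print(wrapper.num_tasks)                 # 10 benchmark environments
print(wrapper.envs_info[0]["env_name"])  # e.g. 'reach-v1' or 'reach-v2', depending on the Metaworld version
for info in wrapper.envs_info:
    assert len(info["tasks"]) >= 1       # every env received sampled tasks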