Example #1
    def __init__(
        self,
        task_name,
    ):
        mt1 = metaworld.MT1(
            task_name)  # Construct the benchmark, sampling tasks
        Env = mt1.train_classes[task_name]
        env = Env()  # Create an environment instance for `task_name`
        env = RenderEnv(env)
        super().__init__(env)

        self.mt1 = metaworld.MT1(
            task_name)  # Construct the benchmark, sampling tasks
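For context, the wrapper above builds on the standard Meta-World MT1 workflow. Below is a minimal sketch of that workflow, assuming only that metaworld is installed; 'pick-place-v1' is an illustrative task name and the 4-tuple gym step API of Meta-World v1 is assumed (RenderEnv is specific to the example above and omitted here).

import random

import metaworld

task_name = 'pick-place-v1'           # illustrative task name
mt1 = metaworld.MT1(task_name)        # Construct the benchmark, sampling tasks
env = mt1.train_classes[task_name]()  # Instantiate the environment class

task = random.choice(mt1.train_tasks)
env.set_task(task)                    # Assign a sampled goal configuration

obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())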
Example #2
import pickle

import metaworld
import numpy as np


def test_identical_environments():
    def helper(env, env_2):
        for i in range(len(env.train_tasks)):
            rand_vec_1 = pickle.loads(env.train_tasks[i].data)['rand_vec']
            rand_vec_2 = pickle.loads(env_2.train_tasks[i].data)['rand_vec']
            np.testing.assert_equal(rand_vec_1, rand_vec_2)

    def helper_neq(env, env_2):
        for i in range(len(env.train_tasks)):
            rand_vec_1 = pickle.loads(env.train_tasks[i].data)['rand_vec']
            rand_vec_2 = pickle.loads(env_2.train_tasks[i].data)['rand_vec']
            assert not (rand_vec_1 == rand_vec_2).all()

    # testing MT1
    mt1_1 = metaworld.MT1('sweep-into-v2', seed=10)
    mt1_2 = metaworld.MT1('sweep-into-v2', seed=10)
    helper(mt1_1, mt1_2)

    # testing ML1
    ml1_1 = metaworld.ML1('sweep-into-v2', seed=10)
    ml1_2 = metaworld.ML1('sweep-into-v2', seed=10)
    helper(ml1_1, ml1_2)

    # testing MT10
    mt10_1 = metaworld.MT10(seed=10)
    mt10_2 = metaworld.MT10(seed=10)
    helper(mt10_1, mt10_2)

    # testing ML10
    ml10_1 = metaworld.ML10(seed=10)
    ml10_2 = metaworld.ML10(seed=10)
    helper(ml10_1, ml10_2)

    # testing ML45
    ml45_1 = metaworld.ML45(seed=10)
    ml45_2 = metaworld.ML45(seed=10)
    helper(ml45_1, ml45_2)

    # testing MT50
    mt50_1 = metaworld.MT50(seed=10)
    mt50_2 = metaworld.MT50(seed=10)
    helper(mt50_1, mt50_2)

    # test that 2 benchmarks with different seeds have different goals
    mt50_3 = metaworld.MT50(seed=50)
    helper_neq(mt50_1, mt50_3)
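The helpers above compare only train tasks. A possible companion check for the meta-test split of the ML benchmarks is sketched below; it assumes seeded benchmarks also fix their test_tasks and that those tasks use the same pickled 'rand_vec' payload as the train tasks, and it would be called alongside helper(), e.g. helper_test_split(ml10_1, ml10_2).

def helper_test_split(bench_1, bench_2):
    # Same comparison as helper() above, applied to the meta-test tasks that
    # ML1/ML10/ML45 expose as `test_tasks` (assumed to use the same pickled
    # 'rand_vec' layout as the train tasks).
    for t1, t2 in zip(bench_1.test_tasks, bench_2.test_tasks):
        rand_vec_1 = pickle.loads(t1.data)['rand_vec']
        rand_vec_2 = pickle.loads(t2.data)['rand_vec']
        np.testing.assert_equal(rand_vec_1, rand_vec_2)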
Example #3
def mttrpo_metaworld_mt1_push(ctxt, seed, epochs, batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.

    """
    set_seed(seed)
    n_tasks = 50
    mt1 = metaworld.MT1('push-v1')
    train_task_sampler = MetaWorldTaskSampler(mt1, 'train',
                                              lambda env, _: normalize(env))
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length)

    algo = TRPO(env_spec=env.spec,
                policy=policy,
                value_function=value_function,
                sampler=sampler,
                discount=0.99,
                gae_lambda=0.95)

    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
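For reference, a sketch of how such a garage launcher is typically invoked: wrap_experiment injects the ctxt argument, and the seed/epoch/batch values here are illustrative placeholders, not taken from the example.

from garage import wrap_experiment


@wrap_experiment
def run_mttrpo_mt1_push(ctxt=None):
    # Delegate to the launcher defined above; hyperparameters are placeholders.
    mttrpo_metaworld_mt1_push(ctxt, seed=1, epochs=500, batch_size=1024)


run_mttrpo_mt1_push()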
Example #4
def sparse_mlp_mtsac_metaworld_mt1_pick_place(ctxt=None, *, seed, timesteps, _gpu):
    """Train MTSAC with the MT1 pick-place-v1 environment.
    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).
        timesteps (int): Number of timesteps to run.
    """
    deterministic.set_seed(seed)
    # MT1 has no separate test split, so a second benchmark instance supplies
    # the evaluation tasks.
    mt1 = metaworld.MT1('pick-place-v1')
    mt1_test = metaworld.MT1('pick-place-v1')
    train_task_sampler = MetaWorldTaskSampler(mt1, 'train',
                                              lambda env, _: normalize(env))
    test_task_sampler = MetaWorldTaskSampler(mt1_test, 'train',
                                             lambda env, _: normalize(env))
    n_tasks = 50
    train_envs = train_task_sampler.sample(n_tasks)
    env = train_envs[0]()
    test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)]

    trainer = Trainer(ctxt)

    policy = TanhGaussianSparseMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[400, 400, 400],
        linear_activity_percent_on=(0.1, 0.1, 0.1),
        linear_weight_percent_on=(0.4, 0.4, 0.4),
        mean_nonlinearity=None,
        std_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousSparseMLPQFunction(
        env_spec=env.spec,
        hidden_sizes=(400, 400, 400),
        linear_activity_percent_on=(0.1, 0.1, 0.1),
        linear_weight_percent_on=(0.4, 0.4, 0.4),
    )

    qf2 = ContinuousSparseMLPQFunction(
        env_spec=env.spec,
        hidden_sizes=(400, 400, 400),
        linear_activity_percent_on=(0.1, 0.1, 0.1),
        linear_weight_percent_on=(0.4, 0.4, 0.4),
    )

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    sampler = LocalSampler(agents=policy,
                           envs=train_envs,
                           max_episode_length=env.spec.max_episode_length,
                           n_workers=n_tasks,
                           worker_class=FragmentWorker)

    batch_size = int(env.spec.max_episode_length * n_tasks)
    num_evaluation_points = 500
    # Split the timestep budget so that epochs * epoch_cycles * batch_size is
    # approximately `timesteps`, yielding roughly num_evaluation_points
    # evaluations over training (see the worked example after this snippet).
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = SparseWeightsMTSAC(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        sampler=sampler,
        gradient_steps_per_itr=150,
        eval_env=test_envs,
        env_spec=env.spec,
        num_tasks=1,
        steps_per_epoch=epoch_cycles,
        replay_buffer=replay_buffer,
        min_buffer_size=1500,
        target_update_tau=5e-3,
        discount=0.99,
        buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    trainer.setup(algo=mtsac, env=train_envs)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
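To make the epoch arithmetic above concrete, here is a worked example with illustrative numbers; the episode length and timestep budget are assumptions, not values from the snippet.

n_tasks = 50
max_episode_length = 150                         # assumed per-task episode length
timesteps = 15_000_000                           # assumed total training budget
num_evaluation_points = 500

batch_size = max_episode_length * n_tasks        # 7_500 env steps per batch
epochs = timesteps // batch_size                 # 2_000
epoch_cycles = epochs // num_evaluation_points   # 4 cycles per epoch (steps_per_epoch)
epochs = epochs // epoch_cycles                  # 500 trainer epochs

# Roughly one evaluation per trainer epoch, and the product recovers the
# original timestep budget.
assert epochs * epoch_cycles * batch_size == timesteps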
Example #5
def te_ppo_mt1_push(ctxt, seed, n_epochs, batch_size_per_task):
    """Train Task Embedding PPO with PointEnv.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.

    """
    set_seed(seed)
    n_tasks = 50
    mt1 = metaworld.MT1('push-v1')
    task_sampler = MetaWorldTaskSampler(mt1,
                                        'train',
                                        lambda env, _: normalize(env),
                                        add_env_onehot=False)
    envs = [env_up() for env_up in task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    latent_length = 2
    inference_window = 6
    batch_size = batch_size_per_task * n_tasks
    policy_ent_coeff = 2e-2
    encoder_ent_coeff = 2e-4
    inference_ce_coeff = 5e-2
    embedding_init_std = 0.1
    embedding_max_std = 0.2
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    with TFTrainer(snapshot_config=ctxt) as trainer:

        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)

        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)

        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 10),
            std_share_network=True,
            init_std=2.0,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec, features=['observations', 'tasks', 'latents'])

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     inference=inference,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        trainer.setup(algo,
                      env,
                      sampler_cls=LocalSampler,
                      sampler_args=None,
                      worker_class=TaskEmbeddingWorker)
        trainer.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)
Example #6
def tasks():
    while True:
        # `task_name` comes from the enclosing scope; each MT1 construction
        # resamples the benchmark's train tasks.
        mt1 = metaworld.MT1(
            task_name)  # Construct the benchmark, sampling tasks
        yield from mt1.train_tasks
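A sketch of how such a generator might be consumed, assuming `task_name` is defined in the enclosing scope (here module level, with an illustrative value) so that tasks() can see it.

import metaworld

task_name = 'pick-place-v1'    # illustrative; in the original it comes from the enclosing scope
env = metaworld.MT1(task_name).train_classes[task_name]()

task_iter = tasks()            # the generator defined above
env.set_task(next(task_iter))  # cycle to the next sampled task
obs = env.reset()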
Example #7
    def __init__(
        self,
        benchmark_name: str,
        save_memory: bool = False,
        add_observability: bool = False,
    ) -> None:
        """ Init function for environment wrapper. """

        # We import here so that we avoid importing metaworld if possible, since it is
        # dependent on mujoco.
        import metaworld
        from metaworld import Task

        # Set config for each benchmark.
        if benchmark_name.startswith("MT1_"):
            env_name = benchmark_name[4:]
            benchmark = metaworld.MT1(env_name)
            env_dict = {env_name: benchmark.train_classes[env_name]}
            tasks = benchmark.train_tasks
            resample_tasks = False
            self.augment_obs = False

        elif benchmark_name == "MT10":
            benchmark = metaworld.MT10()
            env_dict = benchmark.train_classes
            tasks = benchmark.train_tasks
            resample_tasks = False
            self.augment_obs = True

        elif benchmark_name == "MT50":
            benchmark = metaworld.MT50()
            env_dict = benchmark.train_classes
            tasks = benchmark.train_tasks
            resample_tasks = False
            self.augment_obs = True

        elif benchmark_name.startswith("ML1_train_"):
            env_name = benchmark_name[10:]
            benchmark = metaworld.ML1(env_name)
            env_dict = {env_name: benchmark.train_classes[env_name]}
            tasks = benchmark.train_tasks
            resample_tasks = True
            self.augment_obs = False

        elif benchmark_name == "ML10_train":
            benchmark = metaworld.ML10()
            env_dict = benchmark.train_classes
            tasks = benchmark.train_tasks
            resample_tasks = True
            self.augment_obs = True

        elif benchmark_name == "ML45_train":
            benchmark = metaworld.ML45()
            env_dict = benchmark.train_classes
            tasks = benchmark.train_tasks
            resample_tasks = True
            self.augment_obs = True

        elif benchmark_name.startswith("ML1_test_"):
            env_name = benchmark_name[9:]
            benchmark = metaworld.ML1(env_name)
            env_dict = {env_name: benchmark.test_classes[env_name]}
            tasks = benchmark.test_tasks
            resample_tasks = True
            self.augment_obs = False

        elif benchmark_name == "ML10_test":
            benchmark = metaworld.ML10()
            env_dict = benchmark.test_classes
            tasks = benchmark.test_tasks
            resample_tasks = True
            self.augment_obs = True

        elif benchmark_name == "ML45_test":
            benchmark = metaworld.ML45()
            env_dict = benchmark.test_classes
            tasks = benchmark.test_tasks
            resample_tasks = True
            self.augment_obs = True

        else:
            raise NotImplementedError

        # Group tasks by environment, making tasks fully observable if
        # requested.
        env_tasks = {}
        for task in tasks:
            if add_observability:
                task_data = dict(pickle.loads(task.data))
                task_data["partially_observable"] = False
                task = Task(env_name=task.env_name,
                            data=pickle.dumps(task_data))

            if task.env_name in env_tasks:
                if resample_tasks:
                    env_tasks[task.env_name].append(task)
            else:
                env_tasks[task.env_name] = [task]

        # Construct list of environment classes or class instances.
        self.save_memory = save_memory
        if self.save_memory:
            self.envs_info = [{
                "env_name": env_name,
                "env_cls": env_cls,
                "tasks": env_tasks[env_name]
            } for (env_name, env_cls) in env_dict.items()]
        else:
            self.envs_info = [{
                "env_name": env_name,
                "env": env_cls(),
                "tasks": env_tasks[env_name]
            } for (env_name, env_cls) in env_dict.items()]

        self.num_tasks = len(self.envs_info)

        # Sample environment.
        self._sample_environment()
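A hypothetical instantiation of the wrapper whose __init__ is shown above; the class name MetaWorldWrapper is an assumption, since only the constructor appears in the excerpt. The recognised benchmark names are 'MT1_<env>', 'MT10', 'MT50', 'ML1_train_<env>', 'ML10_train', 'ML45_train', 'ML1_test_<env>', 'ML10_test', and 'ML45_test'.

# Hypothetical class name and usage; the excerpt above only shows __init__.
wrapper = MetaWorldWrapper(
    benchmark_name="MT10",    # one of the names handled in __init__
    save_memory=True,         # store env classes and instantiate lazily
    add_observability=False,  # leave tasks partially observable
)
print(wrapper.num_tasks)      # 10 environments in the MT10 benchmark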