Exemplo n.º 1
0
def test_identical_environments():
    def helper(env, env_2):
        for i in range(len(env.train_tasks)):
            rand_vec_1 = pickle.loads(env.train_tasks[i].data)['rand_vec']
            rand_vec_2 = pickle.loads(env_2.train_tasks[i].data)['rand_vec']
            np.testing.assert_equal(rand_vec_1, rand_vec_2)

    def helper_neq(env, env_2):
        for i in range(len(env.train_tasks)):
            rand_vec_1 = pickle.loads(env.train_tasks[i].data)['rand_vec']
            rand_vec_2 = pickle.loads(env_2.train_tasks[i].data)['rand_vec']
            assert not (rand_vec_1 == rand_vec_2).all()

    #testing MT1
    mt1_1 = metaworld.MT1('sweep-into-v2', seed=10)
    mt1_2 = metaworld.MT1('sweep-into-v2', seed=10)
    helper(mt1_1, mt1_2)

    #testing ML1
    ml1_1 = metaworld.ML1('sweep-into-v2', seed=10)
    ml1_2 = metaworld.ML1('sweep-into-v2', seed=10)
    helper(ml1_1, ml1_2)

    #testing MT10
    mt10_1 = metaworld.MT10(seed=10)
    mt10_2 = metaworld.MT10(seed=10)
    helper(mt10_1, mt10_2)

    # testing ML10
    ml10_1 = metaworld.ML10(seed=10)
    ml10_2 = metaworld.ML10(seed=10)
    helper(ml10_1, ml10_2)

    #testing ML45
    ml45_1 = metaworld.ML45(seed=10)
    ml45_2 = metaworld.ML45(seed=10)
    helper(ml45_1, ml45_2)

    #testing MT50
    mt50_1 = metaworld.MT50(seed=10)
    mt50_2 = metaworld.MT50(seed=10)
    helper(mt50_1, mt50_2)

    # test that 2 benchmarks with different seeds have different goals
    mt50_3 = metaworld.MT50(seed=50)
    helper_neq(mt50_1, mt50_3)
Exemplo n.º 2
0
def test_metaworld_support(pass_env_id_instead_of_env_instance: bool):
    """ Test using metaworld environments as the dataset of a Setting.

    NOTE: Uses either a MetaWorldEnv instance as the `dataset`, or the env id.
    """
    import metaworld
    from metaworld import MetaWorldEnv

    benchmark = metaworld.ML10()  # Construct the benchmark, sampling tasks

    env_name = "reach-v1"
    env_type: Type[MetaWorldEnv] = benchmark.train_classes[env_name]
    env = env_type()

    training_tasks = [
        task for task in benchmark.train_tasks if task.env_name == env_name
    ]
    setting = TaskIncrementalRLSetting(
        dataset=env_name if pass_env_id_instead_of_env_instance else env,
        train_task_schedule={
            i: operator.methodcaller("set_task", task)
            for i, task in enumerate(training_tasks)
        },
        steps_per_task=1000,
        transforms=[],
    )
    assert setting.nb_tasks == 50
    assert setting.steps_per_task == 1000
    assert sorted(setting.train_task_schedule.keys()) == list(
        range(0, 50_000, 1000))

    # TODO: Clear the transforms by default, and add it back if needed?
    assert setting.train_transforms == []
    assert setting.val_transforms == []
    assert setting.test_transforms == []

    assert setting.observation_space.x == env.observation_space

    # Only test out the first 3 tasks for now.
    # TODO: Also try out the valid and test environments.
    for task_id in range(3):
        setting.current_task_id = task_id

        train_env = setting.train_dataloader()
        assert train_env.observation_space.x == env.observation_space
        assert train_env.observation_space.task_labels == spaces.Discrete(
            setting.nb_tasks)

        n_episodes = 1
        for episode in range(n_episodes):
            obs = train_env.reset()
            done = False
            steps = 0
            while not done and steps < env.max_path_length:
                obs, reward, done, info = train_env.step(
                    train_env.action_space.sample())
                # train_env.render()
                steps += 1
Exemplo n.º 3
0
def maml_trpo_metaworld_ml10(ctxt, seed, epochs, episodes_per_task,
                             meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer: to create the :class:`~Snapshotter:.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        episodes_per_task (int): Number of episodes per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    ml10 = metaworld.ML10()
    tasks = MetaWorldTaskSampler(ml10, 'train')
    env = tasks.sample(10)[0]()
    test_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                  env=MetaWorldSetTaskEnv(ml10, 'test'))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler)

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length,
                         n_workers=meta_batch_size)

    trainer = Trainer(ctxt)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    sampler=sampler,
                    task_sampler=tasks,
                    value_function=value_function,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs,
                  batch_size=episodes_per_task * env.spec.max_episode_length)
Exemplo n.º 4
0
def test_metaworld_auto_task_schedule(
        pass_env_id_instead_of_env_instance: bool):
    """ Test that when passing just an env id from metaworld and a number of tasks,
    the task schedule is created automatically.
    """
    import metaworld
    from metaworld import MetaWorldEnv

    benchmark = metaworld.ML10()  # Construct the benchmark, sampling tasks

    env_name = "reach-v2"
    env_type: Type[MetaWorldEnv] = benchmark.train_classes[env_name]
    env = env_type()

    # TODO: When not passing a nb_tasks, the number of available tasks for that env
    # is used.
    # setting = TaskIncrementalRLSetting(
    #     dataset=env_name if pass_env_id_instead_of_env_instance else env,
    #     steps_per_task=1000,
    # )
    # assert setting.nb_tasks == 50
    # assert setting.steps_per_task == 1000
    # assert sorted(setting.train_task_schedule.keys()) == list(range(0, 50_000, 1000))

    # Test passing a number of tasks:

    with pytest.warns(RuntimeWarning):
        setting = TaskIncrementalRLSetting(
            dataset=env_name if pass_env_id_instead_of_env_instance else env,
            train_max_steps=2000,
            nb_tasks=2,
            test_max_steps=2000,
            transforms=[],
        )
    assert setting.nb_tasks == 2
    assert setting.steps_per_task == 1000
    assert sorted(setting.train_task_schedule.keys()) == list(
        range(0, 2000, 1000))
    from sequoia.common.metrics.rl_metrics import EpisodeMetrics

    method = DummyMethod()
    with pytest.warns(RuntimeWarning):
        results: IncrementalRLSetting.Results[EpisodeMetrics] = setting.apply(
            method)
Exemplo n.º 5
0
def test_set_task_task_sampler_ml10():
    # Import, construct environments here to avoid using up too much
    # resources if this test isn't run.
    # pylint: disable=import-outside-toplevel
    import metaworld
    ml10 = metaworld.ML10()
    tasks = task_sampler.MetaWorldTaskSampler(ml10, 'test')
    assert tasks.n_tasks == 5 * 50
    with pytest.raises(ValueError):
        tasks.sample(1)
    updates = tasks.sample(10)
    envs = [update() for update in updates]
    for env in envs:
        env.reset()
    action = envs[0].action_space.sample()
    rewards = [env.step(action).reward for env in envs]
    assert np.var(rewards) > 0
    env = envs[0]
    env.close = unittest.mock.MagicMock(name='env.close')
    updates[1](env)
    env.close.assert_not_called()
    updates[2](env)
    env.close.assert_called()
Exemplo n.º 6
0
def pearl_metaworld_ml10(ctxt=None,
                         seed=1,
                         num_epochs=1000,
                         num_train_tasks=10,
                         latent_size=7,
                         encoder_hidden_size=200,
                         net_size=300,
                         meta_batch_size=16,
                         num_steps_per_epoch=4000,
                         num_initial_steps=4000,
                         num_tasks_sample=15,
                         num_steps_prior=750,
                         num_extra_rl_steps_posterior=750,
                         batch_size=256,
                         embedding_batch_size=64,
                         embedding_mini_batch_size=64,
                         reward_scale=10.,
                         use_gpu=False):
    """Train PEARL with ML10 environments.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        num_epochs (int): Number of training epochs.
        num_train_tasks (int): Number of tasks for training.
        latent_size (int): Size of latent context vector.
        encoder_hidden_size (int): Output dimension of dense layer of the
            context encoder.
        net_size (int): Output dimension of a dense layer of Q-function and
            value function.
        meta_batch_size (int): Meta batch size.
        num_steps_per_epoch (int): Number of iterations per epoch.
        num_initial_steps (int): Number of transitions obtained per task before
            training.
        num_tasks_sample (int): Number of random tasks to obtain data for each
            iteration.
        num_steps_prior (int): Number of transitions to obtain per task with
            z ~ prior.
        num_extra_rl_steps_posterior (int): Number of additional transitions
            to obtain per task with z ~ posterior that are only used to train
            the policy and NOT the encoder.
        batch_size (int): Number of transitions in RL batch.
        embedding_batch_size (int): Number of transitions in context batch.
        embedding_mini_batch_size (int): Number of transitions in mini context
            batch; should be same as embedding_batch_size for non-recurrent
            encoder.
        reward_scale (int): Reward scale.
        use_gpu (bool): Whether or not to use GPU for training.

    """
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)
    ml10 = metaworld.ML10()
    train_env = MetaWorldSetTaskEnv(ml10, 'train')
    env_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                 env=train_env,
                                 wrapper=lambda env, _: normalize(env))
    env = env_sampler.sample(num_train_tasks)
    test_env = MetaWorldSetTaskEnv(ml10, 'test')
    test_env_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                      env=test_env,
                                      wrapper=lambda env, _: normalize(env))

    trainer = Trainer(ctxt)

    # instantiate networks
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        reward_scale=reward_scale,
    )

    set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    trainer.setup(algo=pearl,
                  env=env[0](),
                  sampler_cls=LocalSampler,
                  n_workers=1,
                  worker_class=PEARLWorker)

    trainer.train(n_epochs=num_epochs, batch_size=batch_size)
Exemplo n.º 7
0
import metaworld
import random

bench_name = 'ML1'

if bench_name == 'ML1':
    print(metaworld.ML1.ENV_NAMES)
    bench = metaworld.ML1('pick-place-v1')
elif bench_name == 'ML10':
    bench = metaworld.ML10()

training_envs = []
for name, env_cls in bench.train_classes.items():
    env = env_cls()
    task = random.choice(
        [task for task in bench.train_tasks if task.env_name == name])
    env.set_task(task)
    training_envs.append(env)

print(training_envs)

for env in training_envs:
    obs = env.reset()
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
Exemplo n.º 8
0
def rl2_ppo_metaworld_ml10(ctxt, seed, meta_batch_size, n_epochs,
                           episode_per_task):
    """Train RL2 PPO with ML10 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episode per task.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        ml10 = metaworld.ML10()
        tasks = MetaWorldTaskSampler(ml10, 'train', lambda env, _: RL2Env(env))
        test_task_sampler = SetTaskSampler(MetaWorldSetTaskEnv,
                                           env=MetaWorldSetTaskEnv(
                                               ml10, 'test'),
                                           wrapper=lambda env, _: RL2Env(env))
        meta_evaluator = MetaEvaluator(test_task_sampler=test_task_sampler)

        env_updates = tasks.sample(10)
        env = env_updates[0]()

        env_spec = env.spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2PPO(meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(batch_size=32,
                                          max_optimization_epochs=10),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      meta_evaluator=meta_evaluator,
                      episodes_per_trial=episode_per_task)

        trainer.setup(algo,
                      tasks.sample(meta_batch_size),
                      sampler_cls=LocalSampler,
                      n_workers=meta_batch_size,
                      worker_class=RL2Worker,
                      worker_args=dict(n_episodes_per_trial=episode_per_task))

        trainer.train(n_epochs=n_epochs,
                      batch_size=episode_per_task *
                      env_spec.max_episode_length * meta_batch_size)
Exemplo n.º 9
0
except ImportError:
    # Create a 'dummy' class so we can safely use MTEnv in the type hints below.
    # Additionally, isinstance(some_env, MTEnv) will always fail when mtenv isn't
    # installed, which is good.
    class MTEnv(gym.Env):
        pass


metaworld_installed = False
metaworld_envs = []
try:
    import metaworld
    from metaworld import MetaWorldEnv
    from metaworld.envs.mujoco.mujoco_env import MujocoEnv

    metaworld_envs = list(metaworld.ML10().train_classes.keys())
    metaworld_installed = True
except ImportError:
    # Create a 'dummy' class so we can safely use MetaWorldEnv in the type hints below.
    # Additionally, isinstance(some_env, MetaWorldEnv) will always fail when metaworld
    # isn't installed, which is good.
    class MetaWorldEnv(gym.Env):
        pass

    class MujocoEnv(gym.Env):
        pass


@dataclass
class IncrementalRLSetting(ContinualRLSetting):
    """ Continual RL setting the data is divided into 'tasks' with clear boundaries.
Exemplo n.º 10
0
    def __init__(
        self,
        benchmark_name: str,
        save_memory: bool = False,
        add_observability: bool = False,
    ) -> None:
        """ Init function for environment wrapper. """

        # We import here so that we avoid importing metaworld if possible, since it is
        # dependent on mujoco.
        import metaworld
        from metaworld import Task

        # Set config for each benchmark.
        if benchmark_name.startswith("MT1_"):
            env_name = benchmark_name[4:]
            benchmark = metaworld.MT1(env_name)
            env_dict = {env_name: benchmark.train_classes[env_name]}
            tasks = benchmark.train_tasks
            resample_tasks = False
            self.augment_obs = False

        elif benchmark_name == "MT10":
            benchmark = metaworld.MT10()
            env_dict = benchmark.train_classes
            tasks = benchmark.train_tasks
            resample_tasks = False
            self.augment_obs = True

        elif benchmark_name == "MT50":
            benchmark = metaworld.MT50()
            env_dict = benchmark.train_classes
            tasks = benchmark.train_tasks
            resample_tasks = False
            self.augment_obs = True

        elif benchmark_name.startswith("ML1_train_"):
            env_name = benchmark_name[10:]
            benchmark = metaworld.ML1(env_name)
            env_dict = {env_name: benchmark.train_classes[env_name]}
            tasks = benchmark.train_tasks
            resample_tasks = True
            self.augment_obs = False

        elif benchmark_name == "ML10_train":
            benchmark = metaworld.ML10()
            env_dict = benchmark.train_classes
            tasks = benchmark.train_tasks
            resample_tasks = True
            self.augment_obs = True

        elif benchmark_name == "ML45_train":
            benchmark = metaworld.ML45()
            env_dict = benchmark.train_classes
            tasks = benchmark.train_tasks
            resample_tasks = True
            self.augment_obs = True

        elif benchmark_name.startswith("ML1_test_"):
            env_name = benchmark_name[9:]
            benchmark = metaworld.ML1(env_name)
            env_dict = {env_name: benchmark.test_classes[env_name]}
            tasks = benchmark.test_tasks
            resample_tasks = True
            self.augment_obs = False

        elif benchmark_name == "ML10_test":
            benchmark = metaworld.ML10()
            env_dict = benchmark.test_classes
            tasks = benchmark.test_tasks
            resample_tasks = True
            self.augment_obs = True

        elif benchmark_name == "ML45_test":
            benchmark = metaworld.ML45()
            env_dict = benchmark.test_classes
            tasks = benchmark.test_tasks
            resample_tasks = True
            self.augment_obs = True

        else:
            raise NotImplementedError

        # Construct list of tasks for each environment, adding observability to tasks if
        # necessary.
        env_tasks = {}
        for task in tasks:
            if add_observability:
                task_data = dict(pickle.loads(task.data))
                task_data["partially_observable"] = False
                task = Task(env_name=task.env_name,
                            data=pickle.dumps(task_data))

            if task.env_name in env_tasks:
                if resample_tasks:
                    env_tasks[task.env_name].append(task)
            else:
                env_tasks[task.env_name] = [task]

        # Construct list of environment classes or class instances.
        self.save_memory = save_memory
        if self.save_memory:
            self.envs_info = [{
                "env_name": env_name,
                "env_cls": env_cls,
                "tasks": env_tasks[env_name]
            } for (env_name, env_cls) in env_dict.items()]
        else:
            self.envs_info = [{
                "env_name": env_name,
                "env": env_cls(),
                "tasks": env_tasks[env_name]
            } for (env_name, env_cls) in env_dict.items()]

        self.num_tasks = len(self.envs_info)

        # Sample environment.
        self._sample_environment()