Example #1
def rl2_ppo_ml1(ctxt, seed, max_path_length, meta_batch_size, n_epochs,
                episode_per_task):
    """Train PPO with ML1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_path_length (int): Maximum length of a single rollout.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episode per task.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        tasks = task_sampler.SetTaskSampler(
            lambda: RL2Env(env=ML1.get_train_tasks('push-v1')))

        env_spec = RL2Env(env=ML1.get_train_tasks('push-v1')).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2PPO(rl2_max_path_length=max_path_length,
                      meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      max_path_length=max_path_length * episode_per_task)

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(n_paths_per_trial=episode_per_task))

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
Example #2
def _prepare_meta_env(env):
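    # Build a SetTaskSampler for the ML1 task selected by the global env_ind
    # (2: push-v1, 3: reach-v1, 4: pick-place-v1) when ML is set, otherwise
    # wrap the plain env; return one sampled environment plus the sampler.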
    if ML:
        if env_ind == 2:
            task_samplers = task_sampler.SetTaskSampler(
                lambda: RL2Env(ML1.get_train_tasks('push-v1'), random_init=False))
        elif env_ind == 3:
            task_samplers = task_sampler.SetTaskSampler(
                lambda: RL2Env(ML1.get_train_tasks('reach-v1'), random_init=False))
        elif env_ind == 4:
            task_samplers = task_sampler.SetTaskSampler(
                lambda: RL2Env(ML1.get_train_tasks('pick-place-v1'), random_init=False))
    else:
        task_samplers = task_sampler.SetTaskSampler(lambda: RL2Env(env()))
    return task_samplers.sample(1)[0](), task_samplers
Example #3
def test_pickling(task):
    if task in HARD_MODE_CLS_DICT['train']:
        env = ML1.get_train_tasks(task)
    else:
        env = ML1.get_test_tasks(task)
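
    # Round-trip the env through pickle and check that its task metadata
    # (task names and discrete goals) is preserved.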

    env2 = pkl.loads(pkl.dumps(env))

    assert len(env._task_names) == 1
    assert len(env._task_names) == len(env2._task_names)
    assert env._task_names[0] == env2._task_names[0]
    np.testing.assert_equal(env._discrete_goals, env2._discrete_goals)
Example #4
    def test_benchmark_pearl(self):
        """Compare benchmarks between metarl and baselines."""
        env_sampler = SetTaskSampler(
            lambda: MetaRLEnv(normalize(ML1.get_train_tasks('reach-v1'))))
        env = env_sampler.sample(params['num_train_tasks'])
        test_env_sampler = SetTaskSampler(
            lambda: MetaRLEnv(normalize(ML1.get_test_tasks('reach-v1'))))
        test_env = test_env_sampler.sample(params['num_train_tasks'])
        env_id = 'reach-v1'
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
        benchmark_dir = osp.join(os.getcwd(), 'data', 'local', 'benchmarks',
                                 'pearl', timestamp)
        result_json = {}
        seeds = random.sample(range(100), params['n_trials'])
        task_dir = osp.join(benchmark_dir, env_id)
        plt_file = osp.join(benchmark_dir, '{}_benchmark.png'.format(env_id))
        metarl_csvs = []

        for trial in range(params['n_trials']):
            seed = seeds[trial]
            trial_dir = task_dir + '/trial_%d_seed_%d' % (trial + 1, seed)
            metarl_dir = trial_dir + '/metarl'

            metarl_csv = run_metarl(env, test_env, seed, metarl_dir)
            metarl_csvs.append(metarl_csv)

        env.close()

        benchmark_helper.plot_average_over_trials(
            [metarl_csvs],
            ys=['Test/Average/SuccessRate'],
            plt_file=plt_file,
            env_id=env_id,
            x_label='TotalEnvSteps',
            y_label='Test/Average/SuccessRate',
            names=['metarl_pearl'],
        )

        factor_val = params['meta_batch_size'] * params['max_path_length']
        result_json[env_id] = benchmark_helper.create_json(
            [metarl_csvs],
            seeds=seeds,
            trials=params['n_trials'],
            xs=['TotalEnvSteps'],
            ys=['Test/Average/SuccessRate'],
            factors=[factor_val],
            names=['metarl_pearl'])

        Rh.write_file(result_json, 'PEARL')
Example #5
def maml_trpo(ctxt, seed, epochs, rollouts_per_task, meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    env = GarageEnv(
        normalize(ML1.get_train_tasks('push-v1'), expected_action_scale=10.))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = LinearFeatureBaseline(env_spec=env.spec)

    max_path_length = 100

    test_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(ML1.get_test_tasks('push-v1'))))

    meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler,
                                   max_path_length=max_path_length)

    runner = LocalRunner(ctxt)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    value_function=value_function,
                    max_path_length=max_path_length,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    runner.setup(algo, env)
    runner.train(n_epochs=epochs,
                 batch_size=rollouts_per_task * max_path_length)
Example #6
def run_task(args, *_):

    #env = TfEnv(normalize(dnc_envs.create_stochastic('pick'))) # Cannot be solved easily by TRPO
    metaworld_env = ML1.get_train_tasks("pick-place-v1")
    tasks = metaworld_env.sample_tasks(1)
    metaworld_env.set_task(tasks[0])
    metaworld_env._observation_space = convert_gym_space(metaworld_env.observation_space)
    metaworld_env._action_space = convert_gym_space(metaworld_env.action_space)
    env = TfEnv(normalize(metaworld_env))

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        min_std=1e-2,
        hidden_sizes=(150, 100, 50),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        #batch_size=50000,
        batch_size=100,
        force_batch_sampler=True,
        max_path_length=50,
        discount=1,
        step_size=0.02,
    )
    
    algo.train()
Example #7
def test_all_ml1(name):
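    # Construct the named ML1 train and test envs, sample a handful of task
    # variants, and take a few steps in each to verify they run.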
    train_env = ML1.get_train_tasks(name)
    tasks = train_env.sample_tasks(11)
    for t in tasks:
        train_env.set_task(t)
        step_env(train_env, max_path_length=3)

    train_env.close()
    del train_env

    test_env = ML1.get_test_tasks(name)
    tasks = test_env.sample_tasks(11)
    for t in tasks:
        test_env.set_task(t)
        step_env(test_env, max_path_length=3)

    test_env.close()
    del test_env
Example #8
    def __init__(self,
                 max_episode_steps=150,
                 out_of_distribution=False,
                 n_train_tasks=50,
                 n_test_tasks=10,
                 **kwargs):
        super(ReachML1Env, self).__init__()
        self.train_env = ML1.get_train_tasks(
            'reach-v1', out_of_distribution=out_of_distribution)
        self.test_env = ML1.get_test_tasks(
            'reach-v1', out_of_distribution=out_of_distribution)
        self.train_tasks = self.train_env.sample_tasks(n_train_tasks)
        self.test_tasks = self.test_env.sample_tasks(n_test_tasks)
        self.tasks = self.train_tasks + self.test_tasks
        self.env = self.train_env  # this env will change depending on the idx
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space
        self.goal_space_origin = np.array([0, 0.85, 0.175])
        self.current_task_idx = 0
        self.episode_steps = 0
        self._max_episode_steps = max_episode_steps
Example #9
def test_ml1_random_init(task_name):
    env = ML1.get_train_tasks(task_name)
    tasks = env.sample_tasks(1)
    env.set_task(tasks[0])

    actual_goal = env.active_env.goal

    _ = env.reset()
    _, _, _, info_0 = env.step(env.action_space.sample())

    _ = env.reset()
    _, _, _, info_1 = env.step(env.action_space.sample())

    assert np.all(info_0['goal'] == info_1['goal'])
    assert np.all(info_0['goal'] == actual_goal)
Example #10
def run_task(args, *_):

    #env = TfEnv(normalize(dnc_envs.create_stochastic('pick'))) # Cannot be solved easily by TRPO
    metaworld_env = ML1.get_train_tasks(
        'pick-place-v1')  # Create an environment with task `pick_place`
    tasks = metaworld_env.sample_tasks(
        1)  # Sample a task (in this case, a goal variation)
    metaworld_env.set_task(tasks[0])  # Set task
    # print(metaworld_env.id)
    # print("HERE")
    # import pdb;pdb.set_trace()
    metaworld_env = GymEnv2(metaworld_env)
    # metaworld_env.observation_space = convert_gym_space(metaworld_env.observation_space)
    # metaworld_env.action_space = convert_gym_space(metaworld_env.action_space)

    # env = metaworld_env
    env = TfEnv(normalize(metaworld_env))  # Cannot be solved easily by TRPO

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        min_std=1e-2,
        hidden_sizes=(150, 100, 50),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1,
        force_batch_sampler=True,
        max_path_length=50,
        discount=1,
        step_size=0.02,
    )

    algo.train()
Example #11
        print(type(env))
        max_path_length = 64
    elif args.env == 'Walker':
        env = TfEnv(normalize(WalkerEnv_sparse()))
        print(type(env))
        max_path_length = 64
    elif args.env == 'Ant':
        env = TfEnv(normalize(AntGoalEnvSparse()))
        print(type(env))
        max_path_length = 64
    elif args.env == 'Cheetah':
        env = TfEnv(normalize(HalfCheetahVelEnv_sparse()))
        print(type(env))
        max_path_length = 64
    elif args.env == 'Push':
        env = TfEnv(normalize(ML1.get_train_tasks('push-v1')))
        max_path_length = 150
    elif args.env == 'Reach':

        env = TfEnv(normalize(ML1.get_train_tasks('reach-v1')))
        max_path_length = 150

    else:
        raise AssertionError('Not Implemented')
    ########################################################

    #####################Algo Selection####################
    if args.algo == 'Maesn':
        assert v[
            'fast_learning_rate'] != 0, 'Fast learning rate needs to be non 0 for Maesn'
        policy = adaGaussPolicy(
Example #12
    def test_benchmark_rl2(self):  # pylint: disable=no-self-use
        """Compare benchmarks between metarl and baselines."""
        if ML:
            if env_ind == 2:
                envs = [ML1.get_train_tasks('push-v1')]
                env_ids = ['ML1-push-v1']
            elif env_ind == 3:
                envs = [ML1.get_train_tasks('reach-v1')]
                env_ids = ['ML1-reach-v1']
            elif env_ind == 4:
                envs = [ML1.get_train_tasks('pick-place-v1')]
                env_ids = ['ML1-pick-place-v1']
            else:
                raise ValueError("Env index is wrong")
        else:
            if env_ind == 0:
                envs = [HalfCheetahVelEnv]
                env_ids = ['HalfCheetahVelEnv']
            elif env_ind == 1:
                envs = [HalfCheetahDirEnv]
                env_ids = ['HalfCheetahDirEnv']
            else:
                raise ValueError("Env index is wrong")
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
        benchmark_dir = './data/local/benchmarks/rl2/%s/' % timestamp
        result_json = {}
        for i, env in enumerate(envs):
            seeds = random.sample(range(100), hyper_parameters['n_trials'])
            task_dir = osp.join(benchmark_dir, env_ids[i])
            plt_file = osp.join(benchmark_dir,
                                '{}_benchmark.png'.format(env_ids[i]))
            metarl_tf_csvs = []
            promp_csvs = []

            for trial in range(hyper_parameters['n_trials']):
                seed = seeds[trial]
                trial_dir = task_dir + '/trial_%d_seed_%d' % (trial + 1, seed)
                promp_dir = trial_dir + '/promp'

                with tf.Graph().as_default():
                    if isinstance(env, gym.Env):
                        env.reset()
                        promp_csv = run_promp(env, seed, promp_dir)
                    else:
                        promp_csv = run_promp(env(), seed, promp_dir)

                promp_csvs.append(promp_csv)

            with open(osp.join(promp_dir, 'parameters.txt'), 'w') as outfile:
                json.dump(hyper_parameters, outfile)

            if isinstance(env, gym.Env):
                env.close()

            p_x = 'n_timesteps'

            if ML:
                p_ys = ['train-AverageReturn', 'train-SuccessRate']
            else:
                p_ys = ['train-AverageReturn']

            for p_y in p_ys:
                plt_file = osp.join(
                    benchmark_dir,
                    '{}_benchmark_promp_{}.png'.format(env_ids[i],
                                                       p_y.replace('/', '-')))
                Rh.relplot(g_csvs=promp_csvs,
                           b_csvs=None,
                           g_x=p_x,
                           g_y=p_y,
                           g_z='ProMP',
                           b_x=None,
                           b_y=None,
                           b_z='None',
                           trials=hyper_parameters['n_trials'],
                           seeds=seeds,
                           plt_file=plt_file,
                           env_id=env_ids[i])
Example #13
def te_ppo_ml1_push(ctxt, seed, n_epochs, batch_size_per_task):
    """Train Task Embedding PPO with PointEnv.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~LocalRunner` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.

    """
    set_seed(seed)
    envs = [
        normalize(
            GymEnv(ML1.get_train_tasks('push-v1'), max_episode_length=150))
    ]
    env = MultiEnvWrapper(envs, mode='del-onehot')

    latent_length = 2
    inference_window = 6
    batch_size = batch_size_per_task
    policy_ent_coeff = 2e-2
    encoder_ent_coeff = 2e-4
    inference_ce_coeff = 5e-2
    embedding_init_std = 0.1
    embedding_max_std = 0.2
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    with LocalTFRunner(snapshot_config=ctxt) as runner:

        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)

        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)

        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 10),
            std_share_network=True,
            init_std=2.0,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec, features=['observations', 'tasks', 'latents'])

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     inference=inference,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        runner.setup(algo,
                     env,
                     sampler_cls=LocalSampler,
                     sampler_args=None,
                     worker_class=TaskEmbeddingWorker)
        runner.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)
Example #14
from metaworld.benchmarks import ML1
import time

print(ML1.available_tasks())  # Check out the available tasks

env = ML1.get_train_tasks(
    'pick-place-v1')  # Create an environment with task `pick_place`
tasks = env.sample_tasks(1)  # Sample a task (in this case, a goal variation)
env.set_task(tasks[0])  # Set task

obs = env.reset()  # Reset environment

for i in range(1000):
    print('iteration %d' % (i))
    if i % 100 == 0:
        obs = env.reset()
    env.render()
    a = env.action_space.sample()  # Sample an action
    obs, reward, done, info = env.step(a)  # Step the environment with the sampled random action
    print(obs)
    time.sleep(0.2)
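
The same loop can be wrapped in a small helper for quick smoke tests (a sketch using only the old `metaworld.benchmarks` API shown in these examples; `run_random_rollout` is a hypothetical name, not part of metaworld):

from metaworld.benchmarks import ML1


def run_random_rollout(task_name, n_steps=100, render=False):
    """Roll out a random policy on one sampled variation of an ML1 task."""
    env = ML1.get_train_tasks(task_name)  # benchmark env for the named task
    env.set_task(env.sample_tasks(1)[0])  # fix one goal variation

    obs = env.reset()
    total_reward = 0.0
    for _ in range(n_steps):
        if render:
            env.render()
        obs, reward, done, info = env.step(env.action_space.sample())
        total_reward += reward
        if done:
            obs = env.reset()
    env.close()
    return total_reward


# e.g. run_random_rollout('pick-place-v1', n_steps=200)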
Example #15
def torch_pearl_ml1_push(ctxt=None,
                         seed=1,
                         num_epochs=1000,
                         num_train_tasks=50,
                         num_test_tasks=10,
                         latent_size=7,
                         encoder_hidden_size=200,
                         net_size=300,
                         meta_batch_size=16,
                         num_steps_per_epoch=4000,
                         num_initial_steps=4000,
                         num_tasks_sample=15,
                         num_steps_prior=750,
                         num_extra_rl_steps_posterior=750,
                         batch_size=256,
                         embedding_batch_size=64,
                         embedding_mini_batch_size=64,
                         max_path_length=150,
                         reward_scale=10.,
                         use_gpu=False):
    """Train PEARL with ML1 environments.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        num_epochs (int): Number of training epochs.
        num_train_tasks (int): Number of tasks for training.
        num_test_tasks (int): Number of tasks for testing.
        latent_size (int): Size of latent context vector.
        encoder_hidden_size (int): Output dimension of dense layer of the
            context encoder.
        net_size (int): Output dimension of a dense layer of Q-function and
            value function.
        meta_batch_size (int): Meta batch size.
        num_steps_per_epoch (int): Number of iterations per epoch.
        num_initial_steps (int): Number of transitions obtained per task before
            training.
        num_tasks_sample (int): Number of random tasks to obtain data for each
            iteration.
        num_steps_prior (int): Number of transitions to obtain per task with
            z ~ prior.
        num_extra_rl_steps_posterior (int): Number of additional transitions
            to obtain per task with z ~ posterior that are only used to train
            the policy and NOT the encoder.
        batch_size (int): Number of transitions in RL batch.
        embedding_batch_size (int): Number of transitions in context batch.
        embedding_mini_batch_size (int): Number of transitions in mini context
            batch; should be same as embedding_batch_size for non-recurrent
            encoder.
        max_path_length (int): Maximum path length.
        reward_scale (int): Reward scale.
        use_gpu (bool): Whether or not to use GPU for training.

    """
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)
    # create multi-task environment and sample tasks
    env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(ML1.get_train_tasks('push-v1'))))
    env = env_sampler.sample(num_train_tasks)

    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(ML1.get_test_tasks('push-v1'))))

    runner = LocalRunner(ctxt)

    # instantiate networks
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=PEARLWorker)

    worker_args = dict(deterministic=True, accum_context=True)
    meta_evaluator = MetaEvaluator(test_task_sampler=test_env_sampler,
                                   max_path_length=max_path_length,
                                   worker_class=PEARLWorker,
                                   worker_args=worker_args,
                                   n_test_tasks=num_test_tasks)
    pearl.evaluator = meta_evaluator
    runner.train(n_epochs=num_epochs, batch_size=batch_size)
Example #16
    def _thunk():
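        # Dispatch on env_id: "dm.<domain>.<task>" builds a dm_control env,
        # "metaworld_ml1.<task-name>", "metaworld_ml10" and "metaworld_mt10"
        # build Meta-World benchmarks, and anything else falls back to gym.make.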
        if env_id.startswith("dm"):
            _, domain, task = env_id.split('.')
            env = dm_control2gym.make(domain_name=domain, task_name=task)
        elif env_id.startswith('metaworld_'):
            world_bench = env_id.split('_')[1]
            if world_bench.startswith('ml1.'):
                world_task = world_bench.split('.')[1]
                env = ML1.get_train_tasks(world_task)
            elif world_bench == 'ml10':
                env = ML10.get_train_tasks()
            elif world_bench == 'mt10':
                env = MT10.get_train_tasks()
            else:
                raise NotImplementedError(
                    'This code only supports metaworld ml1, ml10 or mt10.')

            env = MetaworldWrapper(env)
        else:
            env = gym.make(env_id)

        if obs_keys is not None:
            env = gym.wrappers.FlattenDictWrapper(env, dict_keys=obs_keys)

        is_atari = hasattr(gym.envs, 'atari') and isinstance(
            env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
        if is_atari:
            env = make_atari(env_id)

        env.seed(seed + rank)

        obs_shape = env.observation_space.shape

        # if str(env.__class__.__name__).find('TimeLimit') >= 0:
        #     env = TimeLimitMask(env)

        if log_dir is not None:
            if save_video:
                env = bench.Monitor(env,
                                    os.path.join(log_dir + '/eval/monitor',
                                                 str(rank)),
                                    allow_early_resets=allow_early_resets)

                env = gym.wrappers.Monitor(env,
                                           os.path.join(
                                               log_dir + '/eval/video',
                                               str(rank)),
                                           force=True)
            else:
                env = bench.Monitor(env,
                                    os.path.join(log_dir + '/monitor',
                                                 str(rank)),
                                    allow_early_resets=allow_early_resets)

        if is_atari:
            if len(env.observation_space.shape) == 3:
                env = wrap_deepmind(env)
        elif len(env.observation_space.shape) == 3:
            raise NotImplementedError(
                "CNN models work only for atari,\n"
                "please use a custom wrapper for a custom pixel input env.\n"
                "See wrap_deepmind for an example.")

        # If the input has shape (W,H,3), wrap for PyTorch convolutions
        obs_shape = env.observation_space.shape
        if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
            env = TransposeImage(env, op=[2, 0, 1])

        return env
Example #17
def rl2_ppo_halfcheetah(ctxt=None, seed=1):
    """Train PPO with HalfCheetah environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        max_path_length = 100
        meta_batch_size = 10
        n_epochs = 50
        episode_per_task = 4

        # ---- For ML1-push
        from metaworld.benchmarks import ML1        
        tasks = task_sampler.SetTaskSampler(lambda: RL2Env(
            env=ML1.get_train_tasks('push-v1')))

        # ---- For HalfCheetahVel
        # tasks = task_sampler.SetTaskSampler(lambda: RL2Env(
        #     env=HalfCheetahVelEnv()))

        env_spec = tasks.sample(1)[0]().spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        inner_algo = RL2PPO(
            env_spec=env_spec,
            policy=policy,
            baseline=baseline,
            max_path_length=max_path_length * episode_per_task,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )

        algo = RL2(policy=policy,
                   inner_algo=inner_algo,
                   max_path_length=max_path_length,
                   meta_batch_size=meta_batch_size,
                   task_sampler=tasks)

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker)

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
Example #18
    def test_pearl_ml1_push(self):
        """Test PEARL with ML1 Push environment."""
        params = dict(seed=1,
                      num_epochs=1,
                      num_train_tasks=5,
                      num_test_tasks=1,
                      latent_size=7,
                      encoder_hidden_sizes=[10, 10, 10],
                      net_size=30,
                      meta_batch_size=16,
                      num_steps_per_epoch=40,
                      num_initial_steps=40,
                      num_tasks_sample=15,
                      num_steps_prior=15,
                      num_extra_rl_steps_posterior=15,
                      batch_size=256,
                      embedding_batch_size=8,
                      embedding_mini_batch_size=8,
                      max_path_length=50,
                      reward_scale=10.,
                      use_information_bottleneck=True,
                      use_next_obs_in_context=False,
                      use_gpu=False)

        net_size = params['net_size']
        set_seed(params['seed'])
        env_sampler = SetTaskSampler(
            lambda: MetaRLEnv(normalize(ML1.get_train_tasks('push-v1'))))
        env = env_sampler.sample(params['num_train_tasks'])

        test_env_sampler = SetTaskSampler(
            lambda: MetaRLEnv(normalize(ML1.get_test_tasks('push-v1'))))

        augmented_env = PEARL.augment_env_spec(env[0](), params['latent_size'])
        qf = ContinuousMLPQFunction(
            env_spec=augmented_env,
            hidden_sizes=[net_size, net_size, net_size])

        vf_env = PEARL.get_env_spec(env[0](), params['latent_size'], 'vf')
        vf = ContinuousMLPQFunction(
            env_spec=vf_env, hidden_sizes=[net_size, net_size, net_size])

        inner_policy = TanhGaussianMLPPolicy(
            env_spec=augmented_env,
            hidden_sizes=[net_size, net_size, net_size])

        pearl = PEARL(
            env=env,
            policy_class=ContextConditionedPolicy,
            encoder_class=MLPEncoder,
            inner_policy=inner_policy,
            qf=qf,
            vf=vf,
            num_train_tasks=params['num_train_tasks'],
            num_test_tasks=params['num_test_tasks'],
            latent_dim=params['latent_size'],
            encoder_hidden_sizes=params['encoder_hidden_sizes'],
            test_env_sampler=test_env_sampler,
            meta_batch_size=params['meta_batch_size'],
            num_steps_per_epoch=params['num_steps_per_epoch'],
            num_initial_steps=params['num_initial_steps'],
            num_tasks_sample=params['num_tasks_sample'],
            num_steps_prior=params['num_steps_prior'],
            num_extra_rl_steps_posterior=params[
                'num_extra_rl_steps_posterior'],
            batch_size=params['batch_size'],
            embedding_batch_size=params['embedding_batch_size'],
            embedding_mini_batch_size=params['embedding_mini_batch_size'],
            max_path_length=params['max_path_length'],
            reward_scale=params['reward_scale'],
        )

        set_gpu_mode(params['use_gpu'], gpu_id=0)
        if params['use_gpu']:
            pearl.to()

        runner = LocalRunner(snapshot_config)
        runner.setup(
            algo=pearl,
            env=env[0](),
            sampler_cls=LocalSampler,
            sampler_args=dict(max_path_length=params['max_path_length']),
            n_workers=1,
            worker_class=PEARLWorker)

        runner.train(n_epochs=params['num_epochs'],
                     batch_size=params['batch_size'])
Example #19
import pytest

from metaworld.benchmarks import ML1, MT10, ML10, ML45, MT50
from tests.helpers import step_env


@pytest.mark.parametrize('name', ML1.available_tasks())
def test_all_ml1(name):
    train_env = ML1.get_train_tasks(name)
    tasks = train_env.sample_tasks(11)
    for t in tasks:
        train_env.set_task(t)
        step_env(train_env, max_path_length=3)

    train_env.close()
    del train_env

    test_env = ML1.get_test_tasks(name)
    tasks = test_env.sample_tasks(11)
    for t in tasks:
        test_env.set_task(t)
        step_env(test_env, max_path_length=3)

    test_env.close()
    del test_env


def test_all_ml10():
    ml10_train_env = ML10.get_train_tasks()
    train_tasks = ml10_train_env.sample_tasks(11)
    for t in train_tasks:
Example #20
def sac(args, steps_per_epoch=1500, replay_size=int(1e6), gamma=0.99,
        polyak=0.995, lr=1e-3, alpha=3e-4, batch_size=128, start_steps=1000,
        update_after=1000, update_every=1, num_test_episodes=10, max_ep_len=150,
        logger_kwargs=dict(), save_freq=1):

    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)

    torch.set_num_threads(torch.get_num_threads())

    actor_critic = core.MLPActorCritic
    ac_kwargs = dict(hidden_sizes=[args.hid] * args.l)
    gamma = args.gamma
    seed = args.seed
    epochs = args.epochs
    logger_tensor = Logger(logdir=args.logdir, run_name="{}-{}".format(args.model_name, time.ctime()))

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env = ML1.get_train_tasks('reach-v1')  # Create an environment with task `reach`
    tasks = env.sample_tasks(1)  # Sample a task (in this case, a goal variation)
    env.set_task(tasks[0])  # Set task

    test_env = ML1.get_train_tasks('reach-v1')  # Separate instance of the same task for evaluation
    tasks = test_env.sample_tasks(1)  # Sample a task (in this case, a goal variation)
    test_env.set_task(tasks[0])  # Set task

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts)

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup) ** 2).mean()
        loss_q2 = ((q2 - backup) ** 2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().numpy(),
                      Q2Vals=q2.detach().numpy())

        return loss_q, q_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)
        q1_pi = ac.q1(o, pi)
        q2_pi = ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (alpha * logp_pi - q_pi).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().numpy())

        return loss_pi, pi_info

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=3e-4)
    q_optimizer = Adam(q_params, lr=3e-4)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, logger_tensor, t):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)
        logger_tensor.log_value(t, loss_q.item(), "loss q")

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize it at next DDPG step.
        for p in q_params:
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)
        logger_tensor.log_value(t, loss_pi.item(), "loss pi")

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use an in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32),
                      deterministic)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            logger_tensor.log_value(t, ep_ret, "test ep reward")
            logger_tensor.log_value(t, ep_len, "test ep length")

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1
        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d
        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger_tensor.log_value(t, ep_ret, "reward")
            logging.info("> total_steps={} | reward={}".format(t, ep_ret))
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0


        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch, logger_tensor=logger_tensor, t=t)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)

            logger_tensor.log_value(t, epoch, "epoch")
            logger.dump_tabular(logger_tensor=logger_tensor, epoch=epoch)
            ac.save(args.save_model_dir, args.model_name)
Example #21
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    # create multi-task environment and sample tasks
    env_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_train_tasks('push-v1'))))
    env = env_sampler.sample(params['num_train_tasks'])
    test_env_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_test_tasks('push-v1'))))
    test_env = test_env_sampler.sample(params['num_train_tasks'])

    runner = LocalRunner(snapshot_config)
    obs_dim = int(np.prod(env[0]().observation_space.shape))
    action_dim = int(np.prod(env[0]().action_space.shape))
    reward_dim = 1

    # instantiate networks
    encoder_in_dim = obs_dim + action_dim + reward_dim
    encoder_out_dim = params['latent_size'] * 2
    net_size = params['net_size']

    context_encoder = MLPEncoder(input_dim=encoder_in_dim,
                                 output_dim=encoder_out_dim,
                                 hidden_sizes=[200, 200, 200])

    space_a = akro.Box(low=-1,
                       high=1,
                       shape=(obs_dim + params['latent_size'], ),
                       dtype=np.float32)
    space_b = akro.Box(low=-1, high=1, shape=(action_dim, ), dtype=np.float32)
    augmented_env = EnvSpec(space_a, space_b)

    qf1 = ContinuousMLPQFunction(env_spec=augmented_env,
                                 hidden_sizes=[net_size, net_size, net_size])

    qf2 = ContinuousMLPQFunction(env_spec=augmented_env,
                                 hidden_sizes=[net_size, net_size, net_size])

    obs_space = akro.Box(low=-1, high=1, shape=(obs_dim, ), dtype=np.float32)
    action_space = akro.Box(low=-1,
                            high=1,
                            shape=(params['latent_size'], ),
                            dtype=np.float32)
    vf_env = EnvSpec(obs_space, action_space)

    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    policy = TanhGaussianMLPPolicy2(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    context_conditioned_policy = ContextConditionedPolicy(
        latent_dim=params['latent_size'],
        context_encoder=context_encoder,
        policy=policy,
        use_ib=params['use_information_bottleneck'],
        use_next_obs=params['use_next_obs_in_context'],
    )

    pearlsac = PEARLSAC(
        env=env,
        test_env=test_env,
        policy=context_conditioned_policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        num_train_tasks=params['num_train_tasks'],
        num_test_tasks=params['num_test_tasks'],
        latent_dim=params['latent_size'],
        meta_batch_size=params['meta_batch_size'],
        num_steps_per_epoch=params['num_steps_per_epoch'],
        num_initial_steps=params['num_initial_steps'],
        num_tasks_sample=params['num_tasks_sample'],
        num_steps_prior=params['num_steps_prior'],
        num_extra_rl_steps_posterior=params['num_extra_rl_steps_posterior'],
        num_evals=params['num_evals'],
        num_steps_per_eval=params['num_steps_per_eval'],
        batch_size=params['batch_size'],
        embedding_batch_size=params['embedding_batch_size'],
        embedding_mini_batch_size=params['embedding_mini_batch_size'],
        max_path_length=params['max_path_length'],
        reward_scale=params['reward_scale'],
    )

    tu.set_gpu_mode(params['use_gpu'], gpu_id=0)
    if params['use_gpu']:
        pearlsac.to()

    runner.setup(algo=pearlsac,
                 env=env,
                 sampler_cls=PEARLSampler,
                 sampler_args=dict(max_path_length=params['max_path_length']))
    runner.train(n_epochs=params['num_epochs'],
                 batch_size=params['batch_size'])
Example #22
    def test_benchmark_rl2(self):  # pylint: disable=no-self-use
        """Compare benchmarks between metarl and baselines."""
        if ML:
            if env_ind == 2:
                envs = [ML1.get_train_tasks('push-v1')]
                env_ids = ['ML1-push-v1']
            elif env_ind == 3:
                envs = [ML1.get_train_tasks('reach-v1')]
                env_ids = ['ML1-reach-v1']
            elif env_ind == 4:
                envs = [ML1.get_train_tasks('pick-place-v1')]
                env_ids = ['ML1-pick-place-v1']
            else:
                raise ValueError("Env index is wrong")
        else:
            if env_ind == 0:
                envs = [HalfCheetahVelEnv]
                env_ids = ['HalfCheetahVelEnv']
            elif env_ind == 1:
                envs = [HalfCheetahDirEnv]
                env_ids = ['HalfCheetahDirEnv']
            else:
                raise ValueError("Env index is wrong")

        timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
        benchmark_dir = './data/local/benchmarks/rl2/%s/' % timestamp
        result_json = {}
        for i, env in enumerate(envs):
            seeds = random.sample(range(100), hyper_parameters['n_trials'])
            task_dir = osp.join(benchmark_dir, env_ids[i])
            plt_file = osp.join(benchmark_dir,
                                '{}_benchmark.png'.format(env_ids[i]))
            metarl_tf_csvs = []
            promp_csvs = []

            for trial in range(hyper_parameters['n_trials']):
                seed = seeds[trial]
                trial_dir = task_dir + '/trial_%d_seed_%d' % (trial + 1, seed)
                metarl_tf_dir = trial_dir + '/metarl'
                promp_dir = trial_dir + '/promp'

                with tf.Graph().as_default():
                    metarl_tf_csv = run_metarl(env, seed, metarl_tf_dir)

                metarl_tf_csvs.append(metarl_tf_csv)

            with open(osp.join(metarl_tf_dir, 'parameters.txt'),
                      'w') as outfile:
                hyper_parameters_copy = copy.deepcopy(hyper_parameters)
                hyper_parameters_copy['sampler_cls'] = str(
                    hyper_parameters_copy['sampler_cls'])
                json.dump(hyper_parameters_copy, outfile)

            g_x = 'TotalEnvSteps'

            if ML:
                g_ys = [
                    'Evaluation/AverageReturn',
                    'Evaluation/SuccessRate',
                ]
            else:
                g_ys = [
                    'Evaluation/AverageReturn',
                ]

            for g_y in g_ys:
                plt_file = osp.join(
                    benchmark_dir,
                    '{}_benchmark_rl2_{}.png'.format(env_ids[i],
                                                     g_y.replace('/', '-')))
                Rh.relplot(g_csvs=metarl_tf_csvs,
                           b_csvs=None,
                           g_x=g_x,
                           g_y=g_y,
                           g_z='MetaRL',
                           b_x=None,
                           b_y=None,
                           b_z=None,
                           trials=hyper_parameters['n_trials'],
                           seeds=seeds,
                           plt_file=plt_file,
                           env_id=env_ids[i],
                           x_label=g_x,
                           y_label=g_y)