def rl2_ppo_ml1(ctxt, seed, max_path_length, meta_batch_size, n_epochs,
                episode_per_task):
    """Train PPO with ML1 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_path_length (int): Maximum length of a single rollout.
        meta_batch_size (int): Meta batch size.
        n_epochs (int): Total number of epochs for training.
        episode_per_task (int): Number of training episodes per task.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        tasks = task_sampler.SetTaskSampler(
            lambda: RL2Env(env=ML1.get_train_tasks('push-v1')))

        env_spec = RL2Env(env=ML1.get_train_tasks('push-v1')).spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        algo = RL2PPO(rl2_max_path_length=max_path_length,
                      meta_batch_size=meta_batch_size,
                      task_sampler=tasks,
                      env_spec=env_spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_epochs=10,
                      ),
                      stop_entropy_gradient=True,
                      entropy_method='max',
                      policy_ent_coeff=0.02,
                      center_adv=False,
                      max_path_length=max_path_length * episode_per_task)

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker,
                     worker_args=dict(n_paths_per_trial=episode_per_task))

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
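# A minimal invocation sketch for the launcher above (not part of the original
# snippet), assuming it is decorated with garage's @wrap_experiment as in the
# upstream example scripts, so `ctxt` is supplied automatically. The
# hyperparameter values here are illustrative, not benchmark settings.
if __name__ == '__main__':
    rl2_ppo_ml1(seed=1,
                max_path_length=150,
                meta_batch_size=40,
                n_epochs=10,
                episode_per_task=4)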
def _prepare_meta_env(env):
    if ML:
        if env_ind == 2:
            task_samplers = task_sampler.SetTaskSampler(
                lambda: RL2Env(ML1.get_train_tasks('push-v1'),
                               random_init=False))
        elif env_ind == 3:
            task_samplers = task_sampler.SetTaskSampler(
                lambda: RL2Env(ML1.get_train_tasks('reach-v1'),
                               random_init=False))
        elif env_ind == 4:
            task_samplers = task_sampler.SetTaskSampler(
                lambda: RL2Env(ML1.get_train_tasks('pick-place-v1'),
                               random_init=False))
    else:
        task_samplers = task_sampler.SetTaskSampler(lambda: RL2Env(env()))
    return task_samplers.sample(1)[0](), task_samplers
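# A hypothetical caller for _prepare_meta_env (a sketch, not the file's actual
# code), assuming the module-level ML / env_ind flags select one of the ML1
# branches above. It mirrors how the RL2 launchers in this collection consume
# a (sample environment, task sampler) pair.
sample_env, task_samplers = _prepare_meta_env(None)
env_spec = sample_env.spec  # used to construct the policy and baseline
# Per-meta-batch environments are then drawn later with something like
# task_samplers.sample(meta_batch_size) inside runner.setup(...).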
def test_pickling(task):
    if task in HARD_MODE_CLS_DICT['train']:
        env = ML1.get_train_tasks(task)
    else:
        env = ML1.get_test_tasks(task)
    env2 = pkl.loads(pkl.dumps(env))
    assert len(env._task_names) == 1
    assert len(env._task_names) == len(env2._task_names)
    assert env._task_names[0] == env2._task_names[0]
    np.testing.assert_equal(env._discrete_goals, env2._discrete_goals)
def test_benchmark_pearl(self):
    '''
    Compare benchmarks between metarl and baselines.

    :return:
    '''
    env_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_train_tasks('reach-v1'))))
    env = env_sampler.sample(params['num_train_tasks'])
    test_env_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_test_tasks('reach-v1'))))
    test_env = test_env_sampler.sample(params['num_train_tasks'])
    env_id = 'reach-v1'

    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
    benchmark_dir = osp.join(os.getcwd(), 'data', 'local', 'benchmarks',
                             'pearl', timestamp)
    result_json = {}
    seeds = random.sample(range(100), params['n_trials'])
    task_dir = osp.join(benchmark_dir, env_id)
    plt_file = osp.join(benchmark_dir, '{}_benchmark.png'.format(env_id))
    metarl_csvs = []

    for trial in range(params['n_trials']):
        seed = seeds[trial]
        trial_dir = task_dir + '/trial_%d_seed_%d' % (trial + 1, seed)
        metarl_dir = trial_dir + '/metarl'

        metarl_csv = run_metarl(env, test_env, seed, metarl_dir)
        metarl_csvs.append(metarl_csv)

    env.close()

    benchmark_helper.plot_average_over_trials(
        [metarl_csvs],
        ys=['Test/Average/SuccessRate'],
        plt_file=plt_file,
        env_id=env_id,
        x_label='TotalEnvSteps',
        y_label='Test/Average/SuccessRate',
        names=['metarl_pearl'],
    )

    factor_val = params['meta_batch_size'] * params['max_path_length']
    result_json[env_id] = benchmark_helper.create_json(
        [metarl_csvs],
        seeds=seeds,
        trials=params['n_trials'],
        xs=['TotalEnvSteps'],
        ys=['Test/Average/SuccessRate'],
        factors=[factor_val],
        names=['metarl_pearl'])

    Rh.write_file(result_json, 'PEARL')
def maml_trpo(ctxt, seed, epochs, rollouts_per_task, meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    env = GarageEnv(
        normalize(ML1.get_train_tasks('push-v1'), expected_action_scale=10.))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = LinearFeatureBaseline(env_spec=env.spec)

    max_path_length = 100

    test_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(ML1.get_test_tasks('push-v1'))))

    meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler,
                                   max_path_length=max_path_length)

    runner = LocalRunner(ctxt)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    value_function=value_function,
                    max_path_length=max_path_length,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    runner.setup(algo, env)
    runner.train(n_epochs=epochs,
                 batch_size=rollouts_per_task * max_path_length)
def run_task(args, *_):
    # env = TfEnv(normalize(dnc_envs.create_stochastic('pick')))  # Cannot be solved easily by TRPO
    metaworld_env = ML1.get_train_tasks("pick-place-v1")
    tasks = metaworld_env.sample_tasks(1)
    metaworld_env.set_task(tasks[0])
    metaworld_env._observation_space = convert_gym_space(
        metaworld_env.observation_space)
    metaworld_env._action_space = convert_gym_space(
        metaworld_env.action_space)
    env = TfEnv(normalize(metaworld_env))

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        min_std=1e-2,
        hidden_sizes=(150, 100, 50),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        # batch_size=50000,
        batch_size=100,
        force_batch_sampler=True,
        max_path_length=50,
        discount=1,
        step_size=0.02,
    )
    algo.train()
def test_all_ml1(name):
    train_env = ML1.get_train_tasks(name)
    tasks = train_env.sample_tasks(11)
    for t in tasks:
        train_env.set_task(t)
        step_env(train_env, max_path_length=3)
    train_env.close()
    del train_env

    test_env = ML1.get_test_tasks(name)
    tasks = test_env.sample_tasks(11)
    for t in tasks:
        test_env.set_task(t)
        step_env(test_env, max_path_length=3)
    test_env.close()
    del test_env
def __init__(self, max_episode_steps=150, out_of_distribution=False,
             n_train_tasks=50, n_test_tasks=10, **kwargs):
    super(ReachML1Env, self).__init__()
    self.train_env = ML1.get_train_tasks(
        'reach-v1', out_of_distribution=out_of_distribution)
    self.test_env = ML1.get_test_tasks(
        'reach-v1', out_of_distribution=out_of_distribution)
    self.train_tasks = self.train_env.sample_tasks(n_train_tasks)
    self.test_tasks = self.test_env.sample_tasks(n_test_tasks)
    self.tasks = self.train_tasks + self.test_tasks
    self.env = self.train_env  # this env will change depending on the idx
    self.observation_space = self.env.observation_space
    self.action_space = self.env.action_space
    self.goal_space_origin = np.array([0, 0.85, 0.175])
    self.current_task_idx = 0
    self.episode_steps = 0
    self._max_episode_steps = max_episode_steps
def test_ml1_random_init(task_name):
    env = ML1.get_train_tasks(task_name)
    tasks = env.sample_tasks(1)
    env.set_task(tasks[0])
    actual_goal = env.active_env.goal

    _ = env.reset()
    _, _, _, info_0 = env.step(env.action_space.sample())
    _ = env.reset()
    _, _, _, info_1 = env.step(env.action_space.sample())

    assert np.all(info_0['goal'] == info_1['goal'])
    assert np.all(info_0['goal'] == actual_goal)
def run_task(args, *_):
    # env = TfEnv(normalize(dnc_envs.create_stochastic('pick')))  # Cannot be solved easily by TRPO
    metaworld_env = ML1.get_train_tasks(
        'pick-place-v1')  # Create an environment with task `pick_place`
    tasks = metaworld_env.sample_tasks(
        1)  # Sample a task (in this case, a goal variation)
    metaworld_env.set_task(tasks[0])  # Set task
    # print(metaworld_env.id)
    # print("HERE")
    # import pdb; pdb.set_trace()
    metaworld_env = GymEnv2(metaworld_env)
    # metaworld_env.observation_space = convert_gym_space(metaworld_env.observation_space)
    # metaworld_env.action_space = convert_gym_space(metaworld_env.action_space)
    # env = metaworld_env
    env = TfEnv(normalize(metaworld_env))  # Cannot be solved easily by TRPO

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        min_std=1e-2,
        hidden_sizes=(150, 100, 50),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1,
        force_batch_sampler=True,
        max_path_length=50,
        discount=1,
        step_size=0.02,
    )
    algo.train()
        print(type(env))
        max_path_length = 64
    elif args.env == 'Walker':
        env = TfEnv(normalize(WalkerEnv_sparse()))
        print(type(env))
        max_path_length = 64
    elif args.env == 'Ant':
        env = TfEnv(normalize(AntGoalEnvSparse()))
        print(type(env))
        max_path_length = 64
    elif args.env == 'Cheetah':
        env = TfEnv(normalize(HalfCheetahVelEnv_sparse()))
        print(type(env))
        max_path_length = 64
    elif args.env == 'Push':
        env = TfEnv(normalize(ML1.get_train_tasks('push-v1')))
        max_path_length = 150
    elif args.env == 'Reach':
        env = TfEnv(normalize(ML1.get_train_tasks('reach-v1')))
        max_path_length = 150
    else:
        raise AssertionError('Not Implemented')

    ########################################################
    ##################### Algo Selection ####################
    if args.algo == 'Maesn':
        assert v['fast_learning_rate'] != 0, \
            'Fast learning rate needs to be non 0 for Maesn'
        policy = adaGaussPolicy(
def test_benchmark_rl2(self):  # pylint: disable=no-self-use
    """Compare benchmarks between metarl and baselines."""
    if ML:
        if env_ind == 2:
            envs = [ML1.get_train_tasks('push-v1')]
            env_ids = ['ML1-push-v1']
        elif env_ind == 3:
            envs = [ML1.get_train_tasks('reach-v1')]
            env_ids = ['ML1-reach-v1']
        elif env_ind == 4:
            envs = [ML1.get_train_tasks('pick-place-v1')]
            env_ids = ['ML1-pick-place-v1']
        else:
            raise ValueError("Env index is wrong")
    else:
        if env_ind == 0:
            envs = [HalfCheetahVelEnv]
            env_ids = ['HalfCheetahVelEnv']
        elif env_ind == 1:
            envs = [HalfCheetahDirEnv]
            env_ids = ['HalfCheetahDirEnv']
        else:
            raise ValueError("Env index is wrong")

    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
    benchmark_dir = './data/local/benchmarks/rl2/%s/' % timestamp
    result_json = {}
    for i, env in enumerate(envs):
        seeds = random.sample(range(100), hyper_parameters['n_trials'])
        task_dir = osp.join(benchmark_dir, env_ids[i])
        plt_file = osp.join(benchmark_dir,
                            '{}_benchmark.png'.format(env_ids[i]))
        metarl_tf_csvs = []
        promp_csvs = []

        for trial in range(hyper_parameters['n_trials']):
            seed = seeds[trial]
            trial_dir = task_dir + '/trial_%d_seed_%d' % (trial + 1, seed)
            promp_dir = trial_dir + '/promp'

            with tf.Graph().as_default():
                if isinstance(env, gym.Env):
                    env.reset()
                    promp_csv = run_promp(env, seed, promp_dir)
                else:
                    promp_csv = run_promp(env(), seed, promp_dir)
                promp_csvs.append(promp_csv)

            with open(osp.join(promp_dir, 'parameters.txt'), 'w') as outfile:
                json.dump(hyper_parameters, outfile)

        if isinstance(env, gym.Env):
            env.close()

        p_x = 'n_timesteps'
        if ML:
            p_ys = ['train-AverageReturn', 'train-SuccessRate']
        else:
            p_ys = ['train-AverageReturn']

        for p_y in p_ys:
            plt_file = osp.join(
                benchmark_dir,
                '{}_benchmark_promp_{}.png'.format(env_ids[i],
                                                   p_y.replace('/', '-')))
            Rh.relplot(g_csvs=promp_csvs,
                       b_csvs=None,
                       g_x=p_x,
                       g_y=p_y,
                       g_z='ProMP',
                       b_x=None,
                       b_y=None,
                       b_z='None',
                       trials=hyper_parameters['n_trials'],
                       seeds=seeds,
                       plt_file=plt_file,
                       env_id=env_ids[i])
def te_ppo_ml1_push(ctxt, seed, n_epochs, batch_size_per_task):
    """Train Task Embedding PPO with the ML1 push environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~LocalRunner` to create the :class:`~Snapshotter`.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.

    """
    set_seed(seed)
    envs = [
        normalize(
            GymEnv(ML1.get_train_tasks('push-v1'), max_episode_length=150))
    ]
    env = MultiEnvWrapper(envs, mode='del-onehot')

    latent_length = 2
    inference_window = 6
    batch_size = batch_size_per_task
    policy_ent_coeff = 2e-2
    encoder_ent_coeff = 2e-4
    inference_ce_coeff = 5e-2
    embedding_init_std = 0.1
    embedding_max_std = 0.2
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    with LocalTFRunner(snapshot_config=ctxt) as runner:
        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)

        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)

        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 10),
            std_share_network=True,
            init_std=2.0,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=(32, 16),
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec, features=['observations', 'tasks', 'latents'])

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     inference=inference,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_optimization_epochs=10,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        runner.setup(algo,
                     env,
                     sampler_cls=LocalSampler,
                     sampler_args=None,
                     worker_class=TaskEmbeddingWorker)

        runner.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)
from metaworld.benchmarks import ML1
import time

print(ML1.available_tasks())  # Check out the available tasks

env = ML1.get_train_tasks('pick-place-v1')  # Create an environment with task `pick_place`
tasks = env.sample_tasks(1)  # Sample a task (in this case, a goal variation)
env.set_task(tasks[0])  # Set task

obs = env.reset()  # Reset environment
for i in range(1000):
    print('iteration %d' % (i))
    if i % 100 == 0:
        obs = env.reset()
    env.render()
    a = env.action_space.sample()  # Sample an action
    obs, reward, done, info = env.step(a)  # Step the environment with the sampled random action
    print(obs)
    time.sleep(0.2)
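# The same loop can also be pointed at the held-out goal variations; a minimal
# sketch (not part of the original snippet) using ML1.get_test_tasks, which the
# test snippets elsewhere in this collection also exercise.
test_env = ML1.get_test_tasks('pick-place-v1')  # Environment with held-out goals
test_tasks = test_env.sample_tasks(1)           # Sample a held-out goal variation
test_env.set_task(test_tasks[0])                # Set task
obs = test_env.reset()
for _ in range(10):
    obs, reward, done, info = test_env.step(test_env.action_space.sample())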
def torch_pearl_ml1_push(ctxt=None,
                         seed=1,
                         num_epochs=1000,
                         num_train_tasks=50,
                         num_test_tasks=10,
                         latent_size=7,
                         encoder_hidden_size=200,
                         net_size=300,
                         meta_batch_size=16,
                         num_steps_per_epoch=4000,
                         num_initial_steps=4000,
                         num_tasks_sample=15,
                         num_steps_prior=750,
                         num_extra_rl_steps_posterior=750,
                         batch_size=256,
                         embedding_batch_size=64,
                         embedding_mini_batch_size=64,
                         max_path_length=150,
                         reward_scale=10.,
                         use_gpu=False):
    """Train PEARL with ML1 environments.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        num_epochs (int): Number of training epochs.
        num_train_tasks (int): Number of tasks for training.
        num_test_tasks (int): Number of tasks for testing.
        latent_size (int): Size of latent context vector.
        encoder_hidden_size (int): Output dimension of dense layer of the
            context encoder.
        net_size (int): Output dimension of a dense layer of Q-function and
            value function.
        meta_batch_size (int): Meta batch size.
        num_steps_per_epoch (int): Number of iterations per epoch.
        num_initial_steps (int): Number of transitions obtained per task
            before training.
        num_tasks_sample (int): Number of random tasks to obtain data for
            each iteration.
        num_steps_prior (int): Number of transitions to obtain per task with
            z ~ prior.
        num_extra_rl_steps_posterior (int): Number of additional transitions
            to obtain per task with z ~ posterior that are only used to train
            the policy and NOT the encoder.
        batch_size (int): Number of transitions in RL batch.
        embedding_batch_size (int): Number of transitions in context batch.
        embedding_mini_batch_size (int): Number of transitions in mini context
            batch; should be same as embedding_batch_size for non-recurrent
            encoder.
        max_path_length (int): Maximum path length.
        reward_scale (int): Reward scale.
        use_gpu (bool): Whether or not to use GPU for training.

    """
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)
    # create multi-task environment and sample tasks
    env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(ML1.get_train_tasks('push-v1'))))
    env = env_sampler.sample(num_train_tasks)
    test_env_sampler = SetTaskSampler(
        lambda: GarageEnv(normalize(ML1.get_test_tasks('push-v1'))))

    runner = LocalRunner(ctxt)

    # instantiate networks
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=PEARLWorker)

    worker_args = dict(deterministic=True, accum_context=True)
    meta_evaluator = MetaEvaluator(test_task_sampler=test_env_sampler,
                                   max_path_length=max_path_length,
                                   worker_class=PEARLWorker,
                                   worker_args=worker_args,
                                   n_test_tasks=num_test_tasks)
    pearl.evaluator = meta_evaluator

    runner.train(n_epochs=num_epochs, batch_size=batch_size)
def _thunk():
    if env_id.startswith("dm"):
        _, domain, task = env_id.split('.')
        env = dm_control2gym.make(domain_name=domain, task_name=task)
    elif env_id.startswith('metaworld_'):
        world_bench = env_id.split('_')[1]
        if world_bench.startswith('ml1.'):
            world_task = world_bench.split('.')[1]
            env = ML1.get_train_tasks(world_task)
        elif world_bench == 'ml10':
            env = ML10.get_train_tasks()
        elif world_bench == 'mt10':
            env = MT10.get_train_tasks()
        else:
            # raising a bare string is invalid in Python 3
            raise NotImplementedError(
                'This code only supports metaworld ml1, ml10 or mt10.')
        env = MetaworldWrapper(env)
    else:
        env = gym.make(env_id)
    if obs_keys is not None:
        env = gym.wrappers.FlattenDictWrapper(env, dict_keys=obs_keys)

    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = make_atari(env_id)

    env.seed(seed + rank)
    obs_shape = env.observation_space.shape

    # if str(env.__class__.__name__).find('TimeLimit') >= 0:
    #     env = TimeLimitMask(env)

    if log_dir is not None:
        if save_video:
            env = bench.Monitor(env,
                                os.path.join(log_dir + '/eval/monitor',
                                             str(rank)),
                                allow_early_resets=allow_early_resets)
            env = gym.wrappers.Monitor(env,
                                       os.path.join(log_dir + '/eval/video',
                                                    str(rank)),
                                       force=True)
        else:
            env = bench.Monitor(env,
                                os.path.join(log_dir + '/monitor', str(rank)),
                                allow_early_resets=allow_early_resets)

    if is_atari:
        if len(env.observation_space.shape) == 3:
            env = wrap_deepmind(env)
    elif len(env.observation_space.shape) == 3:
        raise NotImplementedError(
            "CNN models work only for atari,\n"
            "please use a custom wrapper for a custom pixel input env.\n"
            "See wrap_deepmind for an example.")

    # If the input has shape (W,H,3), wrap for PyTorch convolutions
    obs_shape = env.observation_space.shape
    if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
        env = TransposeImage(env, op=[2, 0, 1])

    return env
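# How a thunk like this is typically consumed (a sketch, not this file's actual
# code): an outer factory -- called make_env(...) here, hypothetical -- returns
# _thunk, and a list of such thunks is handed to a vectorized-env wrapper such
# as baselines' SubprocVecEnv or DummyVecEnv.
#
# env_fns = [make_env('metaworld_ml1.push-v1', seed, rank, log_dir)
#            for rank in range(num_processes)]
# vec_env = SubprocVecEnv(env_fns) if num_processes > 1 else DummyVecEnv(env_fns)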
def rl2_ppo_halfcheetah(ctxt=None, seed=1):
    """Train PPO with HalfCheetah environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        max_path_length = 100
        meta_batch_size = 10
        n_epochs = 50
        episode_per_task = 4

        # ---- For ML1-push
        from metaworld.benchmarks import ML1
        tasks = task_sampler.SetTaskSampler(
            lambda: RL2Env(env=ML1.get_train_tasks('push-v1')))

        # ---- For HalfCheetahVel
        # tasks = task_sampler.SetTaskSampler(lambda: RL2Env(
        #     env=HalfCheetahVelEnv()))

        env_spec = tasks.sample(1)[0]().spec
        policy = GaussianGRUPolicy(name='policy',
                                   hidden_dim=64,
                                   env_spec=env_spec,
                                   state_include_action=False)

        baseline = LinearFeatureBaseline(env_spec=env_spec)

        inner_algo = RL2PPO(
            env_spec=env_spec,
            policy=policy,
            baseline=baseline,
            max_path_length=max_path_length * episode_per_task,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )

        algo = RL2(policy=policy,
                   inner_algo=inner_algo,
                   max_path_length=max_path_length,
                   meta_batch_size=meta_batch_size,
                   task_sampler=tasks)

        runner.setup(algo,
                     tasks.sample(meta_batch_size),
                     sampler_cls=LocalSampler,
                     n_workers=meta_batch_size,
                     worker_class=RL2Worker)

        runner.train(n_epochs=n_epochs,
                     batch_size=episode_per_task * max_path_length *
                     meta_batch_size)
def test_pearl_ml1_push(self):
    """Test PEARL with ML1 Push environment."""
    params = dict(seed=1,
                  num_epochs=1,
                  num_train_tasks=5,
                  num_test_tasks=1,
                  latent_size=7,
                  encoder_hidden_sizes=[10, 10, 10],
                  net_size=30,
                  meta_batch_size=16,
                  num_steps_per_epoch=40,
                  num_initial_steps=40,
                  num_tasks_sample=15,
                  num_steps_prior=15,
                  num_extra_rl_steps_posterior=15,
                  batch_size=256,
                  embedding_batch_size=8,
                  embedding_mini_batch_size=8,
                  max_path_length=50,
                  reward_scale=10.,
                  use_information_bottleneck=True,
                  use_next_obs_in_context=False,
                  use_gpu=False)

    net_size = params['net_size']
    set_seed(params['seed'])
    env_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_train_tasks('push-v1'))))
    env = env_sampler.sample(params['num_train_tasks'])

    test_env_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_test_tasks('push-v1'))))

    augmented_env = PEARL.augment_env_spec(env[0](), params['latent_size'])
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), params['latent_size'], 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=params['num_train_tasks'],
        num_test_tasks=params['num_test_tasks'],
        latent_dim=params['latent_size'],
        encoder_hidden_sizes=params['encoder_hidden_sizes'],
        test_env_sampler=test_env_sampler,
        meta_batch_size=params['meta_batch_size'],
        num_steps_per_epoch=params['num_steps_per_epoch'],
        num_initial_steps=params['num_initial_steps'],
        num_tasks_sample=params['num_tasks_sample'],
        num_steps_prior=params['num_steps_prior'],
        num_extra_rl_steps_posterior=params['num_extra_rl_steps_posterior'],
        batch_size=params['batch_size'],
        embedding_batch_size=params['embedding_batch_size'],
        embedding_mini_batch_size=params['embedding_mini_batch_size'],
        max_path_length=params['max_path_length'],
        reward_scale=params['reward_scale'],
    )

    set_gpu_mode(params['use_gpu'], gpu_id=0)
    if params['use_gpu']:
        pearl.to()

    runner = LocalRunner(snapshot_config)
    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=params['max_path_length']),
                 n_workers=1,
                 worker_class=PEARLWorker)

    runner.train(n_epochs=params['num_epochs'],
                 batch_size=params['batch_size'])
import pytest

from metaworld.benchmarks import ML1, MT10, ML10, ML45, MT50
from tests.helpers import step_env


@pytest.mark.parametrize('name', ML1.available_tasks())
def test_all_ml1(name):
    train_env = ML1.get_train_tasks(name)
    tasks = train_env.sample_tasks(11)
    for t in tasks:
        train_env.set_task(t)
        step_env(train_env, max_path_length=3)
    train_env.close()
    del train_env

    test_env = ML1.get_test_tasks(name)
    tasks = test_env.sample_tasks(11)
    for t in tasks:
        test_env.set_task(t)
        step_env(test_env, max_path_length=3)
    test_env.close()
    del test_env


def test_all_ml10():
    ml10_train_env = ML10.get_train_tasks()
    train_tasks = ml10_train_env.sample_tasks(11)
    for t in train_tasks:
def sac(args, steps_per_epoch=1500, replay_size=int(1e6), gamma=0.99,
        polyak=0.995, lr=1e-3, alpha=3e-4, batch_size=128, start_steps=1000,
        update_after=1000, update_every=1, num_test_episodes=10,
        max_ep_len=150, logger_kwargs=dict(), save_freq=1):

    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)

    torch.set_num_threads(torch.get_num_threads())

    actor_critic = core.MLPActorCritic
    ac_kwargs = dict(hidden_sizes=[args.hid] * args.l)
    gamma = args.gamma
    seed = args.seed
    epochs = args.epochs
    logger_tensor = Logger(logdir=args.logdir,
                           run_name="{}-{}".format(args.model_name,
                                                   time.ctime()))

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env = ML1.get_train_tasks('reach-v1')  # Create an environment with task `reach`
    tasks = env.sample_tasks(1)  # Sample a task (in this case, a goal variation)
    env.set_task(tasks[0])  # Set task

    test_env = ML1.get_train_tasks('reach-v1')  # Separate environment for evaluation
    tasks = env.sample_tasks(1)  # Sample a task (in this case, a goal variation)
    test_env.set_task(tasks[0])  # Set task

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim,
                                 size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module)
                       for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
               var_counts)

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], \
                         data['obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup) ** 2).mean()
        loss_q2 = ((q2 - backup) ** 2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().numpy(),
                      Q2Vals=q2.detach().numpy())

        return loss_q, q_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)
        q1_pi = ac.q1(o, pi)
        q2_pi = ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (alpha * logp_pi - q_pi).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().numpy())

        return loss_pi, pi_info

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=3e-4)
    q_optimizer = Adam(q_params, lr=3e-4)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, logger_tensor, t):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)
        logger_tensor.log_value(t, loss_q.item(), "loss q")

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize it at next DDPG step.
        for p in q_params:
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)
        logger_tensor.log_value(t, loss_pi.item(), "loss pi")

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use in-place operations "mul_", "add_" to update
                # target params, as opposed to "mul" and "add", which would
                # make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            logger_tensor.log_value(t, ep_ret, "test ep reward")
            logger_tensor.log_value(t, ep_len, "test ep length")

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration.
        # Afterwards, use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger_tensor.log_value(t, ep_ret, "reward")
            logging.info("> total_steps={} | reward={}".format(t, ep_ret))
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch, logger_tensor=logger_tensor, t=t)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger_tensor.log_value(t, epoch, "epoch")
            logger.dump_tabular(logger_tensor=logger_tensor, epoch=epoch)

    ac.save(args.save_model_dir, args.model_name)
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters

    """
    # create multi-task environment and sample tasks
    env_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_train_tasks('push-v1'))))
    env = env_sampler.sample(params['num_train_tasks'])
    test_env_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_test_tasks('push-v1'))))
    test_env = test_env_sampler.sample(params['num_train_tasks'])

    runner = LocalRunner(snapshot_config)
    obs_dim = int(np.prod(env[0]().observation_space.shape))
    action_dim = int(np.prod(env[0]().action_space.shape))
    reward_dim = 1

    # instantiate networks
    encoder_in_dim = obs_dim + action_dim + reward_dim
    encoder_out_dim = params['latent_size'] * 2
    net_size = params['net_size']

    context_encoder = MLPEncoder(input_dim=encoder_in_dim,
                                 output_dim=encoder_out_dim,
                                 hidden_sizes=[200, 200, 200])

    space_a = akro.Box(low=-1,
                       high=1,
                       shape=(obs_dim + params['latent_size'], ),
                       dtype=np.float32)
    space_b = akro.Box(low=-1, high=1, shape=(action_dim, ), dtype=np.float32)
    augmented_env = EnvSpec(space_a, space_b)

    qf1 = ContinuousMLPQFunction(env_spec=augmented_env,
                                 hidden_sizes=[net_size, net_size, net_size])
    qf2 = ContinuousMLPQFunction(env_spec=augmented_env,
                                 hidden_sizes=[net_size, net_size, net_size])

    obs_space = akro.Box(low=-1, high=1, shape=(obs_dim, ), dtype=np.float32)
    action_space = akro.Box(low=-1,
                            high=1,
                            shape=(params['latent_size'], ),
                            dtype=np.float32)
    vf_env = EnvSpec(obs_space, action_space)

    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    policy = TanhGaussianMLPPolicy2(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    context_conditioned_policy = ContextConditionedPolicy(
        latent_dim=params['latent_size'],
        context_encoder=context_encoder,
        policy=policy,
        use_ib=params['use_information_bottleneck'],
        use_next_obs=params['use_next_obs_in_context'],
    )

    pearlsac = PEARLSAC(
        env=env,
        test_env=test_env,
        policy=context_conditioned_policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        num_train_tasks=params['num_train_tasks'],
        num_test_tasks=params['num_test_tasks'],
        latent_dim=params['latent_size'],
        meta_batch_size=params['meta_batch_size'],
        num_steps_per_epoch=params['num_steps_per_epoch'],
        num_initial_steps=params['num_initial_steps'],
        num_tasks_sample=params['num_tasks_sample'],
        num_steps_prior=params['num_steps_prior'],
        num_extra_rl_steps_posterior=params['num_extra_rl_steps_posterior'],
        num_evals=params['num_evals'],
        num_steps_per_eval=params['num_steps_per_eval'],
        batch_size=params['batch_size'],
        embedding_batch_size=params['embedding_batch_size'],
        embedding_mini_batch_size=params['embedding_mini_batch_size'],
        max_path_length=params['max_path_length'],
        reward_scale=params['reward_scale'],
    )

    tu.set_gpu_mode(params['use_gpu'], gpu_id=0)
    if params['use_gpu']:
        pearlsac.to()

    runner.setup(algo=pearlsac,
                 env=env,
                 sampler_cls=PEARLSampler,
                 sampler_args=dict(max_path_length=params['max_path_length']))
    runner.train(n_epochs=params['num_epochs'],
                 batch_size=params['batch_size'])
def test_benchmark_rl2(self):  # pylint: disable=no-self-use
    """Compare benchmarks between metarl and baselines."""
    if ML:
        if env_ind == 2:
            envs = [ML1.get_train_tasks('push-v1')]
            env_ids = ['ML1-push-v1']
        elif env_ind == 3:
            envs = [ML1.get_train_tasks('reach-v1')]
            env_ids = ['ML1-reach-v1']
        elif env_ind == 4:
            envs = [ML1.get_train_tasks('pick-place-v1')]
            env_ids = ['ML1-pick-place-v1']
        else:
            raise ValueError("Env index is wrong")
    else:
        if env_ind == 0:
            envs = [HalfCheetahVelEnv]
            env_ids = ['HalfCheetahVelEnv']
        elif env_ind == 1:
            envs = [HalfCheetahDirEnv]
            env_ids = ['HalfCheetahDirEnv']
        else:
            raise ValueError("Env index is wrong")

    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
    benchmark_dir = './data/local/benchmarks/rl2/%s/' % timestamp
    result_json = {}
    for i, env in enumerate(envs):
        seeds = random.sample(range(100), hyper_parameters['n_trials'])
        task_dir = osp.join(benchmark_dir, env_ids[i])
        plt_file = osp.join(benchmark_dir,
                            '{}_benchmark.png'.format(env_ids[i]))
        metarl_tf_csvs = []
        promp_csvs = []

        for trial in range(hyper_parameters['n_trials']):
            seed = seeds[trial]
            trial_dir = task_dir + '/trial_%d_seed_%d' % (trial + 1, seed)
            metarl_tf_dir = trial_dir + '/metarl'
            promp_dir = trial_dir + '/promp'

            with tf.Graph().as_default():
                metarl_tf_csv = run_metarl(env, seed, metarl_tf_dir)
            metarl_tf_csvs.append(metarl_tf_csv)

            with open(osp.join(metarl_tf_dir, 'parameters.txt'),
                      'w') as outfile:
                hyper_parameters_copy = copy.deepcopy(hyper_parameters)
                hyper_parameters_copy['sampler_cls'] = str(
                    hyper_parameters_copy['sampler_cls'])
                json.dump(hyper_parameters_copy, outfile)

        g_x = 'TotalEnvSteps'
        if ML:
            g_ys = [
                'Evaluation/AverageReturn',
                'Evaluation/SuccessRate',
            ]
        else:
            g_ys = [
                'Evaluation/AverageReturn',
            ]

        for g_y in g_ys:
            plt_file = osp.join(
                benchmark_dir,
                '{}_benchmark_rl2_{}.png'.format(env_ids[i],
                                                 g_y.replace('/', '-')))
            Rh.relplot(g_csvs=metarl_tf_csvs,
                       b_csvs=None,
                       g_x=g_x,
                       g_y=g_y,
                       g_z='MetaRL',
                       b_x=None,
                       b_y=None,
                       b_z=None,
                       trials=hyper_parameters['n_trials'],
                       seeds=seeds,
                       plt_file=plt_file,
                       env_id=env_ids[i],
                       x_label=g_x,
                       y_label=g_y)