def test_to():
    """Test the torch function that moves modules to GPU.

    Test that the policy, Q-functions, and log_alpha are moved to the GPU
    if one is available.

    """
    env_names = ['CartPole-v0', 'CartPole-v1']
    task_envs = [MetaRLEnv(env_name=name) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[1, 1],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    num_tasks = 2
    buffer_batch_size = 2
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=150,
                  eval_env=env,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)
    set_gpu_mode(torch.cuda.is_available())
    mtsac.to()
    device = global_device()
    for param in mtsac._qf1.parameters():
        assert param.device == device
    for param in mtsac._qf2.parameters():
        assert param.device == device
    for param in mtsac._policy.parameters():
        assert param.device == device
    assert mtsac._log_alpha.device == device

def test_fixed_alpha():
    """Test that using fixed_alpha makes alpha non-differentiable."""
    env_names = ['InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v2']
    task_envs = [MetaRLEnv(env_name=name) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    test_envs = MultiEnvWrapper(task_envs,
                                sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    runner = LocalRunner(snapshot_config=snapshot_config)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    num_tasks = 2
    buffer_batch_size = 128
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=100,
                  max_path_length=100,
                  eval_env=test_envs,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=1,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size,
                  fixed_alpha=np.exp(0.5))
    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    mtsac.to()
    # fixed_alpha=np.exp(0.5) implies log_alpha == 0.5 for every task.
    assert torch.allclose(torch.Tensor([0.5] * num_tasks),
                          mtsac._log_alpha.to('cpu'))
    runner.setup(mtsac, env, sampler_cls=LocalSampler)
    runner.train(n_epochs=1, batch_size=128, plot=False)
    # log_alpha must be unchanged after training when alpha is fixed.
    assert torch.allclose(torch.Tensor([0.5] * num_tasks),
                          mtsac._log_alpha.to('cpu'))
    assert not mtsac._use_automatic_entropy_tuning

def sac_half_cheetah_batch(ctxt=None, seed=1):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(snapshot_config=ctxt)
    env = MetaRLEnv(normalize(gym.make('HalfCheetah-v2')))

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[256, 256],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              gradient_steps_per_itr=1000,
              max_path_length=500,
              replay_buffer=replay_buffer,
              min_buffer_size=1e4,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=256,
              reward_scale=1.,
              steps_per_epoch=1)

    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    sac.to()
    runner.setup(algo=sac, env=env, sampler_cls=LocalSampler)
    runner.train(n_epochs=1000, batch_size=1000)

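# A minimal sketch of how a launcher like `sac_half_cheetah_batch` is usually
# invoked. This assumes metarl mirrors garage's `wrap_experiment` decorator;
# the import path and decorator options are assumptions, not confirmed here:
#
#     from metarl import wrap_experiment
#
#     @wrap_experiment(snapshot_mode='none')
#     def sac_half_cheetah_batch(ctxt=None, seed=1):
#         ...
#
#     sac_half_cheetah_batch(seed=521)  # `ctxt` is supplied by the decorator
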
def test_utils_set_gpu_mode():
    """Test setting gpu mode to False to force CPU."""
    if torch.cuda.is_available():
        set_gpu_mode(mode=True)
        assert global_device() == torch.device('cuda:0')
        assert tu._USE_GPU
    else:
        set_gpu_mode(mode=False)
        assert global_device() == torch.device('cpu')
        assert not tu._USE_GPU
    assert not tu._GPU_ID

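# Typical device-selection pattern exercised by the algorithms in this file
# (a minimal sketch; `torch.nn.Linear` is only an illustrative module, not
# part of the code above):
#
#     set_gpu_mode(torch.cuda.is_available())    # choose the GPU iff available
#     module = torch.nn.Linear(4, 2)
#     module = module.to(global_device())        # parameters move to that device
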
def test_sac_inverted_double_pendulum():
    """Test SAC performance on InvertedDoublePendulum-v2."""
    # pylint: disable=unexpected-keyword-arg
    env = MetaRLEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    runner = LocalRunner(snapshot_config=snapshot_config)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              gradient_steps_per_itr=100,
              max_path_length=100,
              replay_buffer=replay_buffer,
              min_buffer_size=1e3,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=2)
    runner.setup(sac, env, sampler_cls=LocalSampler)
    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    sac.to()
    ret = runner.train(n_epochs=12, batch_size=200, plot=False)
    # Check that automatic entropy tuning is used.
    assert sac._use_automatic_entropy_tuning
    # Assert that a gradient was properly connected to alpha. This does not
    # verify that the path from the temperature objective is correct.
    assert not torch.allclose(torch.Tensor([1.]), sac._log_alpha.to('cpu'))
    # Check that the policy is learning beyond a pre-decided threshold.
    assert ret > 85

def pearl_metaworld_ml1_push(ctxt=None,
                             seed=1,
                             num_epochs=1000,
                             num_train_tasks=50,
                             num_test_tasks=10,
                             latent_size=7,
                             encoder_hidden_size=200,
                             net_size=300,
                             meta_batch_size=16,
                             num_steps_per_epoch=4000,
                             num_initial_steps=4000,
                             num_tasks_sample=15,
                             num_steps_prior=750,
                             num_extra_rl_steps_posterior=750,
                             batch_size=256,
                             embedding_batch_size=64,
                             embedding_mini_batch_size=64,
                             max_path_length=150,
                             reward_scale=10.,
                             use_gpu=False):
    """Train PEARL with ML1 environments.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        num_epochs (int): Number of training epochs.
        num_train_tasks (int): Number of tasks for training.
        num_test_tasks (int): Number of tasks for testing.
        latent_size (int): Size of latent context vector.
        encoder_hidden_size (int): Output dimension of dense layer of the
            context encoder.
        net_size (int): Output dimension of a dense layer of Q-function and
            value function.
        meta_batch_size (int): Meta batch size.
        num_steps_per_epoch (int): Number of iterations per epoch.
        num_initial_steps (int): Number of transitions obtained per task
            before training.
        num_tasks_sample (int): Number of random tasks to obtain data for
            each iteration.
        num_steps_prior (int): Number of transitions to obtain per task with
            z ~ prior.
        num_extra_rl_steps_posterior (int): Number of additional transitions
            to obtain per task with z ~ posterior that are only used to train
            the policy and NOT the encoder.
        batch_size (int): Number of transitions in RL batch.
        embedding_batch_size (int): Number of transitions in context batch.
        embedding_mini_batch_size (int): Number of transitions in mini context
            batch; should be same as embedding_batch_size for non-recurrent
            encoder.
        max_path_length (int): Maximum path length.
        reward_scale (int): Reward scale.
        use_gpu (bool): Whether or not to use GPU for training.

    """
    set_seed(seed)
    encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size,
                            encoder_hidden_size)
    # Create multi-task environment and sample tasks.
    env_sampler = SetTaskSampler(lambda: MetaRLEnv(
        normalize(mwb.ML1.get_train_tasks('push-v1'))))
    env = env_sampler.sample(num_train_tasks)
    test_env_sampler = SetTaskSampler(lambda: MetaRLEnv(
        normalize(mwb.ML1.get_test_tasks('push-v1'))))

    runner = LocalRunner(ctxt)

    # Instantiate networks.
    augmented_env = PEARL.augment_env_spec(env[0](), latent_size)
    qf = ContinuousMLPQFunction(env_spec=augmented_env,
                                hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf')
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=num_train_tasks,
        num_test_tasks=num_test_tasks,
        latent_dim=latent_size,
        encoder_hidden_sizes=encoder_hidden_sizes,
        test_env_sampler=test_env_sampler,
        meta_batch_size=meta_batch_size,
        num_steps_per_epoch=num_steps_per_epoch,
        num_initial_steps=num_initial_steps,
        num_tasks_sample=num_tasks_sample,
        num_steps_prior=num_steps_prior,
        num_extra_rl_steps_posterior=num_extra_rl_steps_posterior,
        batch_size=batch_size,
        embedding_batch_size=embedding_batch_size,
        embedding_mini_batch_size=embedding_mini_batch_size,
        max_path_length=max_path_length,
        reward_scale=reward_scale,
    )

    set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearl.to()

    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(max_path_length=max_path_length),
                 n_workers=1,
                 worker_class=PEARLWorker)

    runner.train(n_epochs=num_epochs, batch_size=batch_size)

def mtsac_metaworld_mt50(ctxt=None, seed=1, use_gpu=False, _gpu=0):
    """Train MTSAC with the MT50 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        use_gpu (bool): Used to enable usage of GPU in training.
        _gpu (int): The ID of the gpu (used on multi-gpu machines).

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(ctxt)
    task_names = mwb.MT50.get_train_tasks().all_task_names
    train_envs = []
    test_envs = []
    for task_name in task_names:
        train_env = normalize(MetaRLEnv(mwb.MT50.from_task(task_name)),
                              normalize_reward=True)
        test_env = normalize(MetaRLEnv(mwb.MT50.from_task(task_name)))
        train_envs.append(train_env)
        test_envs.append(test_env)
    mt50_train_envs = MultiEnvWrapper(train_envs,
                                      sample_strategy=round_robin_strategy,
                                      mode='vanilla')
    mt50_test_envs = MultiEnvWrapper(test_envs,
                                     sample_strategy=round_robin_strategy,
                                     mode='vanilla')
    policy = TanhGaussianMLPPolicy(
        env_spec=mt50_train_envs.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=mt50_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=mt50_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    timesteps = 100000000
    batch_size = int(150 * mt50_train_envs.num_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=250,
                  eval_env=mt50_test_envs,
                  env_spec=mt50_train_envs.spec,
                  num_tasks=10,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=7500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=6400)
    set_gpu_mode(use_gpu, _gpu)
    mtsac.to()
    runner.setup(algo=mtsac, env=mt50_train_envs, sampler_cls=LocalSampler)
    runner.train(n_epochs=epochs, batch_size=batch_size)

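# With the settings above, the epoch arithmetic resolves as follows (plain
# integer division, assuming mt50_train_envs.num_tasks == 50):
#     batch_size   = 150 * 50            = 7500 environment steps per epoch
#     epochs       = 100000000 // 7500   = 13333
#     epoch_cycles = 13333 // 500        = 26   (gradient cycles per epoch)
#     epochs       = 13333 // 26         = 512  (epochs actually run)
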
def test_pearl_ml1_push(self):
    """Test PEARL with ML1 Push environment."""
    params = dict(seed=1,
                  num_epochs=1,
                  num_train_tasks=5,
                  num_test_tasks=1,
                  latent_size=7,
                  encoder_hidden_sizes=[10, 10, 10],
                  net_size=30,
                  meta_batch_size=16,
                  num_steps_per_epoch=40,
                  num_initial_steps=40,
                  num_tasks_sample=15,
                  num_steps_prior=15,
                  num_extra_rl_steps_posterior=15,
                  batch_size=256,
                  embedding_batch_size=8,
                  embedding_mini_batch_size=8,
                  max_path_length=50,
                  reward_scale=10.,
                  use_information_bottleneck=True,
                  use_next_obs_in_context=False,
                  use_gpu=False)

    net_size = params['net_size']
    set_seed(params['seed'])
    env_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_train_tasks('push-v1'))))
    env = env_sampler.sample(params['num_train_tasks'])

    test_env_sampler = SetTaskSampler(
        lambda: MetaRLEnv(normalize(ML1.get_test_tasks('push-v1'))))

    augmented_env = PEARL.augment_env_spec(env[0](), params['latent_size'])
    qf = ContinuousMLPQFunction(
        env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size])

    vf_env = PEARL.get_env_spec(env[0](), params['latent_size'], 'vf')
    vf = ContinuousMLPQFunction(
        env_spec=vf_env, hidden_sizes=[net_size, net_size, net_size])

    inner_policy = TanhGaussianMLPPolicy(
        env_spec=augmented_env,
        hidden_sizes=[net_size, net_size, net_size])

    pearl = PEARL(
        env=env,
        policy_class=ContextConditionedPolicy,
        encoder_class=MLPEncoder,
        inner_policy=inner_policy,
        qf=qf,
        vf=vf,
        num_train_tasks=params['num_train_tasks'],
        num_test_tasks=params['num_test_tasks'],
        latent_dim=params['latent_size'],
        encoder_hidden_sizes=params['encoder_hidden_sizes'],
        test_env_sampler=test_env_sampler,
        meta_batch_size=params['meta_batch_size'],
        num_steps_per_epoch=params['num_steps_per_epoch'],
        num_initial_steps=params['num_initial_steps'],
        num_tasks_sample=params['num_tasks_sample'],
        num_steps_prior=params['num_steps_prior'],
        num_extra_rl_steps_posterior=params['num_extra_rl_steps_posterior'],
        batch_size=params['batch_size'],
        embedding_batch_size=params['embedding_batch_size'],
        embedding_mini_batch_size=params['embedding_mini_batch_size'],
        max_path_length=params['max_path_length'],
        reward_scale=params['reward_scale'],
    )

    set_gpu_mode(params['use_gpu'], gpu_id=0)
    if params['use_gpu']:
        pearl.to()

    runner = LocalRunner(snapshot_config)
    runner.setup(algo=pearl,
                 env=env[0](),
                 sampler_cls=LocalSampler,
                 sampler_args=dict(
                     max_path_length=params['max_path_length']),
                 n_workers=1,
                 worker_class=PEARLWorker)

    runner.train(n_epochs=params['num_epochs'],
                 batch_size=params['batch_size'])

def mtsac_metaworld_ml1_pick_place(ctxt=None, seed=1, _gpu=None):
    """Train MTSAC with the ML1 pick-place-v1 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        _gpu (int): The ID of the gpu to be used (used on multi-gpu machines).

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(ctxt)
    train_envs = []
    test_envs = []
    env_names = []
    for i in range(50):
        train_env = MetaRLEnv(
            normalize(mwb.ML1.get_train_tasks('pick-place-v1'),
                      normalize_reward=True))
        test_env = pickle.loads(pickle.dumps(train_env))
        env_names.append('pick_place_{}'.format(i))
        train_envs.append(train_env)
        test_envs.append(test_env)
    ml1_train_envs = MultiEnvWrapper(train_envs,
                                     sample_strategy=round_robin_strategy,
                                     env_names=env_names)
    ml1_test_envs = MultiEnvWrapper(test_envs,
                                    sample_strategy=round_robin_strategy,
                                    env_names=env_names)
    policy = TanhGaussianMLPPolicy(
        env_spec=ml1_train_envs.spec,
        hidden_sizes=[400, 400, 400],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=ml1_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=ml1_train_envs.spec,
                                 hidden_sizes=[400, 400, 400],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    timesteps = 10000000
    batch_size = int(150 * ml1_train_envs.num_tasks)
    num_evaluation_points = 500
    epochs = timesteps // batch_size
    epoch_cycles = epochs // num_evaluation_points
    epochs = epochs // epoch_cycles
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=150,
                  eval_env=ml1_test_envs,
                  env_spec=ml1_train_envs.spec,
                  num_tasks=50,
                  steps_per_epoch=epoch_cycles,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1500,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=1280)
    if _gpu is not None:
        set_gpu_mode(True, _gpu)
    mtsac.to()
    runner.setup(algo=mtsac, env=ml1_train_envs, sampler_cls=LocalSampler)
    runner.train(n_epochs=epochs, batch_size=batch_size)