def test_entropy(self): """Test get_entropy method of the policy.""" env_spec = TfEnv(DummyBoxEnv()) init_std = 1. obs = torch.Tensor([0, 0, 0, 0]).float() policy = TanhGaussianMLPPolicy(env_spec=env_spec, hidden_sizes=(1, ), init_std=init_std, hidden_nonlinearity=None, std_parameterization='exp', hidden_w_init=nn.init.ones_, output_w_init=nn.init.ones_) dist = policy(obs) assert torch.allclose(dist.entropy(), policy.entropy(obs))
def sac_half_cheetah_batch(ctxt=None, seed=1): """Set up environment and algorithm and run the task. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ deterministic.set_seed(seed) trainer = Trainer(snapshot_config=ctxt) env = normalize(GymEnv('HalfCheetah-v2')) policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[256, 256], hidden_nonlinearity=nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[256, 256], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[256, 256], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) sampler = LocalSampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, worker_class=FragmentWorker) sac = SAC(env_spec=env.spec, policy=policy, qf1=qf1, qf2=qf2, sampler=sampler, gradient_steps_per_itr=1000, max_episode_length_eval=1000, replay_buffer=replay_buffer, min_buffer_size=1e4, target_update_tau=5e-3, discount=0.99, buffer_batch_size=256, reward_scale=1., steps_per_epoch=1) if torch.cuda.is_available(): set_gpu_mode(True) else: set_gpu_mode(False) sac.to() trainer.setup(algo=sac, env=env) trainer.train(n_epochs=1000, batch_size=1000)
def sac_setup(env, trainer, args): policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[args.hidden_dim] * args.depth, hidden_nonlinearity=nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[args.hidden_dim] * args.depth, hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[args.hidden_dim] * args.depth, hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(args.buffer_size)) sac = SAC(env_spec=env.spec, policy=policy, qf1=qf1, qf2=qf2, **convert_kwargs(args, SAC)) trainer.setup(algo=sac, env=env, sampler_cls=LocalSampler) return sac
def test_get_action_dict_space(self): """Test if observations from dict obs spaces are properly flattened.""" env = GymEnv(DummyDictEnv(obs_space_type='box', act_space_type='box')) policy = TanhGaussianMLPPolicy(env_spec=env.spec, hidden_nonlinearity=None, hidden_sizes=(1, ), hidden_w_init=nn.init.ones_, output_w_init=nn.init.ones_) obs = env.reset()[0] action, _ = policy.get_action(obs) assert env.action_space.shape == action.shape actions, _ = policy.get_actions(np.array([obs, obs])) for action in actions: assert env.action_space.shape == action.shape
def test_fixed_alpha(): """Test if using fixed_alpha ensures that alpha is non differentiable.""" env_names = ['InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v2'] task_envs = [GymEnv(name, max_episode_length=100) for name in env_names] env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy) test_envs = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy) deterministic.set_seed(0) trainer = Trainer(snapshot_config=snapshot_config) policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=torch.nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) num_tasks = 2 buffer_batch_size = 128 sampler = LocalSampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, worker_class=FragmentWorker) mtsac = MTSAC(policy=policy, qf1=qf1, qf2=qf2, sampler=sampler, gradient_steps_per_itr=100, eval_env=[test_envs], env_spec=env.spec, num_tasks=num_tasks, steps_per_epoch=1, replay_buffer=replay_buffer, min_buffer_size=1e3, target_update_tau=5e-3, discount=0.99, buffer_batch_size=buffer_batch_size, fixed_alpha=np.exp(0.5)) if torch.cuda.is_available(): set_gpu_mode(True) else: set_gpu_mode(False) mtsac.to() assert torch.allclose(torch.Tensor([0.5] * num_tasks), mtsac._log_alpha.to('cpu')) trainer.setup(mtsac, env) trainer.train(n_epochs=1, batch_size=128, plot=False) assert torch.allclose(torch.Tensor([0.5] * num_tasks), mtsac._log_alpha.to('cpu')) assert not mtsac._use_automatic_entropy_tuning
def test_to(): """Test the torch function that moves modules to GPU. Test that the policy and qfunctions are moved to gpu if gpu is available. """ env_names = ['CartPole-v0', 'CartPole-v1'] task_envs = [GarageEnv(env_name=name) for name in env_names] env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy) deterministic.set_seed(0) policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[1, 1], hidden_nonlinearity=torch.nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[1, 1], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[1, 1], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) num_tasks = 2 buffer_batch_size = 2 mtsac = MTSAC(policy=policy, qf1=qf1, qf2=qf2, gradient_steps_per_itr=150, max_path_length=150, eval_env=env, env_spec=env.spec, num_tasks=num_tasks, steps_per_epoch=5, replay_buffer=replay_buffer, min_buffer_size=1e3, target_update_tau=5e-3, discount=0.99, buffer_batch_size=buffer_batch_size) set_gpu_mode(torch.cuda.is_available()) mtsac.to() device = global_device() for param in mtsac._qf1.parameters(): assert param.device == device for param in mtsac._qf2.parameters(): assert param.device == device for param in mtsac._qf2.parameters(): assert param.device == device for param in mtsac.policy.parameters(): assert param.device == device assert mtsac._log_alpha.device == device
def torch_sac_half_cheetah(ctxt=None, seed=1): """Set up environment and algorithm and run the task. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. """ deterministic.set_seed(seed) runner = LocalRunner(snapshot_config=ctxt) env = GarageEnv(normalize(gym.make('HalfCheetah-v2'))) policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[256, 256], hidden_nonlinearity=nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[256, 256], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[256, 256], hidden_nonlinearity=F.relu) replay_buffer = SimpleReplayBuffer(env_spec=env.spec, size_in_transitions=int(1e6), time_horizon=1) sac = SAC(env_spec=env.spec, policy=policy, qf1=qf1, qf2=qf2, gradient_steps_per_itr=1000, max_path_length=500, use_automatic_entropy_tuning=True, replay_buffer=replay_buffer, min_buffer_size=1e4, target_update_tau=5e-3, discount=0.99, buffer_batch_size=256, reward_scale=1., steps_per_epoch=1) if torch.cuda.is_available(): tu.set_gpu_mode(True) else: tu.set_gpu_mode(False) sac.to() runner.setup(algo=sac, env=env, sampler_cls=LocalSampler) runner.train(n_epochs=1000, batch_size=1000)
def test_sac_inverted_double_pendulum(): """Test Sac performance on inverted pendulum.""" # pylint: disable=unexpected-keyword-arg env = normalize(GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)) deterministic.set_seed(0) policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=torch.nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) trainer = Trainer(snapshot_config=snapshot_config) sampler = LocalSampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, worker_class=FragmentWorker) sac = SAC(env_spec=env.spec, policy=policy, qf1=qf1, qf2=qf2, sampler=sampler, gradient_steps_per_itr=100, replay_buffer=replay_buffer, min_buffer_size=1e3, target_update_tau=5e-3, discount=0.99, buffer_batch_size=64, reward_scale=1., steps_per_epoch=2) trainer.setup(sac, env) if torch.cuda.is_available(): set_gpu_mode(True) else: set_gpu_mode(False) sac.to() ret = trainer.train(n_epochs=12, batch_size=200, plot=False) # check that automatic entropy tuning is used assert sac._use_automatic_entropy_tuning # assert that there was a gradient properly connected to alpha # this doesn't verify that the path from the temperature objective is # correct. assert not torch.allclose(torch.Tensor([1.]), sac._log_alpha.to('cpu')) # check that policy is learning beyond predecided threshold assert ret > 80
def test_get_action_np(self, hidden_sizes): """Test Policy get action function with numpy inputs.""" env_spec = GymEnv(DummyBoxEnv()) obs_dim = env_spec.observation_space.flat_dim act_dim = env_spec.action_space.flat_dim obs = np.ones((obs_dim, ), dtype=np.float32) init_std = 2. policy = TanhGaussianMLPPolicy(env_spec=env_spec, hidden_sizes=hidden_sizes, init_std=init_std, hidden_nonlinearity=None, std_parameterization='exp', hidden_w_init=nn.init.ones_, output_w_init=nn.init.ones_) expected_mean = torch.full((act_dim, ), 1.0, dtype=torch.float) action, prob = policy.get_action(obs) assert np.allclose(prob['mean'], expected_mean.numpy(), rtol=1e-3) assert action.shape == (act_dim, )
def test_get_actions(self, batch_size, hidden_sizes): """Test Tanh Gaussian Policy get actions function.""" env_spec = GarageEnv(DummyBoxEnv()) obs_dim = env_spec.observation_space.flat_dim act_dim = env_spec.action_space.flat_dim obs = torch.ones([batch_size, obs_dim], dtype=torch.float32) init_std = 2. policy = TanhGaussianMLPPolicy(env_spec=env_spec, hidden_sizes=hidden_sizes, init_std=init_std, hidden_nonlinearity=None, std_parameterization='exp', hidden_w_init=nn.init.ones_, output_w_init=nn.init.ones_) expected_mean = torch.full([batch_size, act_dim], 1.0) action, prob = policy.get_actions(obs) assert np.allclose(prob['mean'], expected_mean.numpy(), rtol=1e-3) assert action.shape == (batch_size, act_dim)
def test_sac_to(): """Test moving Sac between CPU and GPU.""" env = normalize(GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)) deterministic.set_seed(0) policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=torch.nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) trainer = Trainer(snapshot_config=snapshot_config) sampler = LocalSampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, worker_class=FragmentWorker) sac = SAC(env_spec=env.spec, policy=policy, qf1=qf1, qf2=qf2, sampler=sampler, gradient_steps_per_itr=100, replay_buffer=replay_buffer, min_buffer_size=1e3, target_update_tau=5e-3, discount=0.99, buffer_batch_size=64, reward_scale=1., steps_per_epoch=2) trainer.setup(sac, env) if torch.cuda.is_available(): set_gpu_mode(True) else: set_gpu_mode(False) sac.to() trainer.setup(algo=sac, env=env) trainer.train(n_epochs=1, batch_size=100) log_alpha = torch.clone(sac._log_alpha).cpu() set_gpu_mode(False) sac.to() assert torch.allclose(log_alpha, sac._log_alpha)
def test_mtsac_get_log_alpha_incorrect_num_tasks(monkeypatch): """Check that if the num_tasks passed does not match the number of tasks in the environment, then the algorithm should raise an exception. MTSAC uses disentangled alphas, meaning that """ env_names = ['CartPole-v0', 'CartPole-v1'] task_envs = [GymEnv(name, max_episode_length=150) for name in env_names] env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy) deterministic.set_seed(0) policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[1, 1], hidden_nonlinearity=torch.nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[1, 1], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[1, 1], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) buffer_batch_size = 2 mtsac = MTSAC(policy=policy, qf1=qf1, qf2=qf2, sampler=None, gradient_steps_per_itr=150, eval_env=[env], env_spec=env.spec, num_tasks=4, steps_per_epoch=5, replay_buffer=replay_buffer, min_buffer_size=1e3, target_update_tau=5e-3, discount=0.99, buffer_batch_size=buffer_batch_size) monkeypatch.setattr(mtsac, '_log_alpha', torch.Tensor([1., 2.])) error_string = ('The number of tasks in the environment does ' 'not match self._num_tasks. Are you sure that you passed ' 'The correct number of tasks?') obs = torch.Tensor([env.reset()[0]] * buffer_batch_size) with pytest.raises(ValueError, match=error_string): mtsac._get_log_alpha(dict(observation=obs))
def test_is_vectorized(self): """Test Tanh Gaussian Policy is vectorized.""" env_spec = TfEnv(DummyBoxEnv()) init_std = 2. policy = TanhGaussianMLPPolicy(env_spec=env_spec, hidden_sizes=(1, ), init_std=init_std, hidden_nonlinearity=None, std_parameterization='exp', hidden_w_init=nn.init.ones_, output_w_init=nn.init.ones_) assert policy.vectorized
def test_mtsac_get_log_alpha(monkeypatch): """Check that the private function _get_log_alpha functions correctly. MTSAC uses disentangled alphas, meaning that """ env_names = ['CartPole-v0', 'CartPole-v1'] task_envs = [GarageEnv(env_name=name) for name in env_names] env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy) deterministic.set_seed(0) policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[1, 1], hidden_nonlinearity=torch.nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[1, 1], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[1, 1], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) num_tasks = 2 buffer_batch_size = 2 mtsac = MTSAC(policy=policy, qf1=qf1, qf2=qf2, gradient_steps_per_itr=150, max_path_length=150, eval_env=env, env_spec=env.spec, num_tasks=num_tasks, steps_per_epoch=5, replay_buffer=replay_buffer, min_buffer_size=1e3, target_update_tau=5e-3, discount=0.99, buffer_batch_size=buffer_batch_size) monkeypatch.setattr(mtsac, '_log_alpha', torch.Tensor([1., 2.])) for i, _ in enumerate(env_names): obs = torch.Tensor([env.reset()] * buffer_batch_size) log_alpha = mtsac._get_log_alpha(dict(observation=obs)) assert (log_alpha == torch.Tensor([i + 1, i + 1])).all().item() assert log_alpha.size() == torch.Size([mtsac._buffer_batch_size])
def test_mtsac_inverted_double_pendulum(): """Performance regression test of MTSAC on 2 InvDoublePendulum envs.""" env_names = ['InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v2'] task_envs = [GymEnv(name, max_episode_length=100) for name in env_names] env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy) test_envs = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy) deterministic.set_seed(0) trainer = Trainer(snapshot_config=snapshot_config) policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=torch.nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) num_tasks = 2 buffer_batch_size = 128 sampler = LocalSampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, worker_class=FragmentWorker) mtsac = MTSAC(policy=policy, qf1=qf1, qf2=qf2, sampler=sampler, gradient_steps_per_itr=100, eval_env=[test_envs], env_spec=env.spec, num_tasks=num_tasks, steps_per_epoch=5, replay_buffer=replay_buffer, min_buffer_size=1e3, target_update_tau=5e-3, discount=0.99, buffer_batch_size=buffer_batch_size) trainer.setup(mtsac, env) ret = trainer.train(n_epochs=8, batch_size=128, plot=False) assert ret > 0
def test_is_pickleable(self, batch_size, hidden_sizes): """Test if policy is unchanged after pickling.""" env_spec = GymEnv(DummyBoxEnv()) obs_dim = env_spec.observation_space.flat_dim obs = torch.ones([batch_size, obs_dim], dtype=torch.float32) init_std = 2. policy = TanhGaussianMLPPolicy(env_spec=env_spec, hidden_sizes=hidden_sizes, init_std=init_std, hidden_nonlinearity=None, std_parameterization='exp', hidden_w_init=nn.init.ones_, output_w_init=nn.init.ones_) output1_action, output1_prob = policy.get_actions(obs) p = pickle.dumps(policy) policy_pickled = pickle.loads(p) output2_action, output2_prob = policy_pickled.get_actions(obs) assert np.allclose(output2_prob['mean'], output1_prob['mean'], rtol=1e-3) assert output1_action.shape == output2_action.shape
def test_fixed_alpha(): """Test if using fixed_alpha ensures that alpha is non differentiable.""" # pylint: disable=unexpected-keyword-arg env = normalize(GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)) deterministic.set_seed(0) policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=torch.nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) trainer = Trainer(snapshot_config=snapshot_config) sampler = LocalSampler(agents=policy, envs=env, max_episode_length=env.spec.max_episode_length, worker_class=FragmentWorker) sac = SAC(env_spec=env.spec, policy=policy, qf1=qf1, qf2=qf2, sampler=sampler, gradient_steps_per_itr=100, replay_buffer=replay_buffer, min_buffer_size=100, target_update_tau=5e-3, discount=0.99, buffer_batch_size=64, reward_scale=1., steps_per_epoch=1, fixed_alpha=np.exp(0.5)) trainer.setup(sac, env) sac.to() trainer.train(n_epochs=1, batch_size=100, plot=False) assert torch.allclose(torch.Tensor([0.5]), sac._log_alpha.cpu()) assert not sac._use_automatic_entropy_tuning
def test_to(self): """Test Tanh Gaussian Policy can be moved to cpu.""" env_spec = GymEnv(DummyBoxEnv()) init_std = 2. policy = TanhGaussianMLPPolicy(env_spec=env_spec, hidden_sizes=(1, ), init_std=init_std, hidden_nonlinearity=None, std_parameterization='exp', hidden_w_init=nn.init.ones_, output_w_init=nn.init.ones_) if torch.cuda.is_available(): policy.to(torch.device('cuda:0')) assert str(next(policy.parameters()).device) == 'cuda:0' else: policy.to(None) assert str(next(policy.parameters()).device) == 'cpu'
def test_sac_inverted_pendulum(): """Test Sac performance on inverted pendulum.""" # pylint: disable=unexpected-keyword-arg env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2'))) deterministic.set_seed(0) policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=torch.nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[32, 32], hidden_nonlinearity=F.relu) replay_buffer = SimpleReplayBuffer(env_spec=env.spec, size_in_transitions=int(1e6), time_horizon=1) runner = LocalRunner(snapshot_config=snapshot_config) sac = SAC(env_spec=env.spec, policy=policy, qf1=qf1, qf2=qf2, gradient_steps_per_itr=100, max_path_length=100, use_automatic_entropy_tuning=True, replay_buffer=replay_buffer, min_buffer_size=1e3, target_update_tau=5e-3, discount=0.99, buffer_batch_size=64, reward_scale=1., steps_per_epoch=2) runner.setup(sac, env, sampler_cls=LocalSampler) if torch.cuda.is_available(): tu.set_gpu_mode(True) else: tu.set_gpu_mode(False) sac.to() ret = runner.train(n_epochs=12, batch_size=200, plot=False) assert ret > 85
def load_pearl(env_name="CartPole-v0"): """Return an instance of the PEARL algorithm. NOTE: currently not working. """ num_train_tasks = 100 num_test_tasks = 30 latent_size = 5 net_size = 300 encoder_hidden_size = 200 encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size, encoder_hidden_size) # Create multi-task environment and sample tasks. env_start = GarageEnv(env_name=env_name) env_sampler = SetTaskSampler(lambda: GarageEnv(normalize(env_start))) env = env_sampler.sample(num_train_tasks) test_env_sampler = SetTaskSampler(lambda: GarageEnv(normalize(env_start))) # Instantiate networks. augmented_env = PEARL.augment_env_spec(env[0](), latent_size) qf = ContinuousMLPQFunction(env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf') vf = ContinuousMLPQFunction(env_spec=vf_env, hidden_sizes=[net_size, net_size, net_size]) inner_policy = TanhGaussianMLPPolicy( env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) pearl = PEARL(env=env, inner_policy=inner_policy, qf=qf, vf=vf, num_train_tasks=num_train_tasks, num_test_tasks=num_test_tasks, latent_dim=latent_size, encoder_hidden_sizes=encoder_hidden_sizes, test_env_sampler=test_env_sampler) return pearl
def test_pickling(self): """Test pickle and unpickle.""" net_size = 10 env_sampler = SetTaskSampler(PointEnv) env = env_sampler.sample(5) test_env_sampler = SetTaskSampler(PointEnv) augmented_env = PEARL.augment_env_spec(env[0](), 5) qf = ContinuousMLPQFunction( env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) vf_env = PEARL.get_env_spec(env[0](), 5, 'vf') vf = ContinuousMLPQFunction( env_spec=vf_env, hidden_sizes=[net_size, net_size, net_size]) inner_policy = TanhGaussianMLPPolicy( env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) pearl = PEARL(env=env, inner_policy=inner_policy, qf=qf, vf=vf, num_train_tasks=5, num_test_tasks=5, latent_dim=5, encoder_hidden_sizes=[10, 10], test_env_sampler=test_env_sampler) # This line is just to improve coverage pearl.to() pickled = pickle.dumps(pearl) unpickled = pickle.loads(pickled) assert hasattr(unpickled, '_replay_buffers') assert hasattr(unpickled, '_context_replay_buffers') assert unpickled._is_resuming
def setup_method(self): """Setup for all test methods.""" self.latent_dim = 5 self.env_spec = GarageEnv(DummyBoxEnv()) latent_space = akro.Box(low=-1, high=1, shape=(self.latent_dim, ), dtype=np.float32) # add latent space to observation space to create a new space augmented_obs_space = akro.Tuple( (self.env_spec.observation_space, latent_space)) augmented_env_spec = EnvSpec(augmented_obs_space, self.env_spec.action_space) self.obs_dim = int(np.prod(self.env_spec.observation_space.shape)) self.action_dim = int(np.prod(self.env_spec.action_space.shape)) reward_dim = 1 self.encoder_input_dim = self.obs_dim + self.action_dim + reward_dim encoder_output_dim = self.latent_dim * 2 encoder_hidden_sizes = (3, 2, encoder_output_dim) context_encoder = MLPEncoder(input_dim=self.encoder_input_dim, output_dim=encoder_output_dim, hidden_nonlinearity=None, hidden_sizes=encoder_hidden_sizes, hidden_w_init=nn.init.ones_, output_w_init=nn.init.ones_) context_policy = TanhGaussianMLPPolicy(env_spec=augmented_env_spec, hidden_sizes=(3, 5, 7), hidden_nonlinearity=F.relu, output_nonlinearity=None) self.module = ContextConditionedPolicy(latent_dim=self.latent_dim, context_encoder=context_encoder, policy=context_policy, use_information_bottleneck=True, use_next_obs=False)
def mtsac_metaworld_mt10(ctxt=None, *, seed, _gpu, n_tasks, timesteps): """Train MTSAC with MT10 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. _gpu (int): The ID of the gpu to be used (used on multi-gpu machines). n_tasks (int): Number of tasks to use. Should be a multiple of 10. timesteps (int): Number of timesteps to run. """ deterministic.set_seed(seed) trainer = Trainer(ctxt) mt10 = metaworld.MT10() # pylint: disable=no-member mt10_test = metaworld.MT10() # pylint: disable=no-member # pylint: disable=missing-return-doc, missing-return-type-doc def wrap(env, _): return normalize(env, normalize_reward=True) train_task_sampler = MetaWorldTaskSampler(mt10, 'train', wrap, add_env_onehot=True) test_task_sampler = MetaWorldTaskSampler(mt10_test, 'train', add_env_onehot=True) assert n_tasks % 10 == 0 assert n_tasks <= 500 mt10_train_envs = train_task_sampler.sample(n_tasks) env = mt10_train_envs[0]() mt10_test_envs = [env_up() for env_up in test_task_sampler.sample(n_tasks)] policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) meta_batch_size = 10 sampler = LocalSampler( agents=policy, envs=mt10_train_envs, max_episode_length=env.spec.max_episode_length, # 1 sampler worker for each environment n_workers=meta_batch_size, worker_class=FragmentWorker, # increasing n_envs increases the vectorization of a sampler worker # which improves runtime performance, but you will need to adjust this # depending on your memory constraints. For reference, each worker by # default uses n_envs=8. Each environment is approximately ~50mb large # so creating 50 envs with 8 copies comes out to 20gb of memory. Many # users want to be able to run multiple seeds on 1 machine, so I have # reduced this to n_envs = 2 for 2 copies in the meantime. worker_args=dict(n_envs=2)) batch_size = int(env.spec.max_episode_length * meta_batch_size) num_evaluation_points = 500 epochs = timesteps // batch_size epoch_cycles = epochs // num_evaluation_points epochs = epochs // epoch_cycles mtsac = MTSAC(policy=policy, qf1=qf1, qf2=qf2, sampler=sampler, gradient_steps_per_itr=env.spec.max_episode_length, eval_env=mt10_test_envs, env_spec=env.spec, num_tasks=10, steps_per_epoch=epoch_cycles, replay_buffer=replay_buffer, min_buffer_size=1500, target_update_tau=5e-3, discount=0.99, buffer_batch_size=1280) if _gpu is not None: set_gpu_mode(True, _gpu) mtsac.to() trainer.setup(algo=mtsac, env=mt10_train_envs) trainer.train(n_epochs=epochs, batch_size=batch_size)
def test_pearl_ml1_push(self): """Test PEARL with ML1 Push environment.""" params = dict(seed=1, num_epochs=1, num_train_tasks=5, latent_size=7, encoder_hidden_sizes=[10, 10, 10], net_size=30, meta_batch_size=16, num_steps_per_epoch=40, num_initial_steps=40, num_tasks_sample=15, num_steps_prior=15, num_extra_rl_steps_posterior=15, batch_size=256, embedding_batch_size=8, embedding_mini_batch_size=8, reward_scale=10., use_information_bottleneck=True, use_next_obs_in_context=False, use_gpu=False) net_size = params['net_size'] set_seed(params['seed']) # create multi-task environment and sample tasks ml1 = metaworld.ML1('push-v1') train_env = MetaWorldSetTaskEnv(ml1, 'train') env_sampler = SetTaskSampler(MetaWorldSetTaskEnv, env=train_env, wrapper=lambda env, _: normalize(env)) env = env_sampler.sample(params['num_train_tasks']) test_env = MetaWorldSetTaskEnv(ml1, 'test') test_env_sampler = SetTaskSampler( MetaWorldSetTaskEnv, env=test_env, wrapper=lambda env, _: normalize(env)) augmented_env = PEARL.augment_env_spec(env[0](), params['latent_size']) qf = ContinuousMLPQFunction( env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) vf_env = PEARL.get_env_spec(env[0](), params['latent_size'], 'vf') vf = ContinuousMLPQFunction( env_spec=vf_env, hidden_sizes=[net_size, net_size, net_size]) inner_policy = TanhGaussianMLPPolicy( env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) pearl = PEARL( env=env, policy_class=ContextConditionedPolicy, encoder_class=MLPEncoder, inner_policy=inner_policy, qf=qf, vf=vf, num_train_tasks=params['num_train_tasks'], latent_dim=params['latent_size'], encoder_hidden_sizes=params['encoder_hidden_sizes'], test_env_sampler=test_env_sampler, meta_batch_size=params['meta_batch_size'], num_steps_per_epoch=params['num_steps_per_epoch'], num_initial_steps=params['num_initial_steps'], num_tasks_sample=params['num_tasks_sample'], num_steps_prior=params['num_steps_prior'], num_extra_rl_steps_posterior=params[ 'num_extra_rl_steps_posterior'], batch_size=params['batch_size'], embedding_batch_size=params['embedding_batch_size'], embedding_mini_batch_size=params['embedding_mini_batch_size'], reward_scale=params['reward_scale'], ) set_gpu_mode(params['use_gpu'], gpu_id=0) if params['use_gpu']: pearl.to() trainer = Trainer(snapshot_config) trainer.setup(algo=pearl, env=env[0](), sampler_cls=LocalSampler, n_workers=1, worker_class=PEARLWorker) trainer.train(n_epochs=params['num_epochs'], batch_size=params['batch_size'])
def pearl_metaworld_ml10(ctxt=None, seed=1, num_epochs=1000, num_train_tasks=10, latent_size=7, encoder_hidden_size=200, net_size=300, meta_batch_size=16, num_steps_per_epoch=4000, num_initial_steps=4000, num_tasks_sample=15, num_steps_prior=750, num_extra_rl_steps_posterior=750, batch_size=256, embedding_batch_size=64, embedding_mini_batch_size=64, reward_scale=10., use_gpu=False): """Train PEARL with ML10 environments. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by Trainer to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. num_epochs (int): Number of training epochs. num_train_tasks (int): Number of tasks for training. latent_size (int): Size of latent context vector. encoder_hidden_size (int): Output dimension of dense layer of the context encoder. net_size (int): Output dimension of a dense layer of Q-function and value function. meta_batch_size (int): Meta batch size. num_steps_per_epoch (int): Number of iterations per epoch. num_initial_steps (int): Number of transitions obtained per task before training. num_tasks_sample (int): Number of random tasks to obtain data for each iteration. num_steps_prior (int): Number of transitions to obtain per task with z ~ prior. num_extra_rl_steps_posterior (int): Number of additional transitions to obtain per task with z ~ posterior that are only used to train the policy and NOT the encoder. batch_size (int): Number of transitions in RL batch. embedding_batch_size (int): Number of transitions in context batch. embedding_mini_batch_size (int): Number of transitions in mini context batch; should be same as embedding_batch_size for non-recurrent encoder. reward_scale (int): Reward scale. use_gpu (bool): Whether or not to use GPU for training. """ set_seed(seed) encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size, encoder_hidden_size) ml10 = metaworld.ML10() train_env = MetaWorldSetTaskEnv(ml10, 'train') env_sampler = SetTaskSampler(MetaWorldSetTaskEnv, env=train_env, wrapper=lambda env, _: normalize(env)) env = env_sampler.sample(num_train_tasks) test_env = MetaWorldSetTaskEnv(ml10, 'test') test_env_sampler = SetTaskSampler(MetaWorldSetTaskEnv, env=test_env, wrapper=lambda env, _: normalize(env)) trainer = Trainer(ctxt) # instantiate networks augmented_env = PEARL.augment_env_spec(env[0](), latent_size) qf = ContinuousMLPQFunction(env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf') vf = ContinuousMLPQFunction(env_spec=vf_env, hidden_sizes=[net_size, net_size, net_size]) inner_policy = TanhGaussianMLPPolicy( env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) pearl = PEARL( env=env, policy_class=ContextConditionedPolicy, encoder_class=MLPEncoder, inner_policy=inner_policy, qf=qf, vf=vf, num_train_tasks=num_train_tasks, latent_dim=latent_size, encoder_hidden_sizes=encoder_hidden_sizes, test_env_sampler=test_env_sampler, meta_batch_size=meta_batch_size, num_steps_per_epoch=num_steps_per_epoch, num_initial_steps=num_initial_steps, num_tasks_sample=num_tasks_sample, num_steps_prior=num_steps_prior, num_extra_rl_steps_posterior=num_extra_rl_steps_posterior, batch_size=batch_size, embedding_batch_size=embedding_batch_size, embedding_mini_batch_size=embedding_mini_batch_size, reward_scale=reward_scale, ) set_gpu_mode(use_gpu, gpu_id=0) if use_gpu: pearl.to() trainer.setup(algo=pearl, env=env[0](), sampler_cls=LocalSampler, n_workers=1, worker_class=PEARLWorker) trainer.train(n_epochs=num_epochs, batch_size=batch_size)
def mtsac_metaworld_ml1_pick_place(ctxt=None, seed=1, _gpu=None): """Train MTSAC with the ML1 pick-place-v1 environment. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. _gpu (int): The ID of the gpu to be used (used on multi-gpu machines). """ deterministic.set_seed(seed) runner = LocalRunner(ctxt) train_envs = [] test_envs = [] env_names = [] for i in range(50): train_env = normalize( GymEnv(mwb.ML1.get_train_tasks('pick-place-v1'), normalize_reward=True)) test_env = pickle.loads(pickle.dumps(train_env)) env_names.append('pick_place_{}'.format(i)) train_envs.append(train_env) test_envs.append(test_env) ml1_train_envs = MultiEnvWrapper(train_envs, sample_strategy=round_robin_strategy, env_names=env_names) ml1_test_envs = MultiEnvWrapper(test_envs, sample_strategy=round_robin_strategy, env_names=env_names) policy = TanhGaussianMLPPolicy( env_spec=ml1_train_envs.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=ml1_train_envs.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=ml1_train_envs.spec, hidden_sizes=[400, 400, 400], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) timesteps = 10000000 batch_size = int(150 * ml1_train_envs.num_tasks) num_evaluation_points = 500 epochs = timesteps // batch_size epoch_cycles = epochs // num_evaluation_points epochs = epochs // epoch_cycles mtsac = MTSAC(policy=policy, qf1=qf1, qf2=qf2, gradient_steps_per_itr=150, max_episode_length=150, eval_env=ml1_test_envs, env_spec=ml1_train_envs.spec, num_tasks=50, steps_per_epoch=epoch_cycles, replay_buffer=replay_buffer, min_buffer_size=1500, target_update_tau=5e-3, discount=0.99, buffer_batch_size=1280) if _gpu is not None: set_gpu_mode(True, _gpu) mtsac.to() runner.setup(algo=mtsac, env=ml1_train_envs, sampler_cls=LocalSampler) runner.train(n_epochs=epochs, batch_size=batch_size)
def tcl_pearl_ml1(ctxt=None, seed=1, num_epochs=200, num_train_tasks=50, num_test_tasks=10, latent_size=7, encoder_hidden_size=200, net_size=300, meta_batch_size=16, num_steps_per_epoch=4000, num_initial_steps=4000, num_tasks_sample=15, num_steps_prior=750, num_extra_rl_steps_posterior=750, batch_size=256, embedding_batch_size=64, embedding_mini_batch_size=64, max_path_length=200, reward_scale=10., replay_buffer_size=1000000, use_next_obs=False, in_sequence_path_aug=True, emphasized_network=False, use_kl_loss=True, use_q_loss=True, encoder_common_net=True, single_alpha=False, use_task_index_label=False, use_wasserstein_distance=True, gpu_id=0, name='push-v1', prefix='curl_fine_tune', use_gpu=True): """Train TCL-PEARL with ML1 environments. Args: ctxt (garage.experiment.ExperimentContext): The experiment configuration used by LocalRunner to create the snapshotter. seed (int): Used to seed the random number generator to produce determinism. num_epochs (int): Number of training epochs. num_train_tasks (int): Number of tasks for training. num_test_tasks (int): Number of tasks for testing. latent_size (int): Size of latent context vector. encoder_hidden_size (int): Output dimension of dense layer of the context encoder. net_size (int): Output dimension of a dense layer of Q-function and value function. meta_batch_size (int): Meta batch size. num_steps_per_epoch (int): Number of iterations per epoch. num_initial_steps (int): Number of transitions obtained per task before training. num_tasks_sample (int): Number of random tasks to obtain data for each iteration. num_steps_prior (int): Number of transitions to obtain per task with z ~ prior. num_extra_rl_steps_posterior (int): Number of additional transitions to obtain per task with z ~ posterior that are only used to train the policy and NOT the encoder. batch_size (int): Number of transitions in RL batch. embedding_batch_size (int): Number of transitions in context batch. embedding_mini_batch_size (int): Number of transitions in mini context batch; should be same as embedding_batch_size for non-recurrent encoder. max_path_length (int): Maximum path length. reward_scale (int): Reward scale. use_gpu (bool): Whether or not to use GPU for training. """ set_seed(seed) encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size, encoder_hidden_size) print("Running experiences on {}/{}".format(prefix, name)) # create multi-task environment and sample tasks ml1 = metaworld.ML1(name) train_env = MetaWorldSetTaskEnv(ml1, 'train') env_sampler = SetTaskSampler(MetaWorldSetTaskEnv, env=train_env, wrapper=lambda env, _: normalize(env)) env = env_sampler.sample(num_train_tasks) test_env = MetaWorldSetTaskEnv(ml1, 'test') test_env_sampler = SetTaskSampler(MetaWorldSetTaskEnv, env=test_env, wrapper=lambda env, _: normalize(env)) sampler = LocalSampler(agents=None, envs=env[0](), max_episode_length=max_path_length, n_workers=1, worker_class=TCLPEARLWorker) trainer = Trainer(ctxt) # instantiate networks augmented_env = TCLPEARL.augment_env_spec(env[0](), latent_size) qf_1 = ContinuousMLPQFunction(env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) qf_2 = ContinuousMLPQFunction(env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) inner_policy = TanhGaussianMLPPolicy( env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) tcl_pearl = TCLPEARL( env=env, policy_class=TCLPolicy, encoder_class=ContrastiveEncoder, inner_policy=inner_policy, qf1=qf_1, qf2=qf_2, sampler=sampler, num_train_tasks=num_train_tasks, num_test_tasks=num_test_tasks, latent_dim=latent_size, encoder_hidden_sizes=encoder_hidden_sizes, test_env_sampler=test_env_sampler, meta_batch_size=meta_batch_size, num_steps_per_epoch=num_steps_per_epoch, num_initial_steps=num_initial_steps, num_tasks_sample=num_tasks_sample, num_steps_prior=num_steps_prior, num_extra_rl_steps_posterior=num_extra_rl_steps_posterior, batch_size=batch_size, embedding_batch_size=embedding_batch_size, embedding_mini_batch_size=embedding_mini_batch_size, max_path_length=max_path_length, reward_scale=reward_scale, replay_buffer_size=replay_buffer_size, use_next_obs_in_context=use_next_obs, embedding_batch_in_sequence=in_sequence_path_aug, use_kl_loss=use_kl_loss, use_q_loss=use_q_loss, encoder_common_net=encoder_common_net, single_alpha=single_alpha, use_task_index_label=use_task_index_label, use_wasserstein_distance=use_wasserstein_distance) set_gpu_mode(use_gpu, gpu_id=gpu_id) if use_gpu: tcl_pearl.to() trainer.setup(algo=tcl_pearl, env=env[0]()) trainer.train(n_epochs=num_epochs, batch_size=batch_size)
def test_methods(): """Test PEARLWorker methods.""" env_spec = GarageEnv(DummyBoxEnv()) latent_dim = 5 latent_space = akro.Box(low=-1, high=1, shape=(latent_dim, ), dtype=np.float32) # add latent space to observation space to create a new space augmented_obs_space = akro.Tuple( (env_spec.observation_space, latent_space)) augmented_env_spec = EnvSpec(augmented_obs_space, env_spec.action_space) obs_dim = int(np.prod(env_spec.observation_space.shape)) action_dim = int(np.prod(env_spec.action_space.shape)) reward_dim = 1 encoder_input_dim = obs_dim + action_dim + reward_dim encoder_output_dim = latent_dim * 2 encoder_hidden_sizes = (3, 2, encoder_output_dim) context_encoder = MLPEncoder(input_dim=encoder_input_dim, output_dim=encoder_output_dim, hidden_nonlinearity=None, hidden_sizes=encoder_hidden_sizes, hidden_w_init=nn.init.ones_, output_w_init=nn.init.ones_) policy = TanhGaussianMLPPolicy(env_spec=augmented_env_spec, hidden_sizes=(3, 5, 7), hidden_nonlinearity=F.relu, output_nonlinearity=None) context_policy = ContextConditionedPolicy(latent_dim=latent_dim, context_encoder=context_encoder, policy=policy, use_information_bottleneck=True, use_next_obs=False) max_path_length = 20 worker1 = PEARLWorker(seed=1, max_path_length=max_path_length, worker_number=1) worker1.update_agent(context_policy) worker1.update_env(env_spec) rollouts = worker1.rollout() assert rollouts.observations.shape == (max_path_length, obs_dim) assert rollouts.actions.shape == (max_path_length, action_dim) assert rollouts.rewards.shape == (max_path_length, ) worker2 = PEARLWorker(seed=1, max_path_length=max_path_length, worker_number=1, deterministic=True, accum_context=True) worker2.update_agent(context_policy) worker2.update_env(env_spec) rollouts = worker2.rollout() assert context_policy.context.shape == (1, max_path_length, encoder_input_dim) assert rollouts.observations.shape == (max_path_length, obs_dim) assert rollouts.actions.shape == (max_path_length, action_dim) assert rollouts.rewards.shape == (max_path_length, )
def pearl_half_cheetah( ctxt=None, seed=1, num_epochs=param_num_epoches, num_train_tasks=param_train_tasks_num, num_test_tasks=param_test_tasks_num, latent_size=param_latent_size, encoder_hidden_size=param_encoder_hidden_size, net_size=param_net_size, meta_batch_size=param_meta_batch_size, num_steps_per_epoch=param_num_steps_per_epoch, num_initial_steps=param_num_initial_steps, num_tasks_sample=param_num_tasks_sample, num_steps_prior=param_num_steps_prior, num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior, batch_size=param_batch_size, embedding_batch_size=param_embedding_batch_size, embedding_mini_batch_size=param_embedding_mini_batch_size, max_path_length=param_max_path_length, reward_scale=param_reward_scale, use_gpu=param_use_gpu): set_seed(seed) encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size, encoder_hidden_size) # create multi-task environment and sample tasks env_sampler = SetTaskSampler( lambda: GarageEnv(normalize(HalfCheetahVelEnv()))) env = env_sampler.sample(num_train_tasks) test_env_sampler = SetTaskSampler( lambda: GarageEnv(normalize(HalfCheetahVelEnv()))) runner = LocalRunner(ctxt) # instantiate networks augmented_env = PEARL.augment_env_spec(env[0](), latent_size) qf = ContinuousMLPQFunction(env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf') vf = ContinuousMLPQFunction(env_spec=vf_env, hidden_sizes=[net_size, net_size, net_size]) inner_policy = TanhGaussianMLPPolicy( env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) pearl = PEARL( env=env, policy_class=ContextConditionedPolicy, encoder_class=MLPEncoder, inner_policy=inner_policy, qf=qf, vf=vf, num_train_tasks=num_train_tasks, num_test_tasks=num_test_tasks, latent_dim=latent_size, encoder_hidden_sizes=encoder_hidden_sizes, test_env_sampler=test_env_sampler, meta_batch_size=meta_batch_size, num_steps_per_epoch=num_steps_per_epoch, num_initial_steps=num_initial_steps, num_tasks_sample=num_tasks_sample, num_steps_prior=num_steps_prior, num_extra_rl_steps_posterior=num_extra_rl_steps_posterior, batch_size=batch_size, embedding_batch_size=embedding_batch_size, embedding_mini_batch_size=embedding_mini_batch_size, max_path_length=max_path_length, reward_scale=reward_scale, ) tu.set_gpu_mode(use_gpu, gpu_id=0) if use_gpu: pearl.to() runner.setup(algo=pearl, env=env[0](), sampler_cls=LocalSampler, sampler_args=dict(max_path_length=max_path_length), n_workers=1, worker_class=PEARLWorker) average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size) runner.save(num_epochs - 1) return average_returns
def diayn_pearl_half_cheeth( ctxt=None, seed=1, num_epochs=param_num_epoches, num_train_tasks=param_train_tasks_num, num_test_tasks=param_test_tasks_num, latent_size=param_latent_size, encoder_hidden_size=param_encoder_hidden_size, net_size=param_net_size, meta_batch_size=param_meta_batch_size, num_steps_per_epoch=param_num_steps_per_epoch, num_initial_steps=param_num_initial_steps, num_tasks_sample=param_num_tasks_sample, num_steps_prior=param_num_steps_prior, num_extra_rl_steps_posterior=param_num_extra_rl_steps_posterior, batch_size=param_batch_size, embedding_batch_size=param_embedding_batch_size, embedding_mini_batch_size=param_embedding_mini_batch_size, max_path_length=param_max_path_length, reward_scale=param_reward_scale, use_gpu=param_use_gpu): if task_proposer is None: raise ValueError("Task proposer is empty") assert num_train_tasks is skills_num set_seed(seed) encoder_hidden_sizes = (encoder_hidden_size, encoder_hidden_size, encoder_hidden_size) # create multi-task environment and sample tasks ML_train_envs = [ DiaynEnvWrapper(task_proposer, skills_num, task_name, normalize(HalfCheetahVelEnv())) for task_name in range(skills_num) ] env_sampler = EnvPoolSampler(ML_train_envs) env = env_sampler.sample(num_train_tasks) # train_trajs_dist = [train_env.get_training_traj(diayn_trained_agent) # for train_env in ML_train_envs] # ML_test_envs = [ # GarageEnv(normalize( # DiaynEnvWrapper(env, task_proposer, skills_num, task_name))) # for task_name in random.sample(range(skills_num), test_tasks_num) # ] test_env_sampler = SetTaskSampler( lambda: GarageEnv(normalize(HalfCheetahVelEnv()))) runner = LocalRunner(ctxt) # instantiate networks augmented_env = PEARL.augment_env_spec(env[0](), latent_size) qf = ContinuousMLPQFunction(env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) vf_env = PEARL.get_env_spec(env[0](), latent_size, 'vf') vf = ContinuousMLPQFunction(env_spec=vf_env, hidden_sizes=[net_size, net_size, net_size]) inner_policy = TanhGaussianMLPPolicy( env_spec=augmented_env, hidden_sizes=[net_size, net_size, net_size]) pearl = PEARL( env=env, policy_class=ContextConditionedPolicy, encoder_class=MLPEncoder, inner_policy=inner_policy, qf=qf, vf=vf, num_train_tasks=num_train_tasks, num_test_tasks=num_test_tasks, latent_dim=latent_size, encoder_hidden_sizes=encoder_hidden_sizes, test_env_sampler=test_env_sampler, meta_batch_size=meta_batch_size, num_steps_per_epoch=num_steps_per_epoch, num_initial_steps=num_initial_steps, num_tasks_sample=num_tasks_sample, num_steps_prior=num_steps_prior, num_extra_rl_steps_posterior=num_extra_rl_steps_posterior, batch_size=batch_size, embedding_batch_size=embedding_batch_size, embedding_mini_batch_size=embedding_mini_batch_size, max_path_length=max_path_length, reward_scale=reward_scale, ) tu.set_gpu_mode(use_gpu, gpu_id=0) if use_gpu: pearl.to() runner.setup(algo=pearl, env=env[0](), sampler_cls=LocalSampler, sampler_args=dict(max_path_length=max_path_length), n_workers=1, worker_class=PEARLWorker) average_returns = runner.train(n_epochs=num_epochs, batch_size=batch_size) runner.save(num_epochs - 1) return average_returns