def testActorLoss():
    """Test Sac Actor/Policy loss."""
    # pylint: disable=no-member
    policy = DummyActorPolicy()
    sac = SAC(env_spec=None,
              policy=policy,
              qf1=DummyCriticNet(),
              qf2=DummyCriticNet(),
              replay_buffer=None,
              discount=1,
              buffer_batch_size=2,
              target_entropy=3.0,
              initial_log_entropy=0,
              optimizer=MagicMock,
              max_path_length=10,
              gradient_steps_per_itr=1)
    observations = torch.Tensor([[1., 2.], [3., 4.]])
    action_dists = policy(observations)
    actions = torch.Tensor(action_dists.rsample_with_pre_tanh_value())
    samples_data = dict(observation=observations)
    log_pi = action_dists.log_prob(actions)
    expected_loss = (2 * 10 - (2 + 1) - (4 + 1)) / 2
    loss = sac._actor_objective(samples_data, actions, log_pi)
    assert np.all(np.isclose(loss, expected_loss))

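# A hedged, self-contained sketch of the policy objective that testActorLoss
# exercises. The function below is illustrative (it is not garage's
# SAC._actor_objective), and the concrete numbers are read off the test's
# expected_loss expression rather than the dummy fixtures themselves.
import torch


def sac_actor_loss_sketch(q1, q2, log_pi, alpha):
    """Mean of alpha * log_pi(a|s) minus the clipped double-Q value."""
    min_q = torch.min(q1, q2)
    return (alpha * log_pi - min_q).mean()


# With alpha = exp(initial_log_entropy) = exp(0) = 1, a dummy log_pi of 10
# per sample, and dummy Q-values of 3 and 5, this averages to
# (2 * 10 - 3 - 5) / 2 = 6, matching expected_loss above.
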
def testTemperatureLoss():
    """Test Sac temperature loss."""
    # pylint: disable=no-member
    policy = DummyActorPolicy()
    spec = MagicMock
    sac = SAC(env_spec=spec,
              policy=policy,
              qf1=DummyCriticNet(),
              qf2=DummyCriticNet(),
              replay_buffer=None,
              discount=1,
              buffer_batch_size=2,
              target_entropy=3.0,
              initial_log_entropy=4.0,
              optimizer=MagicMock,
              gradient_steps_per_itr=1)
    observations = torch.Tensor([[1., 2.], [3., 4.]])
    action_dists = policy(observations)[0]
    actions = action_dists.rsample_with_pre_tanh_value()
    log_pi = action_dists.log_prob(actions)
    samples_data = dict(observation=observations, action=actions)
    expected_loss = 4.0 * (-10 - 3)
    loss = sac._temperature_objective(log_pi, samples_data).item()
    assert np.all(np.isclose(loss, expected_loss))

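# A hedged sketch of the automatic entropy tuning (temperature) loss that
# testTemperatureLoss checks. This illustrates the standard SAC alpha
# objective; it is not garage's SAC._temperature_objective itself.
def sac_temperature_loss_sketch(log_alpha, log_pi, target_entropy):
    """Mean of -log_alpha * (log_pi + target_entropy), with log_pi detached."""
    return (-log_alpha * (log_pi.detach() + target_entropy)).mean()


# With initial_log_entropy = 4.0, a dummy log_pi of 10 and
# target_entropy = 3.0, this gives -(4.0) * (10 + 3) = 4.0 * (-10 - 3) = -52,
# the expected_loss asserted above.
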
def sac_half_cheetah_batch(ctxt=None, seed=1):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    deterministic.set_seed(seed)
    trainer = Trainer(snapshot_config=ctxt)
    env = normalize(GymEnv('HalfCheetah-v2'))

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[256, 256],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)

    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              sampler=sampler,
              gradient_steps_per_itr=1000,
              max_episode_length_eval=1000,
              replay_buffer=replay_buffer,
              min_buffer_size=1e4,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=256,
              reward_scale=1.,
              steps_per_epoch=1)

    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    sac.to()
    trainer.setup(algo=sac, env=env)
    trainer.train(n_epochs=1000, batch_size=1000)

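# A hedged usage sketch for the launcher above: garage example scripts are
# typically decorated with garage.wrap_experiment, which supplies the ctxt
# argument, and then called directly. The snapshot_mode and seed values here
# are illustrative assumptions, not taken from the original example.
from garage import wrap_experiment

sac_half_cheetah_batch = wrap_experiment(snapshot_mode='none')(
    sac_half_cheetah_batch)

if __name__ == '__main__':
    sac_half_cheetah_batch(seed=1)
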
def torch_sac_half_cheetah(ctxt=None, seed=1):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    deterministic.set_seed(seed)
    runner = LocalRunner(snapshot_config=ctxt)
    env = GarageEnv(normalize(gym.make('HalfCheetah-v2')))

    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[256, 256],
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=1)

    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              gradient_steps_per_itr=1000,
              max_path_length=500,
              use_automatic_entropy_tuning=True,
              replay_buffer=replay_buffer,
              min_buffer_size=1e4,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=256,
              reward_scale=1.,
              steps_per_epoch=1)

    if torch.cuda.is_available():
        tu.set_gpu_mode(True)
    else:
        tu.set_gpu_mode(False)
    sac.to()
    runner.setup(algo=sac, env=env, sampler_cls=LocalSampler)
    runner.train(n_epochs=1000, batch_size=1000)

def test_sac_inverted_double_pendulum():
    """Test Sac performance on inverted double pendulum."""
    # pylint: disable=unexpected-keyword-arg
    env = normalize(GymEnv('InvertedDoublePendulum-v2',
                           max_episode_length=100))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    trainer = Trainer(snapshot_config=snapshot_config)
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              sampler=sampler,
              gradient_steps_per_itr=100,
              replay_buffer=replay_buffer,
              min_buffer_size=1e3,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=2)
    trainer.setup(sac, env)
    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    sac.to()
    ret = trainer.train(n_epochs=12, batch_size=200, plot=False)
    # check that automatic entropy tuning is used
    assert sac._use_automatic_entropy_tuning
    # assert that a gradient was properly connected to alpha
    # this doesn't verify that the path from the temperature objective is
    # correct.
    assert not torch.allclose(torch.Tensor([1.]), sac._log_alpha.to('cpu'))
    # check that the policy is learning beyond a predetermined threshold
    assert ret > 80

def sac_setup(env, trainer, args):
    """Construct a SAC instance for `env` and register it with `trainer`."""
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[args.hidden_dim] * args.depth,
        hidden_nonlinearity=nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[args.hidden_dim] * args.depth,
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[args.hidden_dim] * args.depth,
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(args.buffer_size))
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              **convert_kwargs(args, SAC))
    trainer.setup(algo=sac, env=env, sampler_cls=LocalSampler)
    return sac

def test_fixed_alpha():
    """Test that using fixed_alpha makes alpha non-differentiable."""
    # pylint: disable=unexpected-keyword-arg
    env = normalize(GymEnv('InvertedDoublePendulum-v2',
                           max_episode_length=100))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    trainer = Trainer(snapshot_config=snapshot_config)
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              sampler=sampler,
              gradient_steps_per_itr=100,
              replay_buffer=replay_buffer,
              min_buffer_size=100,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=1,
              fixed_alpha=np.exp(0.5))
    trainer.setup(sac, env)
    sac.to()
    trainer.train(n_epochs=1, batch_size=100, plot=False)
    assert torch.allclose(torch.Tensor([0.5]), sac._log_alpha.cpu())
    assert not sac._use_automatic_entropy_tuning

def test_sac_inverted_pendulum():
    """Test Sac performance on inverted pendulum."""
    # pylint: disable=unexpected-keyword-arg
    env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=1)
    runner = LocalRunner(snapshot_config=snapshot_config)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              gradient_steps_per_itr=100,
              max_path_length=100,
              use_automatic_entropy_tuning=True,
              replay_buffer=replay_buffer,
              min_buffer_size=1e3,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=2)
    runner.setup(sac, env, sampler_cls=LocalSampler)
    if torch.cuda.is_available():
        tu.set_gpu_mode(True)
    else:
        tu.set_gpu_mode(False)
    sac.to()
    ret = runner.train(n_epochs=12, batch_size=200, plot=False)
    assert ret > 85

def testCriticLoss():
    """Test Sac Critic/QF loss."""
    # pylint: disable=no-member
    policy = DummyActorPolicy()
    spec = MagicMock
    spec.max_episode_length = 100
    sac = SAC(env_spec=spec,
              policy=policy,
              qf1=DummyCriticNet(),
              qf2=DummyCriticNet(),
              sampler=None,
              replay_buffer=None,
              gradient_steps_per_itr=1,
              discount=0.9,
              buffer_batch_size=2,
              target_entropy=3.0,
              optimizer=MagicMock)

    observations = torch.FloatTensor([[1, 2], [3, 4]])
    actions = torch.FloatTensor([[5], [6]])
    rewards = torch.FloatTensor([10, 20])
    terminals = torch.Tensor([[0.], [0.]])
    next_observations = torch.FloatTensor([[5, 6], [7, 8]])
    samples_data = {
        'observation': observations,
        'action': actions,
        'reward': rewards,
        'terminal': terminals,
        'next_observation': next_observations
    }
    td_targets = [7.3, 19.1]
    pred_td_targets = [7., 10.]

    # Expected critic loss has a factor of 2, for the two critics
    # (clipped double-Q).
    expected_loss = 2 * F.mse_loss(torch.Tensor(td_targets),
                                   torch.Tensor(pred_td_targets))
    loss = sac._critic_objective(samples_data)
    assert np.all(np.isclose(np.sum(loss), expected_loss))

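# A hedged sketch of the Bellman target behind td_targets in testCriticLoss.
# This illustrates the standard SAC clipped double-Q target; it is not
# garage's SAC._critic_objective, and the concrete Q-values below are simply
# numbers consistent with the test's td_targets.
def sac_td_target_sketch(rewards, terminals, discount, reward_scale,
                         min_q_next, log_pi_next, alpha):
    """r * scale + (1 - done) * gamma * (min target-Q(s', a') - alpha * log_pi(a'|s'))."""
    return (rewards * reward_scale +
            (1. - terminals) * discount * (min_q_next - alpha * log_pi_next))


# For example, with discount = 0.9, alpha = 1, log_pi_next = 10 and dummy
# next-state Q-values of 7 and 9, the targets are 10 + 0.9 * (7 - 10) = 7.3
# and 20 + 0.9 * (9 - 10) = 19.1, matching td_targets above. Each critic's
# prediction is then regressed onto this target with an MSE loss, and the
# test sums the two losses (hence the factor of 2).
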
def test_sac_to():
    """Test moving Sac between CPU and GPU."""
    env = normalize(GymEnv('InvertedDoublePendulum-v2',
                           max_episode_length=100))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    trainer = Trainer(snapshot_config=snapshot_config)
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              sampler=sampler,
              gradient_steps_per_itr=100,
              replay_buffer=replay_buffer,
              min_buffer_size=1e3,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=2)
    trainer.setup(sac, env)
    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    sac.to()
    trainer.setup(algo=sac, env=env)
    trainer.train(n_epochs=1, batch_size=100)
    log_alpha = torch.clone(sac._log_alpha).cpu()
    set_gpu_mode(False)
    sac.to()
    assert torch.allclose(log_alpha, sac._log_alpha)

def load_sac(env_name="MountainCarContinuous-v0"):
    """Return an instance of the SAC algorithm."""
    env = GarageEnv(env_name=env_name)
    policy = DeterministicMLPPolicy(name='policy',
                                    env_spec=env.spec,
                                    hidden_sizes=[64, 64])
    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[64, 64],
                                 hidden_nonlinearity=F.relu)
    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[64, 64],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    algo = SAC(env_spec=env.spec,
               policy=policy,
               qf1=qf1,
               qf2=qf2,
               gradient_steps_per_itr=1000,
               max_path_length=500,
               replay_buffer=replay_buffer)
    return algo