def test_decay_period(env):
    """Noise decays to min_sigma after total_timesteps actions."""
    policy = ConstantPolicy(env.action_space.sample())
    exp_policy = AddGaussianNoise(env,
                                  policy,
                                  total_timesteps=2,
                                  max_sigma=1.,
                                  min_sigma=0.)
    assert (exp_policy.get_action(None)[0] != policy.get_action(None)[0]).all()
    assert (exp_policy.get_action(None)[0] != policy.get_action(None)[0]).all()
    assert (exp_policy.get_action(None)[0] == policy.get_action(None)[0]).all()
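# The ConstantPolicy fixture used by test_decay_period, test_params, and
# test_update is not shown in this section. Below is a minimal sketch of
# what those tests assume (the actual garage test fixture may differ): a
# policy that always emits the action it was constructed with, and whose
# parameters are exactly that action.
class ConstantPolicy:
    """Test helper: a policy that always returns a fixed action."""

    def __init__(self, action):
        self.action = action

    def get_action(self, _observation):
        # Mirror garage's (action, agent_info) return convention.
        return self.action, dict()

    def get_param_values(self):
        return {'action': self.action}

    def set_param_values(self, params):
        self.action = params['action']

    def reset(self, *args, **kwargs):
        pass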
def td3_pendulum(ctxt=None, seed=1):
    """Wrap TD3 training task in the run_task function.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))

        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)

        exploration_policy = AddGaussianNoise(env.spec,
                                              policy,
                                              max_sigma=0.1,
                                              min_sigma=0.1)

        qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction',
                                    env_spec=env.spec,
                                    hidden_sizes=[400, 300],
                                    action_merge_layer=0,
                                    hidden_nonlinearity=tf.nn.relu)

        qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2',
                                     env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     action_merge_layer=0,
                                     hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=250)

        td3 = TD3(env_spec=env.spec,
                  policy=policy,
                  policy_lr=1e-4,
                  qf_lr=1e-3,
                  qf=qf,
                  qf2=qf2,
                  replay_buffer=replay_buffer,
                  target_update_tau=1e-2,
                  steps_per_epoch=20,
                  n_train_steps=1,
                  smooth_return=False,
                  discount=0.99,
                  buffer_batch_size=100,
                  min_buffer_size=1e4,
                  exploration_policy=exploration_policy,
                  policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                  qf_optimizer=tf.compat.v1.train.AdamOptimizer)

        runner.setup(td3, env)
        runner.train(n_epochs=500, batch_size=250)
def test_td3_pendulum(self):
    """Test TD3 with the InvertedDoublePendulum-v2 environment."""
    with TFTrainer(snapshot_config) as trainer:
        n_epochs = 10
        steps_per_epoch = 20
        sampler_batch_size = 250
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

        env = GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)

        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)

        exploration_policy = AddGaussianNoise(env.spec,
                                              policy,
                                              total_timesteps=num_timesteps,
                                              max_sigma=0.1,
                                              min_sigma=0.1)

        qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction',
                                    env_spec=env.spec,
                                    hidden_sizes=[400, 300],
                                    action_merge_layer=0,
                                    hidden_nonlinearity=tf.nn.relu)

        qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2',
                                     env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     action_merge_layer=0,
                                     hidden_nonlinearity=tf.nn.relu)

        replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

        algo = TD3(env_spec=env.spec,
                   policy=policy,
                   policy_lr=1e-3,
                   qf_lr=1e-3,
                   qf=qf,
                   qf2=qf2,
                   replay_buffer=replay_buffer,
                   steps_per_epoch=steps_per_epoch,
                   target_update_tau=0.005,
                   n_train_steps=50,
                   discount=0.99,
                   min_buffer_size=int(1e4),
                   buffer_batch_size=100,
                   policy_weight_decay=0.001,
                   qf_weight_decay=0.001,
                   exploration_policy=exploration_policy,
                   policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                   qf_optimizer=tf.compat.v1.train.AdamOptimizer)

        trainer.setup(algo, env, sampler_cls=LocalSampler)
        last_avg_ret = trainer.train(n_epochs=n_epochs,
                                     batch_size=sampler_batch_size)
        assert last_avg_ret > 200
def test_td3_pendulum(self):
    """Test TD3 with the InvertedDoublePendulum-v2 environment."""
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))

        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)

        exploration_policy = AddGaussianNoise(env.spec,
                                              policy,
                                              max_sigma=0.1,
                                              min_sigma=0.1)

        qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction',
                                    env_spec=env.spec,
                                    hidden_sizes=[400, 300],
                                    action_merge_layer=0,
                                    hidden_nonlinearity=tf.nn.relu)

        qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2',
                                     env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     action_merge_layer=0,
                                     hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=250)

        algo = TD3(env_spec=env.spec,
                   policy=policy,
                   policy_lr=1e-3,
                   qf_lr=1e-3,
                   qf=qf,
                   qf2=qf2,
                   replay_buffer=replay_buffer,
                   steps_per_epoch=20,
                   target_update_tau=0.005,
                   n_train_steps=50,
                   discount=0.99,
                   smooth_return=False,
                   min_buffer_size=int(1e4),
                   buffer_batch_size=100,
                   policy_weight_decay=0.001,
                   qf_weight_decay=0.001,
                   exploration_policy=exploration_policy,
                   policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                   qf_optimizer=tf.compat.v1.train.AdamOptimizer)

        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=250)
        assert last_avg_ret > 400
def test_params(env):
    """set_param_values copies the wrapped policy's parameters over."""
    policy1 = ConstantPolicy(env.action_space.sample())
    policy2 = ConstantPolicy(env.action_space.sample())
    assert (policy1.get_action(None)[0] != policy2.get_action(None)[0]).all()

    exp_policy1 = AddGaussianNoise(env, policy1)
    exp_policy2 = AddGaussianNoise(env, policy2)
    exp_policy1.set_param_values(exp_policy2.get_param_values())

    assert (policy1.get_action(None)[0] == policy2.get_action(None)[0]).all()
def test_pickling(self):
    """Test pickle and unpickle."""
    deterministic.set_seed(0)
    n_epochs = 10
    steps_per_epoch = 20
    sampler_batch_size = 100
    num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

    env = normalize(
        GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=None)

    exploration_policy = AddGaussianNoise(env.spec,
                                          policy,
                                          total_timesteps=num_timesteps,
                                          max_sigma=0.1,
                                          min_sigma=0.1)

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    sampler = LocalSampler(agents=exploration_policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)

    td3 = TD3(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              replay_buffer=replay_buffer,
              sampler=sampler,
              exploration_policy=exploration_policy,
              steps_per_epoch=steps_per_epoch,
              grad_steps_per_env_step=1,
              num_evaluation_episodes=1,
              discount=0.99)

    prefer_gpu()
    td3.to()
    pickled = pickle.dumps(td3)
    unpickled = pickle.loads(pickled)
    assert unpickled
def test_td3_inverted_double_pendulum(self):
    """Test PyTorch TD3 with the InvertedDoublePendulum-v2 environment."""
    deterministic.set_seed(0)
    n_epochs = 10
    steps_per_epoch = 20
    sampler_batch_size = 100
    num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

    trainer = Trainer(snapshot_config=snapshot_config)
    env = normalize(
        GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=None)

    exploration_policy = AddGaussianNoise(env.spec,
                                          policy,
                                          total_timesteps=num_timesteps,
                                          max_sigma=0.1,
                                          min_sigma=0.1)

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    td3 = TD3(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              replay_buffer=replay_buffer,
              exploration_policy=exploration_policy,
              steps_per_epoch=steps_per_epoch,
              grad_steps_per_env_step=1,
              num_evaluation_episodes=1,
              discount=0.99)

    prefer_gpu()
    td3.to()
    trainer.setup(td3, env, sampler_cls=LocalSampler)
    trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
def td3_half_cheetah(ctxt=None, seed=1):
    """Train TD3 with the HalfCheetah-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)

    n_epochs = 500
    steps_per_epoch = 20
    sampler_batch_size = 250
    num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

    trainer = Trainer(ctxt)
    env = normalize(GymEnv('HalfCheetah-v2'))

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[256, 256],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    exploration_policy = AddGaussianNoise(env.spec,
                                          policy,
                                          total_timesteps=num_timesteps,
                                          max_sigma=0.1,
                                          min_sigma=0.1)

    uniform_random_policy = UniformRandomPolicy(env.spec)

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    td3 = TD3(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              replay_buffer=replay_buffer,
              policy_optimizer=torch.optim.Adam,
              qf_optimizer=torch.optim.Adam,
              exploration_policy=exploration_policy,
              uniform_random_policy=uniform_random_policy,
              target_update_tau=0.005,
              discount=0.99,
              policy_noise_clip=0.5,
              policy_noise=0.2,
              policy_lr=1e-3,
              qf_lr=1e-3,
              # Use the values defined above so the exploration schedule's
              # num_timesteps matches the number of steps actually taken.
              steps_per_epoch=steps_per_epoch,
              start_steps=1000,
              grad_steps_per_env_step=50,
              min_buffer_size=1000,
              buffer_batch_size=100)

    trainer.setup(algo=td3, env=env)
    trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
def td3_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow TD3 model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(normalize(gym.make(env_id)))

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        exploration_policy = AddGaussianNoise(
            env.spec,
            policy,
            max_sigma=hyper_parameters['sigma'],
            min_sigma=hyper_parameters['sigma'])

        qf = ContinuousMLPQFunction(
            name='ContinuousMLPQFunction',
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['qf_hidden_sizes'],
            action_merge_layer=0,
            hidden_nonlinearity=tf.nn.relu)

        qf2 = ContinuousMLPQFunction(
            name='ContinuousMLPQFunction2',
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['qf_hidden_sizes'],
            action_merge_layer=0,
            hidden_nonlinearity=tf.nn.relu)

        replay_buffer = PathBuffer(
            capacity_in_transitions=hyper_parameters['replay_buffer_size'])

        td3 = TD3(env.spec,
                  policy=policy,
                  qf=qf,
                  qf2=qf2,
                  replay_buffer=replay_buffer,
                  steps_per_epoch=hyper_parameters['steps_per_epoch'],
                  policy_lr=hyper_parameters['policy_lr'],
                  qf_lr=hyper_parameters['qf_lr'],
                  target_update_tau=hyper_parameters['tau'],
                  n_train_steps=hyper_parameters['n_train_steps'],
                  discount=hyper_parameters['discount'],
                  smooth_return=hyper_parameters['smooth_return'],
                  min_buffer_size=hyper_parameters['min_buffer_size'],
                  buffer_batch_size=hyper_parameters['buffer_batch_size'],
                  exploration_policy=exploration_policy,
                  policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                  qf_optimizer=tf.compat.v1.train.AdamOptimizer)

        runner.setup(td3, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['n_rollout_steps'])
def td3_garage_pytorch(ctxt, env_id, seed):
    """Create garage PyTorch TD3 model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    # This is the PyTorch variant, so it uses the framework-agnostic
    # Trainer rather than TFTrainer.
    trainer = Trainer(ctxt)

    num_timesteps = (hyper_parameters['n_epochs'] *
                     hyper_parameters['steps_per_epoch'] *
                     hyper_parameters['batch_size'])

    env = normalize(GymEnv(env_id))

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=hyper_parameters['policy_hidden_sizes'],
        hidden_nonlinearity=F.relu,
        output_nonlinearity=torch.tanh)

    exploration_policy = AddGaussianNoise(
        env.spec,
        policy,
        total_timesteps=num_timesteps,
        max_sigma=hyper_parameters['sigma'],
        min_sigma=hyper_parameters['sigma'])

    uniform_random_policy = UniformRandomPolicy(env.spec)

    qf1 = ContinuousMLPQFunction(
        env_spec=env.spec,
        hidden_sizes=hyper_parameters['qf_hidden_sizes'],
        hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(
        env_spec=env.spec,
        hidden_sizes=hyper_parameters['qf_hidden_sizes'],
        hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(
        capacity_in_transitions=hyper_parameters['replay_buffer_size'])

    td3 = TD3(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              exploration_policy=exploration_policy,
              uniform_random_policy=uniform_random_policy,
              replay_buffer=replay_buffer,
              steps_per_epoch=hyper_parameters['steps_per_epoch'],
              policy_lr=hyper_parameters['policy_lr'],
              qf_lr=hyper_parameters['qf_lr'],
              target_update_tau=hyper_parameters['target_update_tau'],
              discount=hyper_parameters['discount'],
              grad_steps_per_env_step=hyper_parameters[
                  'grad_steps_per_env_step'],
              start_steps=hyper_parameters['start_steps'],
              min_buffer_size=hyper_parameters['min_buffer_size'],
              buffer_batch_size=hyper_parameters['buffer_batch_size'],
              policy_optimizer=torch.optim.Adam,
              qf_optimizer=torch.optim.Adam,
              policy_noise_clip=hyper_parameters['policy_noise_clip'],
              policy_noise=hyper_parameters['policy_noise'])

    prefer_gpu()
    td3.to()
    trainer.setup(td3, env)
    trainer.train(n_epochs=hyper_parameters['n_epochs'],
                  batch_size=hyper_parameters['batch_size'])
def td3_pendulum(ctxt=None, seed=1):
    """Wrap TD3 training task in the run_task function.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        n_epochs = 500
        steps_per_epoch = 20
        sampler_batch_size = 250
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

        env = GymEnv('InvertedDoublePendulum-v2')

        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)

        exploration_policy = AddGaussianNoise(env.spec,
                                              policy,
                                              total_timesteps=num_timesteps,
                                              max_sigma=0.1,
                                              min_sigma=0.1)

        qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction',
                                    env_spec=env.spec,
                                    hidden_sizes=[400, 300],
                                    action_merge_layer=0,
                                    hidden_nonlinearity=tf.nn.relu)

        qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2',
                                     env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     action_merge_layer=0,
                                     hidden_nonlinearity=tf.nn.relu)

        replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

        sampler = LocalSampler(agents=exploration_policy,
                               envs=env,
                               max_episode_length=env.spec.max_episode_length,
                               is_tf_worker=True,
                               worker_class=FragmentWorker)

        td3 = TD3(env_spec=env.spec,
                  policy=policy,
                  policy_lr=1e-4,
                  qf_lr=1e-3,
                  qf=qf,
                  qf2=qf2,
                  replay_buffer=replay_buffer,
                  sampler=sampler,
                  target_update_tau=1e-2,
                  steps_per_epoch=steps_per_epoch,
                  n_train_steps=1,
                  discount=0.99,
                  buffer_batch_size=100,
                  min_buffer_size=1e4,
                  exploration_policy=exploration_policy,
                  policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                  qf_optimizer=tf.compat.v1.train.AdamOptimizer)

        trainer.setup(td3, env)
        trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
def td3_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow TD3 model and training.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        num_timesteps = (hyper_parameters['n_epochs'] *
                         hyper_parameters['steps_per_epoch'] *
                         hyper_parameters['n_exploration_steps'])

        env = normalize(GymEnv(env_id))

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        exploration_policy = AddGaussianNoise(
            env.spec,
            policy,
            total_timesteps=num_timesteps,
            max_sigma=hyper_parameters['sigma'],
            min_sigma=hyper_parameters['sigma'])

        qf = ContinuousMLPQFunction(
            name='ContinuousMLPQFunction',
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['qf_hidden_sizes'],
            action_merge_layer=0,
            hidden_nonlinearity=tf.nn.relu)

        qf2 = ContinuousMLPQFunction(
            name='ContinuousMLPQFunction2',
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['qf_hidden_sizes'],
            action_merge_layer=0,
            hidden_nonlinearity=tf.nn.relu)

        replay_buffer = PathBuffer(
            capacity_in_transitions=hyper_parameters['replay_buffer_size'])

        sampler = LocalSampler(agents=exploration_policy,
                               envs=env,
                               max_episode_length=env.spec.max_episode_length,
                               is_tf_worker=True,
                               worker_class=FragmentWorker)

        td3 = TD3(env.spec,
                  policy=policy,
                  qf=qf,
                  qf2=qf2,
                  replay_buffer=replay_buffer,
                  sampler=sampler,
                  steps_per_epoch=hyper_parameters['steps_per_epoch'],
                  policy_lr=hyper_parameters['policy_lr'],
                  qf_lr=hyper_parameters['qf_lr'],
                  target_update_tau=hyper_parameters['tau'],
                  n_train_steps=hyper_parameters['n_train_steps'],
                  discount=hyper_parameters['discount'],
                  min_buffer_size=hyper_parameters['min_buffer_size'],
                  buffer_batch_size=hyper_parameters['buffer_batch_size'],
                  exploration_policy=exploration_policy,
                  policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                  qf_optimizer=tf.compat.v1.train.AdamOptimizer)

        trainer.setup(td3, env)
        trainer.train(n_epochs=hyper_parameters['n_epochs'],
                      batch_size=hyper_parameters['n_exploration_steps'])
def test_update(env):
    """update() decays sigma based on the lengths of completed episodes."""
    policy = ConstantPolicy(env.action_space.sample())
    exp_policy = AddGaussianNoise(env,
                                  policy,
                                  total_timesteps=10,
                                  max_sigma=1.,
                                  min_sigma=0.)
    exp_policy.get_action(None)
    exp_policy.get_action(None)

    DummyBatch = collections.namedtuple('EpisodeBatch', ['lengths'])
    batch = DummyBatch(np.array([1, 2]))

    # new sigma will be 1 - 0.1 * (1 + 2) = 0.7
    exp_policy.update(batch)
    assert np.isclose(exp_policy._sigma(), 0.7)

    exp_policy.get_action(None)
    batch = DummyBatch(np.array([1, 1, 2]))

    # new sigma will be 0.7 - 0.1 * (1 + 1 + 2) = 0.3
    exp_policy.update(batch)
    assert np.isclose(exp_policy._sigma(), 0.3)
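# A standalone sketch of the linear decay schedule the tests above exercise,
# assuming sigma is interpolated from max_sigma down to min_sigma over
# total_timesteps environment steps and clamped afterwards. The helper name
# `decayed_sigma` is ours for illustration and not part of garage's API.
import math


def decayed_sigma(t, total_timesteps, max_sigma=1., min_sigma=0.):
    """Return the exploration noise scale after t environment steps."""
    fraction = min(float(t) / total_timesteps, 1.0)
    return max_sigma - (max_sigma - min_sigma) * fraction


# Reproduces the expectations in test_update: after episode lengths [1, 2]
# a total of 3 steps have elapsed, and after [1, 2, 1, 1, 2] a total of 7.
assert math.isclose(decayed_sigma(3, total_timesteps=10), 0.7)
assert math.isclose(decayed_sigma(7, total_timesteps=10), 0.3)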