def setup_method(self):
    """Prepare the env, base policy and epsilon-greedy policy for a test.

    Stores a ``DummyDiscreteEnv``, a ``SimplePolicy`` built on it, and an
    ``EpsilonGreedyPolicy`` wrapping that policy on ``self``, then resets
    the environment.
    """
    env = self.env = DummyDiscreteEnv()
    base_policy = self.policy = SimplePolicy(env_spec=env)
    # Exploration schedule parameters, forwarded unchanged.
    schedule = dict(total_timesteps=100,
                    max_epsilon=1.0,
                    min_epsilon=0.02,
                    decay_ratio=0.1)
    self.epsilon_greedy_policy = EpsilonGreedyPolicy(env_spec=env,
                                                     policy=base_policy,
                                                     **schedule)
    env.reset()
def setup():
    """Assemble a seeded DQN configuration for CartPole-v0.

    Returns:
        tuple: ``(algo, env, replay_buffer, n_epochs, sampler_batch_size)``
            in that order.
    """
    set_seed(24)

    n_epochs = 11
    steps_per_epoch = 10
    sampler_batch_size = 512
    # NOTE(review): the exploration schedule is sized with a literal 100
    # rather than n_epochs (11) -- presumably deliberate; confirm.
    num_timesteps = 100 * steps_per_epoch * sampler_batch_size

    env = GymEnv('CartPole-v0')
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5))
    policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)

    explorer = EpsilonGreedyPolicy(
        env_spec=env.spec,
        policy=policy,
        total_timesteps=num_timesteps,
        max_epsilon=1.0,
        min_epsilon=0.01,
        decay_ratio=0.4,
    )

    # Numeric DQN hyperparameters kept apart from the component wiring.
    dqn_hparams = dict(
        steps_per_epoch=steps_per_epoch,
        qf_lr=5e-5,
        discount=0.9,
        min_buffer_size=int(1e4),
        n_train_steps=500,
        target_update_freq=30,
        buffer_batch_size=64,
    )
    algo = DQN(env_spec=env.spec,
               policy=policy,
               qf=qf,
               exploration_policy=explorer,
               replay_buffer=replay_buffer,
               **dqn_hparams)

    return algo, env, replay_buffer, n_epochs, sampler_batch_size
def test_dqn_cartpole_pickle(self):
    """Check that pickling DQN preserves a Q-function bias on CartPole.

    The first hidden layer's bias is overwritten with ones, the algorithm
    is pickled, and the pickle is loaded inside a session on a new graph,
    where the bias must equal the overwritten value.
    """
    deterministic.set_seed(100)
    v1 = tf.compat.v1
    bias_scope = 'DiscreteMLPQFunction/mlp/hidden_0'

    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        n_epochs = 10
        steps_per_epoch = 10
        sampler_batch_size = 500
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

        env = GymEnv('CartPole-v0')
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e4))
        qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
        policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
        explorer = EpsilonGreedyPolicy(env_spec=env.spec,
                                       policy=policy,
                                       total_timesteps=num_timesteps,
                                       max_epsilon=1.0,
                                       min_epsilon=0.02,
                                       decay_ratio=0.1)
        sampler = LocalSampler(
            agents=explorer,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True,
            worker_class=FragmentWorker)

        dqn_hparams = dict(qf_lr=1e-4,
                           discount=1.0,
                           min_buffer_size=int(1e3),
                           double_q=False,
                           n_train_steps=500,
                           grad_norm_clipping=5.0,
                           steps_per_epoch=steps_per_epoch,
                           target_network_update_freq=1,
                           buffer_batch_size=32)
        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_policy=explorer,
                   replay_buffer=replay_buffer,
                   sampler=sampler,
                   **dqn_hparams)
        trainer.setup(algo, env)

        with v1.variable_scope(bias_scope, reuse=True):
            bias = v1.get_variable('bias')
            # Overwrite the bias with ones so the value is recognisable.
            old_bias = tf.ones_like(bias).eval()
            bias.load(old_bias)
            serialized = pickle.dumps(algo)

        # Restore inside a session bound to a brand-new graph.
        with v1.Session(graph=tf.Graph()):
            pickle.loads(serialized)
            with v1.variable_scope(bias_scope, reuse=True):
                restored = v1.get_variable('bias')
                restored = restored.eval()
                assert np.array_equal(old_bias, restored)

        env.close()
class TestEpsilonGreedyPolicy:
    """Tests for ``EpsilonGreedyPolicy`` wrapping a ``SimplePolicy``."""

    def setup_method(self):
        """Build the env, base policy and epsilon-greedy policy."""
        self.env = DummyDiscreteEnv()
        self.policy = SimplePolicy(env_spec=self.env)
        self.epsilon_greedy_policy = EpsilonGreedyPolicy(env_spec=self.env,
                                                         policy=self.policy,
                                                         total_timesteps=100,
                                                         max_epsilon=1.0,
                                                         min_epsilon=0.02,
                                                         decay_ratio=0.1)
        self.env.reset()

    def test_epsilon_greedy_policy(self):
        """Check sampled actions are valid and epsilon decays as expected.

        Epsilon is checked empirically: the fraction of uniform samples
        that fall below ``_epsilon`` must be within 0.01 of the expected
        value.
        """
        n_samples = 100000
        obs, _, _, _ = self.env.step(1)

        action, _ = self.epsilon_greedy_policy.get_action(obs)
        assert self.env.action_space.contains(action)

        # The values asserted here imply a decrement of 0.098 per step
        # (presumably (1.0 - 0.02) / (100 * 0.1) -- confirm against the
        # policy implementation).
        # epsilon decay by 1 step, new epsilon = 1 - 0.098 = 0.902
        random_rate = (np.random.random(n_samples) <
                       self.epsilon_greedy_policy._epsilon)
        assert np.isclose([0.902], [random_rate.mean()], atol=0.01)

        actions, _ = self.epsilon_greedy_policy.get_actions([obs] * 5)

        # epsilon decay by 6 steps in total
        # new epsilon = 1 - 6 * 0.098 = 0.412
        random_rate = (np.random.random(n_samples) <
                       self.epsilon_greedy_policy._epsilon)
        assert np.isclose([0.412], [random_rate.mean()], atol=0.01)

        for action in actions:
            assert self.env.action_space.contains(action)

    def test_epsilon_greedy_policy_is_pickleable(self):
        """Check ``_epsilon`` is preserved by a pickle round trip."""
        obs, _, _, _ = self.env.step(1)
        # Take a few actions first so epsilon has moved off its start value.
        for _ in range(5):
            self.epsilon_greedy_policy.get_action(obs)

        h_data = pickle.dumps(self.epsilon_greedy_policy)
        policy = pickle.loads(h_data)
        assert policy._epsilon == self.epsilon_greedy_policy._epsilon
def test_dqn_cartpole_pickle(self):
    """Check that pickling DQN preserves a Q-function bias on CartPole.

    Sets the first hidden layer's bias to ones, pickles the algorithm and
    loads it inside a session on a new graph, then compares the bias.
    """
    v1 = tf.compat.v1
    bias_scope = 'DiscreteMLPQFunction/MLPModel/mlp/hidden_0'

    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        n_epochs = 10
        steps_per_epoch = 10
        sampler_batch_size = 500
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

        env = TfEnv(gym.make('CartPole-v0'))
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e4),
                                           time_horizon=1)
        qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
        policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)
        explorer = EpsilonGreedyPolicy(env_spec=env.spec,
                                       policy=policy,
                                       total_timesteps=num_timesteps,
                                       max_epsilon=1.0,
                                       min_epsilon=0.02,
                                       decay_ratio=0.1)

        dqn_hparams = dict(qf_lr=1e-4,
                           discount=1.0,
                           min_buffer_size=int(1e3),
                           double_q=False,
                           n_train_steps=500,
                           grad_norm_clipping=5.0,
                           steps_per_epoch=steps_per_epoch,
                           target_network_update_freq=1,
                           buffer_batch_size=32)
        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_policy=explorer,
                   replay_buffer=replay_buffer,
                   **dqn_hparams)
        runner.setup(algo, env)

        with v1.variable_scope(bias_scope, reuse=True):
            bias = v1.get_variable('bias')
            # Overwrite the bias with ones so the value is recognisable.
            old_bias = tf.ones_like(bias).eval()
            bias.load(old_bias)
            serialized = pickle.dumps(algo)

        # Restore inside a session bound to a brand-new graph.
        with v1.Session(graph=tf.Graph()):
            pickle.loads(serialized)
            with v1.variable_scope(bias_scope, reuse=True):
                restored = v1.get_variable('bias')
                restored = restored.eval()
                assert np.array_equal(old_bias, restored)

        env.close()
def dqn_cartpole(ctxt=None, seed=1):
    """Train DQN with CartPole-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        n_epochs = 10
        steps_per_epoch = 10
        sampler_batch_size = 500
        # Total environment steps; sizes the epsilon schedule.
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
        env = GymEnv('CartPole-v0')
        try:
            replay_buffer = PathBuffer(capacity_in_transitions=int(1e4))
            qf = DiscreteMLPQFunction(env_spec=env.spec,
                                      hidden_sizes=(64, 64))
            policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
            exploration_policy = EpsilonGreedyPolicy(
                env_spec=env.spec,
                policy=policy,
                total_timesteps=num_timesteps,
                max_epsilon=1.0,
                min_epsilon=0.02,
                decay_ratio=0.1)

            sampler = LocalSampler(
                agents=exploration_policy,
                envs=env,
                max_episode_length=env.spec.max_episode_length,
                is_tf_worker=True,
                worker_class=FragmentWorker)

            algo = DQN(env_spec=env.spec,
                       policy=policy,
                       qf=qf,
                       exploration_policy=exploration_policy,
                       replay_buffer=replay_buffer,
                       sampler=sampler,
                       steps_per_epoch=steps_per_epoch,
                       qf_lr=1e-4,
                       discount=1.0,
                       min_buffer_size=int(1e3),
                       double_q=True,
                       n_train_steps=500,
                       target_network_update_freq=1,
                       buffer_batch_size=32)

            trainer.setup(algo, env)
            trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
        finally:
            # Release the environment even when setup or training fails.
            env.close()
def dqn_cartpole(ctxt=None, seed=24):
    """Train DQN on the CartPole-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration passed to Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    trainer = Trainer(ctxt)

    n_epochs = 100
    steps_per_epoch = 10
    sampler_batch_size = 512
    # Total environment steps; sizes the epsilon schedule.
    num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

    env = GymEnv('CartPole-v0')
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5))
    policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)

    explorer = EpsilonGreedyPolicy(
        env_spec=env.spec,
        policy=policy,
        total_timesteps=num_timesteps,
        max_epsilon=1.0,
        min_epsilon=0.01,
        decay_ratio=0.4,
    )
    sampler = LocalSampler(
        agents=explorer,
        envs=env,
        max_episode_length=env.spec.max_episode_length,
        worker_class=FragmentWorker,
    )

    dqn_hparams = dict(
        steps_per_epoch=steps_per_epoch,
        qf_lr=5e-5,
        discount=0.9,
        min_buffer_size=int(1e4),
        n_train_steps=500,
        target_update_freq=30,
        buffer_batch_size=64,
    )
    algo = DQN(env_spec=env.spec,
               policy=policy,
               qf=qf,
               exploration_policy=explorer,
               replay_buffer=replay_buffer,
               sampler=sampler,
               **dqn_hparams)

    trainer.setup(algo, env)
    trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)

    env.close()
def test_dqn_cartpole_grad_clip(self):
    """Train DQN with gradient-norm clipping on CartPole.

    Passes when the value returned by ``trainer.train`` exceeds 8.8.
    """
    deterministic.set_seed(100)
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        n_epochs = 10
        steps_per_epoch = 10
        sampler_batch_size = 500
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

        env = GymEnv('CartPole-v0')
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e4))
        qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
        policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
        explorer = EpsilonGreedyPolicy(env_spec=env.spec,
                                       policy=policy,
                                       total_timesteps=num_timesteps,
                                       max_epsilon=1.0,
                                       min_epsilon=0.02,
                                       decay_ratio=0.1)
        sampler = LocalSampler(
            agents=explorer,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True,
            worker_class=FragmentWorker)

        dqn_hparams = dict(qf_lr=1e-4,
                           discount=1.0,
                           min_buffer_size=int(1e3),
                           double_q=False,
                           n_train_steps=500,
                           grad_norm_clipping=5.0,
                           steps_per_epoch=steps_per_epoch,
                           target_network_update_freq=1,
                           buffer_batch_size=32)
        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_policy=explorer,
                   replay_buffer=replay_buffer,
                   sampler=sampler,
                   **dqn_hparams)

        trainer.setup(algo, env)
        final_return = trainer.train(n_epochs=n_epochs,
                                     batch_size=sampler_batch_size)
        assert final_return > 8.8

        env.close()
def dqn_cartpole(ctxt=None, seed=1):
    """Train DQN with CartPole-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        n_epochs = 10
        steps_per_epoch = 10
        sampler_batch_size = 500
        # Total environment steps; sizes the epsilon schedule.
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
        env = TfEnv(gym.make('CartPole-v0'))
        try:
            replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                               size_in_transitions=int(1e4),
                                               time_horizon=1)
            qf = DiscreteMLPQFunction(env_spec=env.spec,
                                      hidden_sizes=(64, 64))
            policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)
            exploration_policy = EpsilonGreedyPolicy(
                env_spec=env.spec,
                policy=policy,
                total_timesteps=num_timesteps,
                max_epsilon=1.0,
                min_epsilon=0.02,
                decay_ratio=0.1)
            algo = DQN(env_spec=env.spec,
                       policy=policy,
                       qf=qf,
                       exploration_policy=exploration_policy,
                       replay_buffer=replay_buffer,
                       steps_per_epoch=steps_per_epoch,
                       qf_lr=1e-4,
                       discount=1.0,
                       min_buffer_size=int(1e3),
                       double_q=True,
                       n_train_steps=500,
                       target_network_update_freq=1,
                       buffer_batch_size=32)

            runner.setup(algo, env)
            runner.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
        finally:
            # Release the environment even when setup or training fails.
            env.close()
def train_dqn(ctxt=None):
    """Train a ``LoggedDQN`` agent and return its reward checkpoints.

    ``seed``, ``gym_env``, ``n_eps`` and ``self`` are not parameters; they
    are read from a surrounding scope that is not visible here. The
    trained algorithm is stored on ``self.algo``.

    Args:
        ctxt: Forwarded unchanged to ``Trainer``.

    Returns:
        The ``rew_chkpts`` attribute of the trained algorithm.
    """
    set_seed(seed)
    trainer = Trainer(ctxt)
    env = MyGymEnv(gym_env, max_episode_length=100)

    steps_per_epoch = 10
    sampler_batch_size = 4000
    # Total environment steps; sizes the epsilon schedule.
    num_timesteps = n_eps * steps_per_epoch * sampler_batch_size

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
    qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5))
    policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
    explorer = EpsilonGreedyPolicy(env_spec=env.spec,
                                   policy=policy,
                                   total_timesteps=num_timesteps,
                                   max_epsilon=1.0,
                                   min_epsilon=0.01,
                                   decay_ratio=0.4)
    sampler = LocalSampler(agents=explorer,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)

    dqn_hparams = dict(steps_per_epoch=steps_per_epoch,
                       qf_lr=5e-5,
                       discount=0.99,
                       min_buffer_size=int(1e4),
                       n_train_steps=500,
                       target_update_freq=30,
                       buffer_batch_size=64)
    self.algo = LoggedDQN(env=env,
                          env_spec=env.spec,
                          policy=policy,
                          qf=qf,
                          exploration_policy=explorer,
                          replay_buffer=replay_buffer,
                          sampler=sampler,
                          **dqn_hparams)

    trainer.setup(self.algo, env)
    trainer.train(n_epochs=n_eps, batch_size=sampler_batch_size)
    return self.algo.rew_chkpts
def test_dqn_cartpole_grad_clip(self):
    """Train DQN with gradient-norm clipping on CartPole.

    Passes when the value returned by ``runner.train`` exceeds 15.
    """
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        n_epochs = 10
        steps_per_epoch = 10
        sampler_batch_size = 500
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

        env = TfEnv(gym.make('CartPole-v0'))
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e4),
                                           time_horizon=1)
        qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
        policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)
        explorer = EpsilonGreedyPolicy(env_spec=env.spec,
                                       policy=policy,
                                       total_timesteps=num_timesteps,
                                       max_epsilon=1.0,
                                       min_epsilon=0.02,
                                       decay_ratio=0.1)

        dqn_hparams = dict(qf_lr=1e-4,
                           discount=1.0,
                           min_buffer_size=int(1e3),
                           double_q=False,
                           n_train_steps=500,
                           grad_norm_clipping=5.0,
                           steps_per_epoch=steps_per_epoch,
                           target_network_update_freq=1,
                           buffer_batch_size=32)
        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_policy=explorer,
                   replay_buffer=replay_buffer,
                   **dqn_hparams)

        runner.setup(algo, env)
        final_return = runner.train(n_epochs=n_epochs,
                                    batch_size=sampler_batch_size)
        assert final_return > 15

        env.close()
def test_dqn_cartpole_double_q(self):
    """Train DQN with ``double_q=True`` on CartPole.

    Passes when the value returned by ``runner.train`` exceeds 9.
    """
    deterministic.set_seed(100)
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        n_epochs = 10
        steps_per_epoch = 10
        sampler_batch_size = 500
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

        env = GarageEnv(gym.make('CartPole-v0'))
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e4))
        qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
        policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)
        explorer = EpsilonGreedyPolicy(env_spec=env.spec,
                                       policy=policy,
                                       total_timesteps=num_timesteps,
                                       max_epsilon=1.0,
                                       min_epsilon=0.02,
                                       decay_ratio=0.1)

        dqn_hparams = dict(max_path_length=100,
                           qf_lr=1e-4,
                           discount=1.0,
                           min_buffer_size=int(1e3),
                           double_q=True,
                           n_train_steps=500,
                           steps_per_epoch=steps_per_epoch,
                           target_network_update_freq=1,
                           buffer_batch_size=32)
        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_policy=explorer,
                   replay_buffer=replay_buffer,
                   **dqn_hparams)

        runner.setup(algo, env)
        final_return = runner.train(n_epochs=n_epochs,
                                    batch_size=sampler_batch_size)
        assert final_return > 9

        env.close()
def dqn_pong(ctxt=None, seed=1, buffer_size=int(5e4), max_episode_length=500):
    """Train DQN on the PongNoFrameskip-v4 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        buffer_size (int): Number of timesteps to store in replay buffer.
        max_episode_length (int): Maximum length of an episode, after which
            an episode is considered complete. This is used during testing
            to minimize the memory required to store a single episode.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        n_epochs = 100
        steps_per_epoch = 20
        sampler_batch_size = 500
        # Total environment steps; sizes the epsilon schedule.
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

        # Build the wrapped Atari env; wrapper order is kept as given.
        pong = gym.make('PongNoFrameskip-v4')
        pong = pong.unwrapped
        pong = Noop(pong, noop_max=30)
        pong = MaxAndSkip(pong, skip=4)
        pong = EpisodicLife(pong)
        if 'FIRE' in pong.unwrapped.get_action_meanings():
            pong = FireReset(pong)
        pong = Grayscale(pong)
        pong = Resize(pong, 84, 84)
        pong = ClipReward(pong)
        pong = StackFrames(pong, 4)

        env = GymEnv(pong,
                     is_image=True,
                     max_episode_length=max_episode_length)

        replay_buffer = PathBuffer(capacity_in_transitions=buffer_size)

        conv_filters = (
            (32, (8, 8)),
            (64, (4, 4)),
            (64, (3, 3)),
        )  # yapf: disable
        qf = DiscreteCNNQFunction(env_spec=env.spec,
                                  filters=conv_filters,
                                  strides=(4, 2, 1),
                                  dueling=False)

        policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
        explorer = EpsilonGreedyPolicy(env_spec=env.spec,
                                       policy=policy,
                                       total_timesteps=num_timesteps,
                                       max_epsilon=1.0,
                                       min_epsilon=0.02,
                                       decay_ratio=0.1)

        dqn_hparams = dict(qf_lr=1e-4,
                           discount=0.99,
                           min_buffer_size=int(1e4),
                           double_q=False,
                           n_train_steps=500,
                           steps_per_epoch=steps_per_epoch,
                           target_network_update_freq=2,
                           buffer_batch_size=32)
        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_policy=explorer,
                   replay_buffer=replay_buffer,
                   **dqn_hparams)

        trainer.setup(algo, env)
        trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
def dqn_atari(ctxt=None,
              env=None,
              seed=24,
              n_workers=psutil.cpu_count(logical=False),
              max_episode_length=None,
              **kwargs):
    """Train DQN on an Atari environment selected by name.

    Numeric settings are read from ``hyperparams``, a name defined outside
    this function.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env (str): Name of the atari environment, eg. 'PongNoFrameskip-v4'.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_workers (int): Number of workers to use. Defaults to the number
            of CPU cores available.
        max_episode_length (int): Max length of an episode. If None,
            defaults to the timelimit specific to the environment. Used by
            integration tests.
        kwargs (dict): hyperparameters to be saved to variant.json.

    """
    assert n_workers > 0
    assert env is not None

    # Build the wrapped Atari env; wrapper order is kept as given.
    atari = gym.make(env)
    atari = Noop(atari, noop_max=30)
    atari = MaxAndSkip(atari, skip=4)
    atari = EpisodicLife(atari)
    if 'FIRE' in atari.unwrapped.get_action_meanings():
        atari = FireReset(atari)
    atari = Grayscale(atari)
    atari = Resize(atari, 84, 84)
    atari = ClipReward(atari)
    atari = StackFrames(atari, 4, axis=0)
    env = GymEnv(atari, max_episode_length=max_episode_length, is_image=True)

    set_seed(seed)
    trainer = Trainer(ctxt)

    n_epochs = hyperparams['n_epochs']
    steps_per_epoch = hyperparams['steps_per_epoch']
    sampler_batch_size = hyperparams['sampler_batch_size']
    # Total environment steps; sizes the epsilon schedule.
    num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

    replay_buffer = PathBuffer(
        capacity_in_transitions=hyperparams['buffer_size'])

    qf = DiscreteCNNQFunction(
        env_spec=env.spec,
        image_format='NCHW',
        hidden_channels=hyperparams['hidden_channels'],
        kernel_sizes=hyperparams['kernel_sizes'],
        strides=hyperparams['strides'],
        hidden_w_init=(
            lambda x: torch.nn.init.orthogonal_(x, gain=np.sqrt(2))),
        hidden_sizes=hyperparams['hidden_sizes'])

    policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
    explorer = EpsilonGreedyPolicy(env_spec=env.spec,
                                   policy=policy,
                                   total_timesteps=num_timesteps,
                                   max_epsilon=hyperparams['max_epsilon'],
                                   min_epsilon=hyperparams['min_epsilon'],
                                   decay_ratio=hyperparams['decay_ratio'])

    sampler = LocalSampler(agents=explorer,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker,
                           n_workers=n_workers)

    algo = DQN(env_spec=env.spec,
               policy=policy,
               qf=qf,
               exploration_policy=explorer,
               replay_buffer=replay_buffer,
               sampler=sampler,
               steps_per_epoch=steps_per_epoch,
               qf_lr=hyperparams['lr'],
               clip_gradient=hyperparams['clip_gradient'],
               discount=hyperparams['discount'],
               min_buffer_size=hyperparams['min_buffer_size'],
               n_train_steps=hyperparams['n_train_steps'],
               target_update_freq=hyperparams['target_update_freq'],
               buffer_batch_size=hyperparams['buffer_batch_size'])

    # Start in CPU mode; switch to GPU mode only when CUDA is available.
    set_gpu_mode(False)
    torch.set_num_threads(1)
    if torch.cuda.is_available():
        set_gpu_mode(True)
        algo.to()

    trainer.setup(algo, env)
    trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)

    env.close()
class TestEpsilonGreedyPolicy:
    """Tests for ``EpsilonGreedyPolicy`` wrapping a ``SimplePolicy``."""

    def setup_method(self):
        """Build the env, base policy and epsilon-greedy policy."""
        env = self.env = DummyDiscreteEnv()
        base_policy = self.policy = SimplePolicy(env_spec=env)
        self.epsilon_greedy_policy = EpsilonGreedyPolicy(
            env_spec=env,
            policy=base_policy,
            total_timesteps=100,
            max_epsilon=1.0,
            min_epsilon=0.02,
            decay_ratio=0.1,
        )
        env.reset()

    def test_epsilon_greedy_policy(self):
        """Check sampled actions are valid and epsilon decays as expected."""
        explorer = self.epsilon_greedy_policy
        action_space = self.env.action_space
        obs, _, _, _ = self.env.step(1)

        action, _ = explorer.get_action(obs)
        assert action_space.contains(action)

        # After one step epsilon should be 1 - 0.098 = 0.902; estimate it
        # as the share of uniform draws that fall below it.
        below = np.random.random(100000) < explorer._epsilon()
        assert np.isclose([0.902], [sum(below) / 100000], atol=0.01)

        actions, _ = explorer.get_actions([obs] * 5)

        # Six steps taken in total, so epsilon should be
        # 1 - 6 * 0.098 = 0.412.
        below = np.random.random(100000) < explorer._epsilon()
        assert np.isclose([0.412], [sum(below) / 100000], atol=0.01)

        for action in actions:
            assert action_space.contains(action)

    def test_set_param(self):
        """Check that setting ``total_env_steps`` to 6 yields epsilon 0.412."""
        explorer = self.epsilon_greedy_policy
        params = explorer.get_param_values()
        params['total_env_steps'] = 6
        explorer.set_param_values(params)
        assert np.isclose(explorer._epsilon(), 0.412)

    def test_update(self):
        """Check ``update`` with lengths 1, 2 and 3 yields epsilon 0.412."""
        DummyBatch = collections.namedtuple('EpisodeBatch', ['lengths'])
        episode_lengths = np.array([1, 2, 3])
        self.epsilon_greedy_policy.update(DummyBatch(episode_lengths))
        assert np.isclose(self.epsilon_greedy_policy._epsilon(), 0.412)

    def test_epsilon_greedy_policy_is_pickleable(self):
        """Check ``_epsilon()`` is preserved by a pickle round trip."""
        explorer = self.epsilon_greedy_policy
        obs, _, _, _ = self.env.step(1)
        # Take a few actions first so epsilon has moved off its start value.
        for _ in range(5):
            explorer.get_action(obs)

        restored = pickle.loads(pickle.dumps(explorer))
        assert restored._epsilon() == explorer._epsilon()
def dqn_pong(ctxt=None, seed=1, buffer_size=int(5e4)):
    """Train DQN on the PongNoFrameskip-v4 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        buffer_size (int): Number of timesteps to store in replay buffer.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        n_epochs = 100
        steps_per_epoch = 20
        sampler_batch_size = 500
        # Total environment steps; sizes the epsilon schedule.
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

        # Build the wrapped Atari env; wrapper order is kept as given.
        pong = gym.make('PongNoFrameskip-v4')
        pong = Noop(pong, noop_max=30)
        pong = MaxAndSkip(pong, skip=4)
        pong = EpisodicLife(pong)
        if 'FIRE' in pong.unwrapped.get_action_meanings():
            pong = FireReset(pong)
        pong = Grayscale(pong)
        pong = Resize(pong, 84, 84)
        pong = ClipReward(pong)
        pong = StackFrames(pong, 4)

        env = TfEnv(pong, is_image=True)

        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=buffer_size,
                                           time_horizon=1)

        qf = DiscreteCNNQFunction(env_spec=env.spec,
                                  filter_dims=(8, 4, 3),
                                  num_filters=(32, 64, 64),
                                  strides=(4, 2, 1),
                                  dueling=False)

        policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)
        explorer = EpsilonGreedyPolicy(env_spec=env.spec,
                                       policy=policy,
                                       total_timesteps=num_timesteps,
                                       max_epsilon=1.0,
                                       min_epsilon=0.02,
                                       decay_ratio=0.1)

        dqn_hparams = dict(qf_lr=1e-4,
                           discount=0.99,
                           min_buffer_size=int(1e4),
                           double_q=False,
                           n_train_steps=500,
                           steps_per_epoch=steps_per_epoch,
                           target_network_update_freq=2,
                           buffer_batch_size=32)
        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_policy=explorer,
                   replay_buffer=replay_buffer,
                   **dqn_hparams)

        runner.setup(algo, env)
        runner.train(n_epochs=n_epochs, batch_size=sampler_batch_size)