def check_full_preprocessing(self):
    """
    Manual check of the full set of preprocessing steps.
    Not run as part of normal unit tests; run me with
      ./preprocessing_test.py TestPreprocessing.check_full_preprocessing
    """
    from pylab import subplot, imshow, show, tight_layout
    env = DummyEnv(dot_width=2, dot_height=2, draw_n_dots=True)
    env = NumberFrames(env)
    env_wrapped = generic_preprocess(env, max_n_noops=0)

    obs1 = env_wrapped.reset()
    obs2, _, _, _ = env_wrapped.step(0)
    obs3, _, _, _ = env_wrapped.step(0)
    obs4 = env_wrapped.reset()

    subplot(4, 1, 1)
    imshow(np.hstack(obs1), cmap='gray')
    subplot(4, 1, 2)
    imshow(np.hstack(obs2), cmap='gray')
    subplot(4, 1, 3)
    imshow(np.hstack(obs3), cmap='gray')
    subplot(4, 1, 4)
    imshow(np.hstack(obs4), cmap='gray')
    tight_layout()
    show()
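# For reading the check above: NumberFrames is assumed here to be a wrapper
# that stamps a running frame counter onto each observation, so the effects of
# frame skipping and stacking can be verified by eye. The sketch below is
# illustrative only (the name NumberFramesSketch and its details are
# assumptions, not the project's actual wrapper):
import cv2
import gym

class NumberFramesSketch(gym.Wrapper):
    """Stamp a running frame counter onto every observation."""

    def __init__(self, env):
        super().__init__(env)
        self.frame_n = 0

    def _label(self, obs):
        # Draw the current frame index in the top-left corner.
        obs = np.ascontiguousarray(obs)
        cv2.putText(obs, str(self.frame_n), (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.0, 255, 2)
        self.frame_n += 1
        return obs

    def reset(self):
        self.frame_n = 0
        return self._label(self.env.reset())

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        return self._label(obs), reward, done, info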
def test_full_preprocessing_rewards(self):
    """
    Check that rewards are summed correctly by wrappers which operate over
    multiple timesteps.
    """
    env = DummyEnv()
    env_wrapped = generic_preprocess(env, max_n_noops=0, clip_rewards=False)

    env_wrapped.reset()
    _, r1, _, _ = env_wrapped.step(0)
    _, r2, _, _ = env_wrapped.step(0)
    _, r3, _, _ = env_wrapped.step(0)

    # MaxWrapper skips the first step after reset (which gives reward 2).
    # FrameStackWrapper does another 3 steps after reset, each of which
    # does 4 steps in the raw environment because of FrameSkipWrapper:
    #   Step 1: 3, 4, 5, 6
    #   Step 2: 7, 8, 9, 10
    #   Step 3: 11, 12, 13, 14
    # So the first step we do should get rewards 15, 16, 17 and 18, summed by
    # FrameSkipWrapper.
    self.assertEqual(r1, 66)
    # Then 19 + 20 + 21 + 22.
    self.assertEqual(r2, 82)
    # Then 23 + 24 + 25 + 26.
    self.assertEqual(r3, 98)
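# The arithmetic in the comments above relies on rewards being summed across
# the raw steps covered by one wrapped step. A minimal sketch of that
# behaviour, assuming a frame-skip wrapper roughly like the following
# (illustrative only -- not the project's actual FrameSkipWrapper):
import gym

class FrameSkipSketch(gym.Wrapper):
    """Repeat each action for `skip` raw steps and sum the rewards."""

    def __init__(self, env, skip=4):
        super().__init__(env)
        self.skip = skip

    def step(self, action):
        total_reward = 0
        obs, done, info = None, False, {}
        for _ in range(self.skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info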
def main():
    args = parse_args()

    env = gym.make(args.env_id)
    env = generic_preprocess(env, max_n_noops=0)

    sess, obs_placeholder, action_probs_op = \
        get_network(args.ckpt_dir, env.observation_space.shape,
                    env.action_space.n)

    run_agent(env, sess, obs_placeholder, action_probs_op)
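# parse_args is defined elsewhere in this script; a plausible minimal version,
# inferred from the attributes main() uses (whether the arguments are
# positional or flags is an assumption, hence the _sketch suffix):
import argparse

def parse_args_sketch():
    parser = argparse.ArgumentParser()
    parser.add_argument('env_id',
                        help="Gym environment ID, e.g. 'PongNoFrameskip-v4'")
    parser.add_argument('ckpt_dir',
                        help="Directory containing the checkpoint to restore")
    return parser.parse_args()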
def test_rmsprop_variables(self):
    """
    Test 1: let's look at the variables the optimizer creates to check
    there's no funny business.
    """
    sess = tf.Session()
    env = generic_preprocess(gym.make('Pong-v0'), max_n_noops=0)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=5e-4, decay=0.99,
                                          epsilon=1e-5)

    with tf.variable_scope('global'):
        make_inference_network(n_actions=env.action_space.n,
                               weight_inits='glorot')

    network1 = Network(scope="worker_1", n_actions=env.action_space.n,
                       entropy_bonus=0.01, value_loss_coef=0.5,
                       weight_inits='glorot', max_grad_norm=0.5,
                       optimizer=optimizer, summaries=False, debug=False)
    Worker(sess=sess, env=env, network=network1, log_dir='/tmp')
    vars1 = optimizer.variables()

    network2 = Network(scope="worker_2", n_actions=env.action_space.n,
                       entropy_bonus=0.01, value_loss_coef=0.5,
                       weight_inits='glorot', max_grad_norm=0.5,
                       optimizer=optimizer, summaries=False, debug=False)
    Worker(sess=sess, env=env, network=network2, log_dir='/tmp')
    vars2 = optimizer.variables()
    self.assertNotEqual(id(vars1), id(vars2))

    # First, were any extra variables added when we set up the second worker's
    # network? That might be indicative of a second set of statistics.
    self.assertEqual(len(vars1), len(vars2))
    # Second, are all the variables definitely associated with the global
    # set of parameters rather than the thread-local parameters?
    for v in vars1:
        self.assertIn('global', v.name)
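# Standalone illustration (an assumed sketch, TF 1.x style) of the property
# the test above checks: RMSProp keeps its statistics in slot variables scoped
# under the parameters they track, so statistics for 'global' parameters carry
# 'global' in their names. demo_rmsprop_slot_names is a hypothetical helper,
# not part of the test suite.
def demo_rmsprop_slot_names():
    graph = tf.Graph()
    with graph.as_default():
        with tf.variable_scope('global'):
            w = tf.get_variable('w', shape=[2],
                                initializer=tf.zeros_initializer())
        opt = tf.train.RMSPropOptimizer(learning_rate=1e-3, decay=0.99,
                                        epsilon=1e-5)
        # minimize() creates the optimizer's slot variables for `w`.
        opt.minimize(tf.reduce_sum(w ** 2), var_list=[w])
        for v in opt.variables():
            print(v.name)  # e.g. 'global/w/RMSProp:0', 'global/w/RMSProp_1:0'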
def run_weight_test(reset_rmsprop):
    tf.reset_default_graph()
    utils.set_random_seeds(0)

    sess = tf.Session()
    env = generic_preprocess(gym.make('Pong-v0'), max_n_noops=0)
    env.seed(0)

    with tf.variable_scope('global'):
        make_inference_network(n_actions=env.action_space.n,
                               weight_inits='glorot')
    shared_variables = tf.global_variables()

    optimizer = tf.train.RMSPropOptimizer(learning_rate=5e-4, decay=0.99,
                                          epsilon=1e-5)

    network1 = Network(scope="worker_1", n_actions=env.action_space.n,
                       entropy_bonus=0.01, value_loss_coef=0.5,
                       weight_inits='glorot', max_grad_norm=0.5,
                       optimizer=optimizer, summaries=False, debug=False)
    w1 = Worker(sess=sess, env=env, network=network1, log_dir='/tmp')

    network2 = Network(scope="worker_2", n_actions=env.action_space.n,
                       entropy_bonus=0.01, value_loss_coef=0.5,
                       weight_inits='glorot', max_grad_norm=0.5,
                       optimizer=optimizer, summaries=False, debug=False)
    w2 = Worker(sess=sess, env=env, network=network2, log_dir='/tmp')

    rmsprop_init_ops = [v.initializer for v in optimizer.variables()]

    sess.run(tf.global_variables_initializer())

    vars_sum_init = sess.run(get_var_sum(shared_variables))
    w1.run_update(n_steps=1)
    vars_sum_post_w1_update = sess.run(get_var_sum(shared_variables))
    if reset_rmsprop:
        sess.run(rmsprop_init_ops)
    w2.run_update(n_steps=1)
    vars_sum_post_w2_update = sess.run(get_var_sum(shared_variables))

    return vars_sum_init, vars_sum_post_w1_update, vars_sum_post_w2_update
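# get_var_sum is a helper assumed by the test above; a minimal version that
# reduces a list of variables to a single scalar so that changes to the shared
# parameters are easy to compare before and after each worker's update:
def get_var_sum(variables):
    return tf.add_n([tf.reduce_sum(v) for v in variables])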
def run_agent(env_id, sess, network):
    env = gym.make(env_id)
    env = generic_preprocess(env)

    while True:
        obs = env.reset()
        episode_reward = 0
        done = False
        while not done:
            s = np.moveaxis(obs, 0, -1)
            feed_dict = {network.s: [s]}
            action_probs = sess.run(network.a_softmax, feed_dict)[0]
            action = np.random.choice(worker.ACTIONS, p=action_probs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            env.render()
            time.sleep(1 / 60.0)
        print("Episode reward:", episode_reward)