def setUp(self):
    super().setUp()
    self.env = gym.make('MountainCarContinuous-v0')
    self.agent = DDPG(hidden_sizes=(4, ))
    self.obs_dim = 2
    self.act_dim = 1
    self.batch_size = 6
    self.obs_ph = tf_utils.tfph(self.obs_dim, 'obs')
    self.act_ph = tf_utils.tfph(self.act_dim, 'act')
    self.is_term_ph = tf_utils.tfph(None, 'is_term')
    self.rew_ph = tf_utils.tfph(None, 'rew')
    self.placeholders = {
        'obs': self.obs_ph,
        'act': self.act_ph,
        'next_obs': self.obs_ph,
        'is_term': self.is_term_ph,
        'rew': self.rew_ph
    }
    self.obs = np.random.randn(self.batch_size, self.obs_dim)
    self.act = np.random.randn(self.batch_size, self.act_dim)
    self.is_term = np.random.randint(
        0, 2, self.batch_size).astype(np.float32)
    self.rew = np.random.randn(self.batch_size)
    self.feed_dict = {
        self.obs_ph: self.obs,
        self.act_ph: self.act,
        self.is_term_ph: self.is_term,
        self.rew_ph: self.rew
    }
def test_build_policy_loss(self):
    """ Make sure that when we train on the loss, the logp of actions
    with positive advantage goes up, and vice versa. Also check that
    the kl divergence changes as we update the policy """
    learning_rate = 1e-3
    logp_old_ph = tfph(None)
    logp_old = np.log(np.random.rand(self.batch_size)).astype(np.float32)
    adv_ph = tfph(None)
    adv = np.random.randn(self.batch_size).astype(np.float32)
    logp = tf.get_variable('logp', dtype=tf.float32, trainable=True,
                           initializer=logp_old)
    placeholders = {'logp': logp_old_ph, 'adv': adv_ph}
    pi_loss, pi_train_op = self.ppo.build_policy_loss(
        logp, placeholders, learning_rate)
    feed_dict = {logp_old_ph: logp_old, adv_ph: adv}
    with self.cached_session() as sess:
        sess.run(tf.global_variables_initializer())
        init_loss, init_kl = sess.run((pi_loss, self.ppo.kl_divergence),
                                      feed_dict=feed_dict)
        self.assertAlmostEqual(init_loss, -np.mean(adv), places=5)
        # since the new and old policies are the same before training,
        # the kl divergence should be zero
        self.assertAlmostEqual(init_kl, 0)
        sess.run(pi_train_op, feed_dict=feed_dict)
        after_loss, after_kl = sess.run((pi_loss, self.ppo.kl_divergence),
                                        feed_dict=feed_dict)
        # ensure the loss went down
        self.assertLess(after_loss, init_loss)
        delta_logp = sess.run(logp) - logp_old
        # ensure that logp goes up if adv > 0 and vice versa
        np.testing.assert_array_equal(np.sign(delta_logp), np.sign(adv))
        # ensure that kl_div changed
        self.assertNotEqual(after_kl, init_kl)
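# Pieced together from this test and the surrogate helpers tested below, the
# PPO policy loss appears to be the standard clipped surrogate objective. A
# minimal sketch of how build_policy_loss might combine the pieces (an
# assumption for illustration, not the actual implementation; the real agent
# also stores the kl estimate as self.kl_divergence):
def _ppo_policy_loss_sketch(logp, placeholders, learning_rate,
                            clip_ratio=0.2):
    """ hypothetical stand-in for ppo.build_policy_loss """
    ratio = tf.exp(logp - placeholders['logp'])
    surrogate = ratio * placeholders['adv']
    max_surrogate = (1 + clip_ratio * tf.sign(placeholders['adv'])) \
        * placeholders['adv']
    pi_loss = -tf.reduce_mean(tf.minimum(surrogate, max_surrogate))
    # sample-based estimate of the kl divergence between old and new policy
    kl_divergence = tf.reduce_mean(placeholders['logp'] - logp)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(pi_loss)
    return pi_loss, train_op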
def test_build_clipped_surrogate(self):
    """ smoke test build_clipped_surrogate """
    sur_ph = tfph(None)
    max_sur_ph = tfph(None)
    sur = np.random.randn(self.batch_size)
    max_sur = np.random.randn(self.batch_size)
    expected = np.array([min(sur_i, max_sur_i)
                         for (sur_i, max_sur_i) in zip(sur, max_sur)])
    with self.cached_session() as sess:
        ret = sess.run(
            self.ppo.build_clipped_surrogate(sur_ph, max_sur_ph),
            feed_dict={sur_ph: sur, max_sur_ph: max_sur})
        np.testing.assert_allclose(ret, expected)
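# The expected value above implies build_clipped_surrogate is an elementwise
# minimum of the two surrogates. A minimal sketch under that assumption:
def _clipped_surrogate_sketch(surrogate, max_surrogate):
    """ hypothetical stand-in for ppo.build_clipped_surrogate """
    return tf.minimum(surrogate, max_surrogate)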
def create_placeholders(self, obs_space, act_space):
    """ Build the placeholders required for this agent """
    self.placeholders['obs'] = tfph(obs_space.shape[-1], name='obs')
    if isinstance(act_space, Box):
        self.placeholders['act'] = tfph(act_space.shape[-1], name='act')
    elif isinstance(act_space, Discrete):
        self.placeholders['act'] = tf.placeholder(
            dtype=tf.int64, shape=[None], name='act')
    else:
        raise NotImplementedError(
            'action space {} not implemented'.format(act_space))
    for name in ('ret', 'adv'):
        self.placeholders[name] = tfph(None, name=name)
def setUp(self):
    super().setUp()
    self.act_dim = 3
    self.batch_size = 12
    self.obs_dim = 5
    self.obs_ph = tf_utils.tfph(self.obs_dim)
    self.obs = np.random.randn(self.batch_size, self.obs_dim)
def test_tfph_smoke(self):
    """ smoke test tfph """
    x_dim = 3
    x = np.random.rand(8, x_dim)
    x_ph = tf_utils.tfph(x_dim)
    with self.cached_session() as sess:
        ret = sess.run(x_ph, feed_dict={x_ph: x})
        np.testing.assert_almost_equal(x, ret)
def test_tfph_None(self):
    """ test tfph when size is None """
    x = np.random.rand(8)
    x_ph = tf_utils.tfph(None, name='x')
    self.assertTrue(x_ph.name.startswith('x'))
    with self.cached_session() as sess:
        ret = sess.run(x_ph, feed_dict={x_ph: x})
        np.testing.assert_almost_equal(x, ret)
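# Based on how the two tests above exercise it, tfph builds a float32
# placeholder with a free batch dimension. A minimal sketch (an assumption
# about tf_utils.tfph, not its actual implementation):
def _tfph_sketch(dim, name=None):
    """ hypothetical stand-in: float32 placeholder, shape [None] or [None, dim] """
    shape = [None] if dim is None else [None, dim]
    return tf.placeholder(dtype=tf.float32, shape=shape, name=name)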
def test_build_max_surrogate(self):
    """ smoke test build_max_surrogate """
    clip_ratio = 0.2
    adv_ph = tfph(None)
    adv = np.random.randn(self.batch_size)
    expected = (1 + clip_ratio*np.sign(adv))*adv
    with self.cached_session() as sess:
        ret = sess.run(self.ppo.build_max_surrogate(clip_ratio, adv_ph),
                       feed_dict={adv_ph: adv})
        self.assertEqual(ret.shape, (self.batch_size,))
        np.testing.assert_allclose(expected, ret)
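# The expected value above pins down the clipping bound: (1 + clip_ratio)*adv
# where adv >= 0 and (1 - clip_ratio)*adv otherwise. A minimal sketch under
# that assumption:
def _max_surrogate_sketch(clip_ratio, adv):
    """ hypothetical stand-in for ppo.build_max_surrogate """
    return (1 + clip_ratio * tf.sign(adv)) * adv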
def test_build_qval_target(self):
    """ test building targets for the qval loss """
    qval_pi_targ_ph = tf_utils.tfph(None, 'qval_pi_targ')
    qval_pi_targ_np = np.random.randn(self.batch_size)
    expected = self.rew \
        + (1 - self.is_term)*self.agent.gamma*qval_pi_targ_np
    target = self.agent.build_qval_target(qval_pi_targ_ph,
                                          self.placeholders)
    feed_dict = {qval_pi_targ_ph: qval_pi_targ_np, **self.feed_dict}
    with self.cached_session() as sess:
        ret = sess.run(target, feed_dict=feed_dict)
        np.testing.assert_allclose(expected, ret)
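# The expected value above is the usual Bellman target for DDPG. A minimal
# sketch of build_qval_target (an assumption, relying only on the placeholder
# names used in these tests):
def _qval_target_sketch(qval_pi_targ, placeholders, gamma):
    """ hypothetical stand-in: r + (1 - done) * gamma * Q_targ(s', pi(s')) """
    return placeholders['rew'] \
        + (1 - placeholders['is_term']) * gamma * qval_pi_targ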
def test_log_prob_of_action(self):
    """ smoke test log_prob_of_action """
    log_probs_ph = tf_utils.tfph(self.n_cat)
    log_probs = np.random.rand(self.batch_size, self.n_cat)
    actions_ph = tf.placeholder(dtype=tf.int64, shape=[None])
    actions = np.random.randint(0, self.n_cat, self.batch_size, np.int64)
    with self.cached_session() as sess:
        ret = sess.run(categorical.log_prob_of_action(log_probs_ph,
                                                      actions_ph),
                       feed_dict={
                           log_probs_ph: log_probs,
                           actions_ph: actions
                       })
        for ind in range(self.batch_size):
            self.assertAlmostEqual(ret[ind], log_probs[ind, actions[ind]])
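# log_prob_of_action must pick out log_probs[i, actions[i]] for each row i.
# A minimal sketch of one way to do that in TF1 (an assumption about the
# categorical module, not its actual code):
def _log_prob_of_action_sketch(log_probs, actions):
    """ hypothetical stand-in: per-row gather via a one-hot mask """
    n_cat = tf.shape(log_probs)[-1]
    mask = tf.one_hot(actions, depth=n_cat, dtype=log_probs.dtype)
    return tf.reduce_sum(mask * log_probs, axis=-1)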
def test_build_value_function_smoke(self):
    """ check that the number of trainable params and the output shape
    make sense for build_value_function """
    obs_dim = self.env.observation_space.shape[0]
    obs_ph = tf_utils.tfph(obs_dim)
    obs = np.random.rand(8, obs_dim)
    val = self.vpg.build_value_function(obs_ph, hidden_sizes=(4, ),
                                        activation=None)
    # one hidden layer of width 4 plus a scalar output layer, both with bias
    n_params = (obs_dim + 1) * 4 + (4 + 1) * 1
    with self.cached_session() as sess:
        ret_n_params = tf_utils.trainable_count(scope='val')
        sess.run(tf.global_variables_initializer())
        sess_val = sess.run(val, feed_dict={obs_ph: obs})
        self.assertEqual(n_params, ret_n_params)
        self.assertEqual(sess_val.shape, (8, ))
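# The parameter count and output shape above are consistent with a dense MLP
# plus a scalar head built under a 'val' scope and squeezed to shape (batch,).
# A minimal sketch under those assumptions:
def _value_function_sketch(obs_ph, hidden_sizes, activation):
    """ hypothetical stand-in for vpg.build_value_function """
    with tf.variable_scope('val'):
        out = obs_ph
        for size in hidden_sizes:
            out = tf.layers.dense(out, size, activation=activation)
        return tf.squeeze(tf.layers.dense(out, 1), axis=-1)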
def create_placeholders(self, obs_space, act_space):
    """ create placeholders """
    if not isinstance(act_space, Box):
        raise NotImplementedError(
            "action space {} not implemented. ".format(type(act_space))
            + "NOTE: DDPG is only compatible with continuous action spaces")
    act_dim, obs_dim = act_space.shape[-1], obs_space.shape[-1]
    ph_shapes = {
        'act': act_dim,
        'obs': obs_dim,
        'next_obs': obs_dim,
        'is_term': None,
        'rew': None
    }
    self.placeholders = {
        name: tf_utils.tfph(shape, name=name)
        for name, shape in ph_shapes.items()
    }
def test_mlp_categorical_policy(self):
    """ smoke test mlp_categorical_policy """
    action_space = Mock()
    action_space.n = self.n_cat
    obs_ph = tf_utils.tfph(2)
    obs = np.random.rand(self.batch_size, 2)
    act_ph = tf.placeholder(dtype=tf.int64, shape=[None])
    act = np.random.randint(0, self.n_cat, self.batch_size, np.int64)
    with self.cached_session() as sess:
        ret_symbol = categorical.mlp_categorical_policy(
            obs_ph, act_ph, hidden_sizes=(4, ), activation=tf.tanh,
            action_space=action_space)
        sess.run(tf.global_variables_initializer())
        ret = sess.run(ret_symbol, feed_dict={obs_ph: obs, act_ph: act})
        for val in ret:
            self.assertEqual(val.shape, (self.batch_size, ))
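# The test only pins down that mlp_categorical_policy returns a tuple of
# per-example tensors. A minimal sketch assuming the common (pi, logp,
# logp_pi) convention, reusing the log_prob_of_action helper tested above
# (the return signature is an assumption, not confirmed by the source):
def _mlp_categorical_policy_sketch(obs_ph, act_ph, hidden_sizes, activation,
                                   action_space):
    """ hypothetical stand-in for categorical.mlp_categorical_policy """
    out = obs_ph
    for size in hidden_sizes:
        out = tf.layers.dense(out, size, activation=activation)
    logits = tf.layers.dense(out, action_space.n)
    log_probs = tf.nn.log_softmax(logits)
    pi = tf.squeeze(tf.multinomial(logits, 1), axis=1)  # sampled actions
    logp = categorical.log_prob_of_action(log_probs, act_ph)
    logp_pi = categorical.log_prob_of_action(log_probs, pi)
    return pi, logp, logp_pi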
def test_build_val_loss_smoke(self):
    """ Make sure the loss goes down when training, and that training
    brings val closer to the returns """
    batch_size = 4
    ret_ph = tf_utils.tfph(None)
    ret = np.ones(batch_size)
    val = tf.get_variable('val', dtype=tf.float32, trainable=True,
                          initializer=batch_size * [0.])
    loss, train_op = self.vpg.build_val_loss(val, ret_ph, 1e-3)
    with self.cached_session() as sess:
        sess.run(tf.global_variables_initializer())
        old_loss = sess.run(loss, feed_dict={ret_ph: ret})
        sess.run(train_op, feed_dict={ret_ph: ret})
        new_loss = sess.run(loss, feed_dict={ret_ph: ret})
        new_val = sess.run(val)
        self.assertEqual(new_loss.shape, tuple())
        self.assertLess(new_loss, old_loss)
        # the returns are all positive and val starts at zero, so one
        # training step should push val above zero
        self.assertTrue(all(new_val > 0))
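# The behaviour checked above (a scalar loss that decreases and pulls val
# toward ret) is consistent with a mean-squared-error value loss. A minimal
# sketch, assuming an Adam optimizer (the optimizer choice is not pinned
# down by the test):
def _val_loss_sketch(val, ret_ph, learning_rate):
    """ hypothetical stand-in for vpg.build_val_loss """
    loss = tf.reduce_mean((val - ret_ph)**2)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return loss, train_op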
def test_build_policy_loss_smoke(self):
    """ Make sure the loss goes down when training, and that training
    changes logp in the expected direction """
    batch_size = 4
    adv_ph = tfph(None)
    adv = np.ones(batch_size)
    logp = tf.get_variable('logp', dtype=tf.float32, trainable=True,
                           initializer=batch_size * [0.])
    loss, train_op = self.vpg.build_policy_loss(logp, {'adv': adv_ph},
                                                learning_rate=1e-3)
    with self.cached_session() as sess:
        sess.run(tf.global_variables_initializer())
        old_loss = sess.run(loss, feed_dict={adv_ph: adv})
        sess.run(train_op, feed_dict={adv_ph: adv})
        new_loss = sess.run(loss, feed_dict={adv_ph: adv})
        new_logp = sess.run(logp)
        self.assertEqual(new_loss.shape, tuple())
        self.assertLess(new_loss, old_loss)
        # the advantages are all positive, so logp should have increased
        # from its initial value of zero
        self.assertTrue(all(new_logp > 0))
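# For vanilla policy gradient, the behaviour checked here is consistent with
# the loss -mean(logp * adv). A minimal sketch, again assuming Adam (an
# assumption, not the actual implementation):
def _vpg_policy_loss_sketch(logp, placeholders, learning_rate):
    """ hypothetical stand-in for vpg.build_policy_loss """
    loss = -tf.reduce_mean(logp * placeholders['adv'])
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return loss, train_op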
def setUp(self):
    super().setUp()
    self.n_cat = 5
    self.batch_size = 12
    self.logits_ph = tf_utils.tfph(self.n_cat)
    self.logits = np.random.rand(self.batch_size, self.n_cat)
def create_placeholders(self, obs_space, act_space):
    """ we need the old policy's logp to train the policy loss, so add a
    placeholder for it here """
    super().create_placeholders(obs_space, act_space)
    self.placeholders['logp'] = tfph(None, name='logp')