def test_qf_targets2(self):
    discount = 0.5
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        discount=discount,
    )
    rewards = np.array([3.5])
    terminals = np.array([0.])
    obs = np.array([[1., 1., 1., 1.]])
    actions = np.array([[2.]])
    next_obs = np.array([[2., 2., 2., 2.]])

    # target = reward + discount * target_qf(next_obs,
    #                                        target_policy(next_obs))
    # target = 3.5 + 0.5 * Q([2,2,2,2], u([2,2,2,2]))
    #        = 3.5 + 0.5 * Q([2,2,2,2], 8)
    #        = 3.5 + 0.5 * 16
    #        = 11.5
    feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                   next_obs)
    self.assertNpEqual(np.array([[11.5]]),
                       algo.sess.run(algo.ys, feed_dict=feed_dict))
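# A standalone numpy sketch of the target arithmetic checked above, assuming
# (as the fixture names suggest) that `sum_policy` outputs the sum of the
# observation and `sum_critic` outputs the sum of observation and action.
# The function names below are illustrative and not part of the test class.
import numpy as np

def assumed_sum_policy(next_obs):
    # assumed behavior: u(s') = sum of observation components
    return next_obs.sum(axis=1, keepdims=True)

def assumed_sum_critic(next_obs, action):
    # assumed behavior: Q(s', a') = sum of observation and action components
    return next_obs.sum(axis=1, keepdims=True) + action.sum(axis=1, keepdims=True)

rewards = np.array([[3.5]])
discount = 0.5
next_obs = np.array([[2., 2., 2., 2.]])
target = rewards + discount * assumed_sum_critic(next_obs,
                                                 assumed_sum_policy(next_obs))
print(target)  # [[11.5]], matching the asserted value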
def test_qf_targets(self):
    discount = 0.5
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        discount=discount,
    )
    rewards = np.array([3., 4.])
    terminals = np.array([0., 0.])
    obs = np.array([[1., 1., 1., 1.], [1., 1., 1., 1.]])
    actions = np.array([[-0.5], [-0.5]])
    next_obs = np.array([[1., 1., 1., 1.], [1., 1., 1., 1.]])

    # target = reward + discount * target_qf(next_obs,
    #                                        target_policy(next_obs))
    # target1 = 3 + 0.5 * Q([1,1,1,1], u([1,1,1,1]))
    #         = 3 + 0.5 * Q([1,1,1,1], 4)
    #         = 3 + 0.5 * 8
    #         = 7
    # target2 = 4 + 0.5 * 8
    #         = 8
    feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                   next_obs)
    self.assertNpEqual(np.array([[7.], [8.]]),
                       algo.sess.run(algo.ys, feed_dict=feed_dict))
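# The feed dict also carries `terminals`; in standard DDPG the bootstrap term
# is masked out for terminal transitions. A minimal numpy sketch of that
# convention (an assumption about how `algo.ys` is formed, not taken from the
# implementation); with terminals of 0 it reproduces the asserted targets.
import numpy as np

rewards = np.array([3., 4.])
terminals = np.array([0., 0.])
discount = 0.5
target_q_next = np.array([8., 8.])  # Q'(s', u'(s')) under the sum networks
ys = rewards + (1. - terminals) * discount * target_q_next
print(ys)  # [7. 8.]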
def test_only_qf_values_change(self):
    discount = 0.5
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        discount=discount,
    )
    old_qf_values = algo.qf.get_param_values()
    old_qf_copy_values = algo.qf_with_action_input.get_param_values()
    old_policy_values = algo.policy.get_param_values()
    old_target_qf_values = algo.target_qf.get_param_values()
    old_target_policy_values = algo.target_policy.get_param_values()

    rewards = np.array([3.])
    terminals = np.array([0.])
    obs = np.array([[1., 1., 1., 1.]])
    actions = np.array([[-0.5]])
    next_obs = np.array([[1., 1., 1., 1.]])
    feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                   next_obs)
    algo.sess.run(algo.train_qf_op, feed_dict=feed_dict)

    new_qf_values = algo.qf.get_param_values()
    new_qf_copy_values = algo.qf_with_action_input.get_param_values()
    new_policy_values = algo.policy.get_param_values()
    new_target_qf_values = algo.target_qf.get_param_values()
    new_target_policy_values = algo.target_policy.get_param_values()

    self.assertTrue(
        are_np_array_iterables_equal(old_policy_values,
                                     new_policy_values))
    self.assertFalse(
        are_np_array_iterables_equal(old_qf_values,
                                     new_qf_values))
    self.assertFalse(
        are_np_array_iterables_equal(old_qf_copy_values,
                                     new_qf_copy_values))
    self.assertTrue(
        are_np_array_iterables_equal(old_target_policy_values,
                                     new_target_policy_values))
    self.assertTrue(
        are_np_array_iterables_equal(old_target_qf_values,
                                     new_target_qf_values))
    self.assertParamsEqual(algo.qf_with_action_input, algo.qf)
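# Why only the Q-function parameters move: a TF1-style training op can be
# restricted to one network's variables via `var_list`. This is a minimal
# standalone sketch of that mechanism (an assumption about how `train_qf_op`
# is built; the variable names here are illustrative).
import tensorflow as tf

q_param = tf.Variable(1.0, name='q_param')
pi_param = tf.Variable(2.0, name='pi_param')
loss = tf.square(3.0 - q_param * pi_param)
# Only q_param is handed to the optimizer, so pi_param stays fixed even
# though it appears in the loss.
train_q_op = tf.train.GradientDescentOptimizer(0.1).minimize(
    loss, var_list=[q_param])
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_q_op)
    print(sess.run([q_param, pi_param]))  # q_param changed; pi_param did not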
def test_qf_gradient(self):
    discount = 0.5
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        discount=discount,
    )
    rewards = np.array([3.5])
    terminals = np.array([0.])
    obs = np.array([[1., 1., 1., 1.]])
    actions = np.array([[1.]])
    next_obs = np.array([[2., 2., 2., 2.]])

    # target = reward + discount * target_qf(next_obs,
    #                                        target_policy(next_obs))
    # target = 3.5 + 0.5 * Q([2,2,2,2], u([2,2,2,2]))
    #        = 3.5 + 0.5 * Q([2,2,2,2], 8)
    #        = 3.5 + 0.5 * 16
    #        = 11.5
    #
    # dloss/dtheta = - 2 * (y - qf(obs, action)) *
    #                d/dtheta (qf(obs, action))
    #              = - 2 * (y - qf([1,1,1,1], 1)) *
    #                d/dtheta (qf(obs, action))
    #              = - 2 * (11.5 - 5) *
    #                d/dtheta (qf(obs, action))
    #              = - 13 * d/dtheta (qf(obs, action))
    feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                   next_obs)
    grads = tf.gradients(algo.qf_loss, algo.qf.get_params_internal())
    # qf_grads = algo.sess.run(
    #     tf.gradients(algo.qf.output, algo.qf.get_vars()))
    expected = [-13. * np.ones_like(v)
                for v in algo.qf.get_param_values()]
    actual = algo.sess.run(grads, feed_dict=feed_dict)
    actual_flat = np.vstack(actual).flatten()
    self.assertTrue(are_np_array_iterables_equal(expected, actual_flat),
                    "Numpy arrays not equal")
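# A standalone TF1 sketch of why every gradient entry is expected to be -13,
# assuming the sum critic behaves like a linear layer with all-ones weights
# over [obs, action]: each input is 1 here, so dQ/dw_i = 1 and
# dloss/dw_i = -2 * (y - Q) * 1 = -2 * (11.5 - 5) = -13. Names are
# illustrative, not the implementation's.
import numpy as np
import tensorflow as tf

obs_action = tf.constant([[1., 1., 1., 1., 1.]])   # [obs, action], all ones
w = tf.Variable(tf.ones([5, 1]))                   # assumed critic weights
q = tf.matmul(obs_action, w)                       # Q([1,1,1,1], 1) = 5
y = tf.constant([[11.5]])                          # Bellman target from above
loss = tf.reduce_mean(tf.square(y - q))
grad = tf.gradients(loss, [w])[0]
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(grad))                          # every entry is -13.0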
def test_qf_loss2(self):
    discount = 0.5
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        discount=discount,
    )
    rewards = np.array([3.5])
    terminals = np.array([0.])
    obs = np.array([[1., 1., 1., 1.]])
    actions = np.array([[2.]])
    next_obs = np.array([[2., 2., 2., 2.]])

    # target = reward + discount * target_qf(next_obs,
    #                                        target_policy(next_obs))
    # target = 3.5 + 0.5 * Q([2,2,2,2], u([2,2,2,2]))
    #        = 3.5 + 0.5 * Q([2,2,2,2], 8)
    #        = 3.5 + 0.5 * 16
    #        = 11.5
    #
    # loss = (target - qf(obs, action))^2
    #      = (target - qf([1,1,1,1], 2))^2
    #      = (target - 6)^2
    #      = (11.5 - 6)^2
    #      = (5.5)^2
    #      = 30.25
    feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                   next_obs)
    actual = algo.sess.run(algo.qf_loss, feed_dict=feed_dict)
    self.assertEqual(30.25, actual)
    self.assertEqual(np.float32, type(actual))
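# A quick numpy check of the loss arithmetic in the comment above: a mean
# squared Bellman error over a single sample, assuming
# qf_loss = mean((ys - Q(obs, action))^2).
import numpy as np

target = 11.5
q_value = 6.       # Q([1,1,1,1], 2) under the sum critic
loss = np.mean((target - q_value) ** 2)
print(loss)        # 30.25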