    def test_qf_targets2(self):
        discount = 0.5
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
            discount=discount,
        )

        rewards = np.array([3.5])
        terminals = np.array([0.])
        obs = np.array([[1., 1., 1., 1.]])
        actions = np.array([[2.]])
        next_obs = np.array([[2., 2., 2., 2.]])

        # target = reward + discount * target_qf(next_obs,
        #                                            target_policy(next_obs))
        # target = 3.5 + 0.5 * Q([2,2,2,2], u([2,2,2,2]))
        #        = 3.5 + 0.5 * Q([2,2,2,2], 8)
        #        = 3.5 + 0.5 * 16
        #        = 11.5
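        # (This target is recomputed with plain numpy after this test.)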
        feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                       next_obs)
        self.assertNpEqual(np.array([[11.5]]),
                           algo.sess.run(algo.ys, feed_dict=feed_dict))
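
    def _illustrative_target(self, rewards, terminals, next_obs, discount):
        # Illustrative sketch only (a hypothetical helper, not part of the
        # DDPG API): recomputes the Bellman target checked above in
        # test_qf_targets2 with plain numpy, assuming the "sum" policy
        # returns sum(obs), the "sum" critic returns sum(obs) + action, and
        # that terminals mask the bootstrapped term (the tests above only
        # exercise terminals = 0).
        next_actions = next_obs.sum(axis=1, keepdims=True)  # u(s') = sum(s')
        next_q = next_obs.sum(axis=1, keepdims=True) + next_actions
        # target = r + (1 - terminal) * discount * Q'(s', u(s'))
        return rewards.reshape(-1, 1) + (
            (1. - terminals).reshape(-1, 1) * discount * next_q)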

    def test_qf_targets(self):
        discount = 0.5
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
            discount=discount,
        )

        rewards = np.array([3., 4.])
        terminals = np.array([0., 0.])
        obs = np.array([[1., 1., 1., 1.], [1., 1., 1., 1.]])
        actions = np.array([[-0.5], [-0.5]])
        next_obs = np.array([[1., 1., 1., 1.], [1., 1., 1., 1.]])

        # target = reward + discount * target_qf(next_obs,
        #                                            target_policy(next_obs))
        # target1 = 3 + 0.5 * Q([1,1,1,1], u([1,1,1,1]))
        #         = 3 + 0.5 * Q([1,1,1,1], 4)
        #         = 3 + 0.5 * 8
        #         = 7
        # target2 = 4 + 0.5 * Q([1,1,1,1], u([1,1,1,1]))
        #         = 4 + 0.5 * 8
        #         = 8
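        # (These targets are recomputed with plain numpy in the test below.)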

        feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                       next_obs)
        self.assertNpEqual(np.array([[7.], [8.]]),
                           algo.sess.run(algo.ys, feed_dict=feed_dict))
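
    def test_illustrative_qf_target_helper(self):
        # Not part of the original suite: exercises only the illustrative
        # helper above with the same numbers as test_qf_targets, i.e.
        # u([1,1,1,1]) = 4, Q'([1,1,1,1], 4) = 8, so the targets are
        # 3 + 0.5 * 8 = 7 and 4 + 0.5 * 8 = 8.
        targets = self._illustrative_target(
            rewards=np.array([3., 4.]),
            terminals=np.array([0., 0.]),
            next_obs=np.ones((2, 4)),
            discount=0.5,
        )
        self.assertNpEqual(np.array([[7.], [8.]]), targets)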

    def test_only_qf_values_change(self):
        discount = 0.5
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
            discount=discount,
        )

        old_qf_values = algo.qf.get_param_values()
        old_qf_copy_values = algo.qf_with_action_input.get_param_values()
        old_policy_values = algo.policy.get_param_values()
        old_target_qf_values = algo.target_qf.get_param_values()
        old_target_policy_values = algo.target_policy.get_param_values()

        rewards = np.array([3.])
        terminals = np.array([0.])
        obs = np.array([[1., 1., 1., 1.]])
        actions = np.array([[-0.5]])
        next_obs = np.array([[1., 1., 1., 1.]])
        feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                       next_obs)
        algo.sess.run(algo.train_qf_op, feed_dict=feed_dict)
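        # (A sketch of how such a Q-function-only train op is typically
        # built follows this test.)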

        new_qf_values = algo.qf.get_param_values()
        new_qf_copy_values = algo.qf_with_action_input.get_param_values()
        new_policy_values = algo.policy.get_param_values()
        new_target_qf_values = algo.target_qf.get_param_values()
        new_target_policy_values = algo.target_policy.get_param_values()

        self.assertTrue(
            are_np_array_iterables_equal(old_policy_values, new_policy_values))
        self.assertFalse(
            are_np_array_iterables_equal(old_qf_values, new_qf_values))
        self.assertFalse(
            are_np_array_iterables_equal(old_qf_copy_values,
                                         new_qf_copy_values))
        self.assertTrue(
            are_np_array_iterables_equal(old_target_policy_values,
                                         new_target_policy_values))
        self.assertTrue(
            are_np_array_iterables_equal(old_target_qf_values,
                                         new_target_qf_values))
        self.assertParamsEqual(algo.qf_with_action_input, algo.qf)
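
    def _illustrative_qf_train_op(self, qf_loss, qf_params,
                                  learning_rate=1e-3):
        # Illustrative sketch only (hypothetical helper, not the DDPG
        # implementation): the behaviour test_only_qf_values_change relies
        # on is typically obtained by restricting the optimizer's var_list
        # to the Q-function's own variables. The policy and the target
        # networks are excluded, so they stay fixed, while
        # qf_with_action_input changes because it shares parameters with qf.
        optimizer = tf.train.AdamOptimizer(learning_rate)
        return optimizer.minimize(qf_loss, var_list=qf_params)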

    def test_qf_gradient(self):
        discount = 0.5
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
            discount=discount,
        )

        rewards = np.array([3.5])
        terminals = np.array([0.])
        obs = np.array([[1., 1., 1., 1.]])
        actions = np.array([[1.]])
        next_obs = np.array([[2., 2., 2., 2.]])

        # target = reward + discount * target_qf(next_obs,
        #                                            target_policy(next_obs))
        # target = 3.5 + 0.5 * Q([2,2,2,2], u([2,2,2,2]))
        #        = 3.5 + 0.5 * Q([2,2,2,2], 8)
        #        = 3.5 + 0.5 * 16
        #        = 11.5
        #
        # dloss/dtheta = - 2 ( y - qf(obs, action)) *
        #                   d/dtheta (qf(obs, action))
        # dloss/dtheta = - 2 ( y - qf([1,1,1,1], 1)) *
        #                   d/dtheta (qf(obs, action))
        # dloss/dtheta = - 2 ( 11.5 - 5) *
        #                   d/dtheta (qf(obs, action))
        # dloss/dtheta = - 13 * d/dtheta (qf(obs, action))
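        # (This arithmetic is replayed with plain numpy after this test.)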
        feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                       next_obs)
        grads = tf.gradients(algo.qf_loss, algo.qf.get_params_internal())
        expected = [-13. * np.ones_like(v) for v in algo.qf.get_param_values()]
        actual = algo.sess.run(grads, feed_dict=feed_dict)
        actual_flat = np.vstack(actual).flatten()
        self.assertTrue(are_np_array_iterables_equal(expected, actual_flat),
                        "Numpy arrays not equal")

    def test_qf_loss2(self):
        discount = 0.5
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
            discount=discount,
        )

        rewards = np.array([3.5])
        terminals = np.array([0.])
        obs = np.array([[1., 1., 1., 1.]])
        actions = np.array([[2.]])
        next_obs = np.array([[2., 2., 2., 2.]])

        # target = reward + discount * target_qf(next_obs,
        #                                            target_policy(next_obs))
        # target = 3.5 + 0.5 * Q([2,2,2,2], u([2,2,2,2]))
        #        = 3.5 + 0.5 * Q([2,2,2,2], 8)
        #        = 3.5 + 0.5 * 16
        #        = 11.5
        #
        # loss = (target - qf(obs, action))^2
        #      = (target - qf([1,1,1,1], 2))^2
        #      = (target - 6)^2
        #      = (11.5 - 6)^2
        #      = (5.5)^2
        #      = 30.25
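        # (This arithmetic is replayed with plain numpy after this test.)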
        feed_dict = algo._qf_feed_dict(rewards, terminals, obs, actions,
                                       next_obs)
        actual = algo.sess.run(algo.qf_loss, feed_dict=feed_dict)
        self.assertEqual(30.25, actual)
        self.assertEqual(np.float32, type(actual))
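
    def test_illustrative_qf_loss_arithmetic(self):
        # Not part of the original suite: replays the arithmetic from the
        # comment in test_qf_loss2 with plain numpy, assuming the "sum"
        # critic and a squared Bellman-error loss.
        target = 3.5 + 0.5 * (2. + 2. + 2. + 2. + 8.)  # = 11.5
        q_value = 1. + 1. + 1. + 1. + 2.               # Q([1,1,1,1], 2) = 6
        loss = (target - q_value) ** 2                 # (11.5 - 6)^2
        self.assertEqual(30.25, loss)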