Example #1
    def _train_body(self, states, actions, next_states, rewards, done,
                    weights):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                td_error1, td_error2 = self._compute_td_error_body(
                    states, actions, next_states, rewards, done)
                critic_loss = (
                    tf.reduce_mean(
                        huber_loss(td_error1, delta=self.max_grad) * weights) +
                    tf.reduce_mean(
                        huber_loss(td_error2, delta=self.max_grad) * weights))

            critic_grad = tape.gradient(critic_loss,
                                        self.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(
                zip(critic_grad, self.critic.trainable_variables))

            self._it.assign_add(1)
            with tf.GradientTape() as tape:
                next_actions = self.actor(states)
                actor_loss = -tf.reduce_mean(
                    self.critic([states, next_actions]))

            if tf.math.equal(self._it % self._actor_update_freq, 0):
                actor_grad = tape.gradient(actor_loss,
                                           self.actor.trainable_variables)
                self.actor_optimizer.apply_gradients(
                    zip(actor_grad, self.actor.trainable_variables))

            # Update target networks
            update_target_variables(self.critic_target.weights,
                                    self.critic.weights, self.tau)
            update_target_variables(self.actor_target.weights,
                                    self.actor.weights, self.tau)

            return (actor_loss, critic_loss,
                    tf.abs(td_error1) + tf.abs(td_error2))
Example #2
    def _train_body(self, states, actions, next_states, rewards, done,
                    weights):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                td_errors = self._compute_td_error_body(
                    states, actions, next_states, rewards, done)
                critic_loss = tf.reduce_mean(
                    huber_loss(td_errors, delta=self.max_grad) * weights)

            critic_grad = tape.gradient(critic_loss,
                                        self.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(
                zip(critic_grad, self.critic.trainable_variables))

            with tf.GradientTape() as tape:
                next_action = self.actor(states)
                actor_loss = -tf.reduce_mean(
                    self.critic([states, next_action]))

            actor_grad = tape.gradient(actor_loss,
                                       self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

            # Update target networks
            update_target_variables(self.critic_target.weights,
                                    self.critic.weights, self.tau)
            update_target_variables(self.actor_target.weights,
                                    self.actor.weights, self.tau)

            return actor_loss, critic_loss, td_errors
Example #3
    def _train_body(self, states, actions, next_states, rewards, done,
                    weights):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                td_error1, td_error2 = self._compute_td_error_body(
                    states, actions, next_states, rewards, done)
                critic_loss = (
                    tf.reduce_mean(
                        huber_loss(td_error1, delta=self.max_grad) * weights) +
                    tf.reduce_mean(
                        huber_loss(td_error2, delta=self.max_grad) * weights))

            critic_grad = tape.gradient(critic_loss,
                                        self.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(
                zip(critic_grad, self.critic.trainable_variables))

            self._it.assign_add(1)
            with tf.GradientTape() as tape:
                next_actions = self.actor(states)
                actor_loss = -tf.reduce_mean(self.critic(states, next_actions))

            remainder = tf.math.mod(self._it, self._actor_update_freq)

            def optimize_actor():
                actor_grad = tape.gradient(actor_loss,
                                           self.actor.trainable_variables)
                return self.actor_optimizer.apply_gradients(
                    zip(actor_grad, self.actor.trainable_variables))

            tf.cond(pred=tf.equal(remainder, 0),
                    true_fn=optimize_actor,
                    false_fn=tf.no_op)
            # Update target networks
            update_target_variables(self.critic_target.weights,
                                    self.critic.weights, self.tau)
            update_target_variables(self.actor_target.weights,
                                    self.actor.weights, self.tau)

            return (actor_loss, critic_loss,
                    tf.abs(td_error1) + tf.abs(td_error2))
Example #4
    def _train_body(self, states, actions, next_states, rewards, done,
                    weights):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                if self._enable_categorical_dqn:
                    td_errors = self._compute_td_error_body_distributional(
                        states, actions, next_states, rewards, done)
                    q_func_loss = tf.reduce_mean(
                        huber_loss(tf.negative(td_errors), delta=self.max_grad)
                        * weights)
                else:
                    td_errors = self._compute_td_error_body(
                        states, actions, next_states, rewards, done)
                    q_func_loss = tf.reduce_mean(
                        huber_loss(td_errors, delta=self.max_grad) * weights)

            q_func_grad = tape.gradient(q_func_loss,
                                        self.q_func.trainable_variables)
            self.q_func_optimizer.apply_gradients(
                zip(q_func_grad, self.q_func.trainable_variables))

            return td_errors, q_func_loss
Example #5
    def test_huber_loss(self):
        """Test of huber loss
        huber_loss() allows two types of inputs:
        - `y_target` and `y_pred`
        - `diff`
        """
        # [1, 1] -> [0.5, 0.5]
        loss = huber_loss(np.array([1., 1.]), delta=1.)
        np.testing.assert_array_equal(
            np.array([0.5, 0.5]),
            loss.numpy())

        # [0,0] and [10, 10] -> [9.5, 9.5]
        loss = huber_loss(np.array([10., 10.]), delta=1.)
        np.testing.assert_array_equal(
            np.array([9.5, 9.5]),
            loss.numpy())

        # [0,0] and [-1, -2] -> [0.5, 1.5]
        loss = huber_loss(np.array([-1., -2.]), delta=1.)
        np.testing.assert_array_equal(
            np.array([0.5, 1.5]),
            loss.numpy())
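
The test above pins down the behaviour the training examples rely on: quadratic error inside `delta`, linear beyond it. The sketch below is a minimal `huber_loss` that reproduces those expected values; it is an illustration written for this page, not necessarily the exact tf2rl implementation, and the `huber_loss(x, y=None, delta=1.0)` signature (with the optional second argument covering the `y_target`/`y_pred` form mentioned in the docstring) is an assumption.

import numpy as np
import tensorflow as tf


def huber_loss(x, y=None, delta=1.0):
    # If only `x` is given it is treated as the precomputed difference (`diff`);
    # with both arguments the difference is taken first (assumed order y_target - y_pred,
    # which does not affect the loss since it is symmetric in the difference).
    if y is None:
        diff = tf.cast(x, tf.float32)
    else:
        diff = tf.cast(x, tf.float32) - tf.cast(y, tf.float32)
    abs_diff = tf.abs(diff)
    quadratic = 0.5 * tf.square(abs_diff)           # used where |diff| <= delta
    linear = delta * abs_diff - 0.5 * delta ** 2    # used where |diff| >  delta
    return tf.where(abs_diff <= delta, quadratic, linear)


# Matches the values asserted in test_huber_loss above.
print(huber_loss(np.array([1., 1.]), delta=1.).numpy())    # [0.5 0.5]
print(huber_loss(np.array([10., 10.]), delta=1.).numpy())  # [9.5 9.5]
print(huber_loss(np.array([-1., -2.]), delta=1.).numpy())  # [0.5 1.5]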
Example #6
File: dqn.py Project: ymd-h/tf2rl
    def _train_body(self, states, actions, next_states, rewards, dones, weights):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                td_errors = self._compute_td_error_body(
                    states, actions, next_states, rewards, dones)
                q_func_loss = tf.reduce_mean(
                    huber_loss(td_errors,
                               delta=self.max_grad) * weights)

            q_func_grad = tape.gradient(
                q_func_loss, self.q_func.trainable_variables)
            self.q_func_optimizer.apply_gradients(
                zip(q_func_grad, self.q_func.trainable_variables))

            return td_errors, q_func_loss
Example #7
    def _train_body(self,
                    states,
                    actions,
                    next_states,
                    rewards,
                    done,
                    weights=None):
        with tf.device(self.device):
            batch_size = states.shape[0]
            not_dones = 1. - tf.cast(done, dtype=tf.float32)
            actions = tf.cast(actions, dtype=tf.int32)

            indices = tf.concat(
                values=[tf.expand_dims(tf.range(batch_size), axis=1), actions],
                axis=1)

            with tf.GradientTape(persistent=True) as tape:
                # Compute critic loss
                _, _, next_action_param = self.actor(next_states)
                next_action_prob = next_action_param["prob"]
                next_action_logp = tf.math.log(next_action_prob + 1e-8)
                next_q = tf.minimum(self.qf1_target(next_states),
                                    self.qf2_target(next_states))

                target_q = tf.expand_dims(
                    tf.einsum('ij,ij->i', next_action_prob,
                              next_q - self.alpha * next_action_logp),
                    axis=1)  # Eq.(10)
                target_q = tf.stop_gradient(rewards + not_dones *
                                            self.discount * target_q)

                current_q1 = self.qf1(states)
                current_q2 = self.qf2(states)

                td_loss1 = tf.reduce_mean(
                    huber_loss(target_q - tf.expand_dims(
                        tf.gather_nd(current_q1, indices), axis=1),
                               delta=self.max_grad))
                td_loss2 = tf.reduce_mean(
                    huber_loss(target_q - tf.expand_dims(
                        tf.gather_nd(current_q2, indices), axis=1),
                               delta=self.max_grad))  # Eq.(7)

                # Compute actor loss
                _, _, current_action_param = self.actor(states)
                current_action_prob = current_action_param["prob"]
                current_action_logp = tf.math.log(current_action_prob + 1e-8)

                policy_loss = tf.reduce_mean(
                    tf.einsum(
                        'ij,ij->i', current_action_prob,
                        self.alpha * current_action_logp - tf.stop_gradient(
                            tf.minimum(current_q1, current_q2))))  # Eq.(12)
                mean_ent = tf.reduce_mean(
                    tf.einsum('ij,ij->i', current_action_prob,
                              current_action_logp)) * (-1)

            q1_grad = tape.gradient(td_loss1, self.qf1.trainable_variables)
            self.qf1_optimizer.apply_gradients(
                zip(q1_grad, self.qf1.trainable_variables))
            q2_grad = tape.gradient(td_loss2, self.qf2.trainable_variables)
            self.qf2_optimizer.apply_gradients(
                zip(q2_grad, self.qf2.trainable_variables))

            update_target_variables(self.qf1_target.weights,
                                    self.qf1.weights,
                                    tau=self.tau)
            update_target_variables(self.qf2_target.weights,
                                    self.qf2.weights,
                                    tau=self.tau)

            actor_grad = tape.gradient(policy_loss,
                                       self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

        return ((td_loss1 + td_loss2) / 2., policy_loss, mean_ent,
                tf.reduce_min(current_action_logp),
                tf.reduce_max(current_action_logp))
Example #8
    def _train_body(self, states, actions, next_states, rewards, dones, weights):
        with tf.device(self.device):
            if tf.rank(rewards) == 2:
                rewards = tf.squeeze(rewards, axis=1)
            not_dones = 1. - tf.cast(dones, dtype=tf.float32)

            with tf.GradientTape(persistent=True) as tape:
                # Compute loss of critic Q
                current_q1 = self.qf1([states, actions])
                current_q2 = self.qf2([states, actions])
                vf_next_target = self.vf_target(next_states)

                target_q = tf.stop_gradient(
                    rewards + not_dones * self.discount * vf_next_target)

                td_loss_q1 = tf.reduce_mean(huber_loss(
                    target_q - current_q1, delta=self.max_grad) * weights)
                td_loss_q2 = tf.reduce_mean(huber_loss(
                    target_q - current_q2, delta=self.max_grad) * weights)  # Eq.(7)

                # Compute loss of critic V
                current_v = self.vf(states)

                sample_actions, logp, _ = self.actor(states)  # Resample actions to update V
                current_q1 = self.qf1([states, sample_actions])
                current_q2 = self.qf2([states, sample_actions])
                current_min_q = tf.minimum(current_q1, current_q2)

                target_v = tf.stop_gradient(
                    current_min_q - self.alpha * logp)
                td_errors = target_v - current_v
                td_loss_v = tf.reduce_mean(
                    huber_loss(td_errors, delta=self.max_grad) * weights)  # Eq.(5)

                # Compute loss of policy
                policy_loss = tf.reduce_mean(
                    (self.alpha * logp - current_min_q) * weights)  # Eq.(12)

                # Compute loss of temperature parameter for entropy
                if self.auto_alpha:
                    alpha_loss = -tf.reduce_mean(
                        (self.log_alpha * tf.stop_gradient(logp + self.target_alpha)))

            q1_grad = tape.gradient(td_loss_q1, self.qf1.trainable_variables)
            self.qf1_optimizer.apply_gradients(
                zip(q1_grad, self.qf1.trainable_variables))
            q2_grad = tape.gradient(td_loss_q2, self.qf2.trainable_variables)
            self.qf2_optimizer.apply_gradients(
                zip(q2_grad, self.qf2.trainable_variables))

            vf_grad = tape.gradient(td_loss_v, self.vf.trainable_variables)
            self.vf_optimizer.apply_gradients(
                zip(vf_grad, self.vf.trainable_variables))
            update_target_variables(
                self.vf_target.weights, self.vf.weights, self.tau)

            actor_grad = tape.gradient(
                policy_loss, self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

            if self.auto_alpha:
                alpha_grad = tape.gradient(alpha_loss, [self.log_alpha])
                self.alpha_optimizer.apply_gradients(
                    zip(alpha_grad, [self.log_alpha]))
                self.alpha.assign(tf.exp(self.log_alpha))

            del tape

        return (td_errors, policy_loss, td_loss_v, td_loss_q1,
                tf.reduce_min(logp), tf.reduce_max(logp), tf.reduce_mean(logp))
Example #9
    def _train_body(self,
                    states,
                    actions,
                    next_states,
                    rewards,
                    done,
                    weights=None):
        with tf.device(self.device):
            rewards = tf.squeeze(rewards, axis=1)
            not_done = 1. - tf.cast(done, dtype=tf.float32)

            # Update Critic
            with tf.GradientTape(persistent=True) as tape:
                current_Q1 = self.qf1([states, actions])
                current_Q2 = self.qf2([states, actions])
                vf_next_target = self.vf_target(next_states)

                target_Q = tf.stop_gradient(self.scale_reward * rewards +
                                            not_done * self.discount *
                                            vf_next_target)

                td_loss1 = tf.reduce_mean(
                    huber_loss(target_Q - current_Q1, delta=self.max_grad))
                td_loss2 = tf.reduce_mean(
                    huber_loss(target_Q - current_Q2, delta=self.max_grad))

            q1_grad = tape.gradient(td_loss1, self.qf1.trainable_variables)
            self.qf1_optimizer.apply_gradients(
                zip(q1_grad, self.qf1.trainable_variables))
            q2_grad = tape.gradient(td_loss2, self.qf2.trainable_variables)
            self.qf2_optimizer.apply_gradients(
                zip(q2_grad, self.qf2.trainable_variables))

            del tape

            with tf.GradientTape(persistent=True) as tape:
                current_V = self.vf(states)
                sample_actions, logp = self.actor(states)

                current_Q1 = self.qf1([states, sample_actions])
                current_Q2 = self.qf2([states, sample_actions])
                current_Q = tf.minimum(current_Q1, current_Q2)

                target_V = tf.stop_gradient(current_Q - logp)
                td_errors = target_V - current_V
                vf_loss_t = tf.reduce_mean(
                    huber_loss(td_errors, delta=self.max_grad) * weights)

                # TODO: Add regularizer
                policy_loss = tf.reduce_mean(logp - current_Q1)

            vf_grad = tape.gradient(vf_loss_t, self.vf.trainable_variables)
            self.vf_optimizer.apply_gradients(
                zip(vf_grad, self.vf.trainable_variables))
            update_target_variables(self.vf_target.weights, self.vf.weights,
                                    self.tau)

            actor_grad = tape.gradient(policy_loss,
                                       self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

            del tape

        return (td_errors, policy_loss, vf_loss_t, td_loss1,
                tf.reduce_min(logp), tf.reduce_max(logp))
Example #10
    def _train_body(self, states, actions, next_states, rewards, dones,
                    weights):
        with tf.device(self.device):
            batch_size = states.shape[0]
            not_dones = 1. - tf.cast(dones, dtype=tf.float32)
            actions = tf.cast(actions, dtype=tf.int32)

            indices = tf.concat(
                values=[tf.expand_dims(tf.range(batch_size), axis=1), actions],
                axis=1)

            with tf.GradientTape(persistent=True) as tape:
                # Compute critic loss
                next_action_prob = self.actor(next_states)
                next_action_logp = tf.math.log(next_action_prob + 1e-8)
                next_q = tf.minimum(self.qf1_target(next_states),
                                    self.qf2_target(next_states))

                # Compute the state value function V by directly computing the expectation
                target_q = tf.expand_dims(
                    tf.einsum('ij,ij->i', next_action_prob,
                              next_q - self.alpha * next_action_logp),
                    axis=1)  # Eq.(10)
                target_q = tf.stop_gradient(rewards + not_dones *
                                            self.discount * target_q)

                current_q1 = self.qf1(states)
                current_q2 = self.qf2(states)

                td_loss1 = tf.reduce_mean(
                    huber_loss(target_q - tf.expand_dims(
                        tf.gather_nd(current_q1, indices), axis=1),
                               delta=self.max_grad) * weights)
                td_loss2 = tf.reduce_mean(
                    huber_loss(target_q - tf.expand_dims(
                        tf.gather_nd(current_q2, indices), axis=1),
                               delta=self.max_grad) * weights)  # Eq.(7)

                # Compute actor loss
                current_action_prob = self.actor(states)
                current_action_logp = tf.math.log(current_action_prob + 1e-8)

                policy_loss = tf.reduce_mean(
                    tf.einsum(
                        'ij,ij->i', current_action_prob,
                        self.alpha * current_action_logp -
                        tf.stop_gradient(tf.minimum(current_q1, current_q2))) *
                    weights)  # Eq.(12)
                mean_ent = tf.reduce_mean(
                    tf.einsum('ij,ij->i', current_action_prob,
                              current_action_logp)) * (-1)

                if self.auto_alpha:
                    alpha_loss = -tf.reduce_mean(
                        (self.log_alpha *
                         tf.stop_gradient(current_action_logp +
                                          self.target_alpha)))

            q1_grad = tape.gradient(td_loss1, self.qf1.trainable_variables)
            self.qf1_optimizer.apply_gradients(
                zip(q1_grad, self.qf1.trainable_variables))
            q2_grad = tape.gradient(td_loss2, self.qf2.trainable_variables)
            self.qf2_optimizer.apply_gradients(
                zip(q2_grad, self.qf2.trainable_variables))

            if self.target_hard_update:
                if self.n_training % self.target_update_interval == 0:
                    update_target_variables(self.qf1_target.weights,
                                            self.qf1.weights,
                                            tau=1.)
                    update_target_variables(self.qf2_target.weights,
                                            self.qf2.weights,
                                            tau=1.)
            else:
                update_target_variables(self.qf1_target.weights,
                                        self.qf1.weights,
                                        tau=self.tau)
                update_target_variables(self.qf2_target.weights,
                                        self.qf2.weights,
                                        tau=self.tau)

            actor_grad = tape.gradient(policy_loss,
                                       self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

            if self.auto_alpha:
                alpha_grad = tape.gradient(alpha_loss, [self.log_alpha])
                self.alpha_optimizer.apply_gradients(
                    zip(alpha_grad, [self.log_alpha]))
                self.alpha.assign(tf.exp(self.log_alpha))

        return ((td_loss1 + td_loss2) / 2., policy_loss, mean_ent,
                tf.reduce_min(current_action_logp),
                tf.reduce_max(current_action_logp),
                tf.reduce_mean(current_action_logp))