Example No. 1
    def build_model(self):
        """
        Build the main networks.
        
        To improve the critic network, we compute the cross-entropy loss
        between the projection onto the support z of the target
        y = r_t + gamma * Q_target( s_{t+1}, A(s_{t+1}) ) and the Q-value
        distribution at time t, Q(s_t, a_t) (where A(.) is the output of the
        actor network).

        To improve the actor network, we apply the policy gradient:
        Grad = grad( Q(s_t, A(s_t)) ) * grad( A(s_t) )
        """

        # Compute A(s_t)
        self.actions = build_actor(self.state_ph,
                                   trainable=True,
                                   scope='learner_actor')

        # Compute Q(s_t, a_t)
        self.Q_distrib_given_actions = build_critic(self.state_ph,
                                                    self.action_ph,
                                                    trainable=True,
                                                    reuse=False,
                                                    scope='learner_critic')

        # Compute Q(s_t, A(s_t)) with the same network
        self.Q_distrib_suggested_actions = build_critic(self.state_ph,
                                                        self.actions,
                                                        trainable=True,
                                                        reuse=True,
                                                        scope='learner_critic')

        # Turn the distribution into its expected value Qval(s_t, A(s_t))
        self.Q_values_suggested_actions = tf.reduce_sum(
            self.z * self.Q_distrib_suggested_actions, axis=1)
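
A minimal sketch of how the training operations described in the docstring might be built on top of these tensors, assuming the projected target distribution is fed in as target_distrib and the variable lists come from the 'learner_critic' / 'learner_actor' scopes; the function name, learning rates and the epsilon inside the log are illustrative, not taken from the original code.

import tensorflow as tf

def build_losses(Q_distrib_given_actions, Q_values_suggested_actions,
                 target_distrib, critic_vars, actor_vars,
                 critic_lr=1e-3, actor_lr=1e-4):
    # Critic: cross-entropy between the target distribution (already
    # projected onto the support z) and the predicted distribution of the
    # taken action
    critic_loss = -tf.reduce_mean(tf.reduce_sum(
        tf.stop_gradient(target_distrib) *
        tf.log(Q_distrib_given_actions + 1e-8), axis=1))
    critic_train_op = tf.train.AdamOptimizer(critic_lr).minimize(
        critic_loss, var_list=critic_vars)

    # Actor: maximize the expected Q-value of the suggested actions by
    # minimizing its negative, so the optimizer backpropagates through the
    # critic into the actor weights (the policy gradient of the docstring)
    actor_loss = -tf.reduce_mean(Q_values_suggested_actions)
    actor_train_op = tf.train.AdamOptimizer(actor_lr).minimize(
        actor_loss, var_list=actor_vars)

    return critic_train_op, actor_train_op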
Example No. 2
    def build_model(self):
        """
        Build the main networks.
        
        To improve the critic network, we minimize the classical squared
        TD-error:
        TDerr = [ r_t + gamma * Q_target(s_{t+1}, A(s_{t+1})) - Q(s_t, a_t) ]²
        (where A(.) is the output of the actor network).

        To improve the actor network, we apply the policy gradient:
        Grad = grad( Q(s_t, A(s_t)) ) * grad( A(s_t) )
        """

        # Compute A(s_t)
        self.actions = build_actor(self.state_ph,
                                   trainable=True,
                                   scope='actor')

        # Compute Q(s_t, a_t)
        self.q_values_of_given_actions = build_critic(self.state_ph,
                                                      self.action_ph,
                                                      trainable=True,
                                                      reuse=False,
                                                      scope='critic')
        # Compute Q(s_t, A(s_t)) with the same network
        self.q_values_of_suggested_actions = build_critic(self.state_ph,
                                                          self.actions,
                                                          trainable=True,
                                                          reuse=True,
                                                          scope='critic')
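
A minimal sketch of the corresponding DDPG-style training operations, assuming a target_q tensor holding r_t + gamma * Q_target(s_{t+1}, A(s_{t+1})) and variable lists collected from the 'actor' / 'critic' scopes; here the actor update spells out the chain rule from the docstring with tf.gradients (names and learning rates are assumptions).

import tensorflow as tf

def build_ddpg_train_ops(q_values_of_given_actions,
                         q_values_of_suggested_actions,
                         target_q, actions, actor_vars, critic_vars,
                         critic_lr=1e-3, actor_lr=1e-4):
    # Critic: minimize the squared TD-error between the target
    # y_t = r_t + gamma * Q_target(s_{t+1}, A(s_{t+1})) and Q(s_t, a_t)
    td_error = tf.stop_gradient(target_q) - q_values_of_given_actions
    critic_loss = tf.reduce_mean(tf.square(td_error))
    critic_train_op = tf.train.AdamOptimizer(critic_lr).minimize(
        critic_loss, var_list=critic_vars)

    # Actor: explicit chain rule grad(Q(s_t, A(s_t))) * grad(A(s_t)); the
    # gradient of Q w.r.t. the suggested actions is pushed backwards through
    # the actor, and the minus sign turns gradient descent into ascent
    action_grads = tf.gradients(q_values_of_suggested_actions, actions)[0]
    actor_grads = tf.gradients(actions, actor_vars, -action_grads)
    actor_train_op = tf.train.AdamOptimizer(actor_lr).apply_gradients(
        list(zip(actor_grads, actor_vars)))

    return critic_train_op, actor_train_op

The same actor update is often written more compactly as minimizing -Q(s_t, A(s_t)), which lets the optimizer apply the chain rule implicitly.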
Example No. 3
    def build_networks(self):
        """
        Build the main network that predicts the Q-value distribution of a
        given state.
        
        Also build the operation to compute Q(s_t, a_t) for the gradient
        descent.
        Reminder:
            if simple DQN:
                y_t = r_t + gamma * max_a Q_target(s_{t+n}, a)
                    = r_t + gamma * Q_target( s_{t+n}, argmax_a Q_target(s_{t+n}, a) )
            elif double DQN:
                y_t = r_t + gamma * Q_target( s_{t+n}, argmax_a Q(s_{t+n}, a) )
            TD-error = y_t - Q(s_t, a_t)
        """
        # Compute Q(s_t, .)
        self.Q_st = build_critic(self.state_ph,
                                 trainable=True,
                                 reuse=False,
                                 scope='main_network')

        # Compute Q(s_t, a_t)
        ind = tf.stack((tf.range(Settings.BATCH_SIZE), self.action_ph), axis=1)
        self.Q_st_at = tf.gather_nd(self.Q_st, ind)

        # Compute Q_target(s_{t+n}, .)
        Q_target_st_n = build_critic(self.next_state_ph,
                                     trainable=False,
                                     reuse=False,
                                     scope='target_network')

        # Without double DQN, the best next action is chosen by the target
        # network; with double DQN, it is chosen by the main network
        if not Settings.DOUBLE_DQN:
            # Reuse Q_target(s_{t+n}, .)
            Q_st_n_max_a = Q_target_st_n
        else:
            # Compute Q(s_{t+n}, .)
            Q_st_n_max_a = build_critic(self.next_state_ph,
                                        trainable=True,
                                        reuse=True,
                                        scope='main_network')

        # Turn the distributions into expected values before taking the argmax
        if Settings.DISTRIBUTIONAL:
            Q_st_n_max_a = tf.reduce_sum(self.z * Q_st_n_max_a, axis=2)

        # Compute argmax_a Q[_target](s_{t+n}, a)
        best_at_n = tf.argmax(Q_st_n_max_a, 1, output_type=tf.int32)

        # Compute Q_target(s_{t+n}, argmax_a Q[_target](s_{t+n}, a))
        ind = tf.stack((tf.range(Settings.BATCH_SIZE), best_at_n), axis=1)
        self.Q_target_st_n_at_n = tf.gather_nd(Q_target_st_n, ind)
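
For the non-distributional case, a minimal sketch of how the target y_t and the loss might be assembled from Q_st_at and Q_target_st_n_at_n, assuming reward and not_done tensors of shape [BATCH_SIZE]; the Huber loss and the hyperparameter values are illustrative choices.

import tensorflow as tf

def build_dqn_loss(Q_st_at, Q_target_st_n_at_n, reward, not_done,
                   gamma=0.99, n=1, learning_rate=1e-4):
    # n-step target y_t = r_t + gamma**n * Q_target(s_{t+n}, a*), with the
    # bootstrap term masked out on terminal transitions
    y_t = reward + not_done * (gamma ** n) * tf.stop_gradient(Q_target_st_n_at_n)

    # Huber loss on the TD-error y_t - Q(s_t, a_t)
    loss = tf.losses.huber_loss(labels=y_t, predictions=Q_st_at)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return loss, train_op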
Example No. 4
    def build_target(self):
        """
        Build the operation to compute max_a Q(s_{t+1}, a) for the gradient
        descent.

        Reminder: TD-error = r_t + gamma * max_a Q(s_{t+1}, a) - Q(s_t, a_t)
        """

        # Compute Q(s_{t+1}, .)
        self.target_Q_distrib = build_critic(self.next_state_ph,
                                             trainable=False,
                                             scope='target_network')

        # Turn the distributions into values, then select the action that
        # maximizes the target Q-value: a* = argmax_a Q(s_{t+1}, a)
        self.target_Q_value = tf.reduce_sum(self.z * self.target_Q_distrib,
                                            axis=2)
        self.target_action = tf.argmax(self.target_Q_value,
                                       1,
                                       output_type=tf.int32)

        # Select the distribution of the action with the maximum target
        # Q-value: max_a Q(s_{t+1}, a) == Q(s_{t+1}, a*)
        ind = tf.stack((tf.range(self.batch_size), self.target_action), axis=1)
        self.target_Q_distrib_optimal_action = tf.gather_nd(
            self.target_Q_distrib, ind)
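
The target distribution selected here still has to be projected onto the fixed support z before being compared to Q(s_t, a_t). A NumPy sketch of that categorical (C51-style) projection, assuming z is uniformly spaced between v_min and v_max; the function name and the loop-based implementation are purely illustrative.

import numpy as np

def project_on_support(rewards, not_dones, target_distrib, z, gamma=0.99):
    # rewards, not_dones: shape [batch]; target_distrib: [batch, nb_atoms]
    # z: [nb_atoms], assumed uniformly spaced between v_min and v_max
    batch_size, nb_atoms = target_distrib.shape
    v_min, v_max = z[0], z[-1]
    delta_z = z[1] - z[0]

    # Shifted atoms Tz_j = r + gamma * z_j, clipped to the support
    tz = np.clip(rewards[:, None] + gamma * not_dones[:, None] * z[None, :],
                 v_min, v_max)
    b = (tz - v_min) / delta_z              # continuous index of each atom
    lower = np.floor(b).astype(int)
    upper = np.ceil(b).astype(int)

    projected = np.zeros((batch_size, nb_atoms))
    for i in range(batch_size):
        for j in range(nb_atoms):
            if lower[i, j] == upper[i, j]:
                # Tz_j falls exactly on an atom: give it all the mass
                projected[i, lower[i, j]] += target_distrib[i, j]
            else:
                # Split the mass between the two neighbouring atoms
                projected[i, lower[i, j]] += target_distrib[i, j] * (upper[i, j] - b[i, j])
                projected[i, upper[i, j]] += target_distrib[i, j] * (b[i, j] - lower[i, j])
    return projected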
Example No. 5
    def build_target(self):
        """
        Build the target networks.
        """
        # Compute A(s_{t+1})
        self.target_next_actions = build_actor(self.next_state_ph,
                                               trainable=False,
                                               scope='learner_target_actor')

        # Compute Q_target( s_{t+1}, A(s_{t+1}) )
        self.Q_distrib_next = build_critic(self.next_state_ph, self.target_next_actions,
                                           trainable=False, reuse=False,
                                           scope='learner_target_critic')
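
These target networks are typically kept in sync with the learner networks through a hard copy at initialization followed by Polyak averaging. A minimal sketch, assuming variables are created in the same order in both scopes and that the scope names match those used above.

import tensorflow as tf

def build_target_update_ops(learner_scope, target_scope, tau=0.001):
    learner_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                     scope=learner_scope)
    target_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                    scope=target_scope)

    # Hard copy at initialization, then soft update
    # theta_target <- tau * theta + (1 - tau) * theta_target
    init_op = tf.group(*[t.assign(l)
                         for l, t in zip(learner_vars, target_vars)])
    soft_update_op = tf.group(*[t.assign(tau * l + (1.0 - tau) * t)
                                for l, t in zip(learner_vars, target_vars)])
    return init_op, soft_update_op

The usual pattern is to run the init op once after tf.global_variables_initializer() and the soft update after every training step.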
Example No. 6
    def build_target(self):
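        """
        Build the target network and the operation that selects, in its
        output Q_target(s_{t+1}, .), the distribution of the best next action
        self.best_next_action (chosen by the main network, double-DQN style).
        """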

        # Compute Q_target(s_{t+1}, .)
        self.Q_distrib_next_target = build_critic(self.next_state_ph,
                                                  trainable=False,
                                                  reuse=False,
                                                  scope='target_network')

        # Compute Q_target(s_{t+1}, argmax_a Q(s_{t+1}, a))
        ind = tf.stack((tf.range(Settings.BATCH_SIZE), self.best_next_action),
                       axis=1)
        self.Q_distrib_next_target_best_action = tf.gather_nd(
            self.Q_distrib_next_target, ind)
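
The tf.stack / tf.gather_nd pattern used here selects one distribution per batch element. A small NumPy analogue with illustrative shapes, showing what the indexing computes.

import numpy as np

# Indices [[0, a_0], [1, a_1], ...] pick, for each batch element i, the row
# of its chosen action a_i; this is what
# tf.gather_nd(Q, tf.stack((tf.range(BATCH_SIZE), actions), axis=1))
# computes on a [batch, nb_actions, nb_atoms] critic output
q_distrib = np.arange(2 * 3 * 4).reshape(2, 3, 4)  # [batch=2, actions=3, atoms=4]
actions = np.array([2, 0])
print(q_distrib[np.arange(2), actions])            # shape [2, 4]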
Example No. 7
    def build_main_network(self):
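        """
        Build the main network that predicts the Q-value distribution of a
        given state, the operation that selects the distribution of the taken
        action Q(s_t, a_t), and the choice of the best next action
        argmax_a Q(s_{t+1}, a).
        """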

        # Compute Q(s_t, .)
        self.Q_distrib = build_critic(self.state_ph,
                                      trainable=True,
                                      reuse=False,
                                      scope='main_network')

        # Compute Q(s_t, a_t)
        ind = tf.stack((tf.range(Settings.BATCH_SIZE), self.action_ph), axis=1)
        self.Q_distrib_main_action = tf.gather_nd(self.Q_distrib, ind)

        # Compute Q(s_{t+1}, .)
        self.Q_distrib_next = build_critic(self.next_state_ph,
                                           trainable=True,
                                           reuse=True,
                                           scope='main_network')

        self.Q_value_next = tf.reduce_sum(self.z * self.Q_distrib_next, axis=2)

        # Compute argmax_a Q(s_{t+1}, a)
        self.best_next_action = tf.argmax(self.Q_value_next,
                                          1,
                                          output_type=tf.int32)
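
For reference, one possible shape of build_critic consistent with how it is called in these snippets: a network returning, for every action, a softmax distribution over the atoms of the support z, i.e. a tensor of shape [batch, nb_actions, nb_atoms]. The layer sizes and default argument values are assumptions.

import tensorflow as tf

def build_critic(states, trainable, reuse, scope, nb_actions=4, nb_atoms=51):
    # Hypothetical distributional critic: an MLP that outputs a probability
    # distribution over the atoms for each action
    with tf.variable_scope(scope, reuse=reuse):
        hidden = tf.layers.dense(states, 256, activation=tf.nn.relu,
                                 trainable=trainable)
        logits = tf.layers.dense(hidden, nb_actions * nb_atoms,
                                 trainable=trainable)
        logits = tf.reshape(logits, [-1, nb_actions, nb_atoms])
        return tf.nn.softmax(logits)   # softmax over the last axis (atoms)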
Example No. 8
    def build_main_network(self):
        """
        Build the main network that predicts the Q-value distribution of a
        given state.
        
        Also build the operation to compute Q(s_t, a_t) for the gradient
        descent.
        Reminder: TD-error = r_t + gamma * max_a Q(s_{t+1}, a) - Q(s_t, a_t)
        """

        # Compute Q(s_t, .)
        self.Q_distrib = build_critic(self.state_ph,
                                      trainable=True,
                                      scope='main_network')

        # Select only the Q-distribution of the action given in the experience,
        # i.e. compute Q(s_t, a_t)
        ind = tf.stack((tf.range(self.batch_size), self.action_ph), axis=1)
        self.Q_distrib_taken_action = tf.gather_nd(self.Q_distrib, ind)
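
A hypothetical placeholder setup consistent with how these tensors are used in the discrete-action snippets; the state size, dtypes and the fixed batch size are guesses, not taken from the original code.

import tensorflow as tf

class AgentInputs:
    def build_placeholders(self, state_size=8, batch_size=64):
        self.batch_size = batch_size
        self.state_ph = tf.placeholder(tf.float32, [batch_size, state_size],
                                       name='state')
        self.action_ph = tf.placeholder(tf.int32, [batch_size], name='action')
        self.reward_ph = tf.placeholder(tf.float32, [batch_size],
                                        name='reward')
        self.next_state_ph = tf.placeholder(tf.float32,
                                            [batch_size, state_size],
                                            name='next_state')
        self.not_done_ph = tf.placeholder(tf.float32, [batch_size],
                                          name='not_done')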
Example No. 9
    def build_target(self):
        """
        Build the target networks.
        """
        # Compute A(s_{t+1})
        self.target_next_actions = build_actor(self.next_state_ph,
                                               trainable=False,
                                               scope='learner_target_actor')
        #print("3 intermediate action shape {}".format(self.target_next_actions.shape))

        self.target_next_actions = tf.expand_dims(self.target_next_actions, 1)
        #print("3 intermediate action shape {}".format(self.target_next_actions.shape))

        # Compute Q_target( s_{t+1}, A(s_{t+1}) )
        self.Q_distrib_next = build_critic(self.next_state_ph,
                                           self.target_next_actions,
                                           trainable=False,
                                           reuse=False,
                                           scope='learner_target_critic',
                                           sess=self.sess)
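
Similarly, one possible shape of build_actor for the continuous-action snippets: an MLP whose tanh output is scaled to the action range, a common choice for DDPG/D4PG-style agents. The sizes and the action bound are assumptions.

import tensorflow as tf

def build_actor(states, trainable, scope, action_size=2, action_bound=1.0):
    # Hypothetical deterministic actor network
    with tf.variable_scope(scope):
        hidden = tf.layers.dense(states, 256, activation=tf.nn.relu,
                                 trainable=trainable)
        raw_actions = tf.layers.dense(hidden, action_size,
                                      activation=tf.nn.tanh,
                                      trainable=trainable)
        return action_bound * raw_actions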