Example No. 1
    def build_model(self):
        """
        Build the main networks.
        
        To improve the critic network, we want to compute the classical squared TD error
        TDerr = [ r_t + gamma * Q_target(s_{t+1}, A(s_{t+1})) - Q(s_t, a_t) ]²
        (where A(.) is the output of the actor network).

        To improve the actor network, we apply the policy gradient:
        Grad = grad( Q(s_t, A(s_t)) ) * grad( A(s_t) )
        """

        # Compute A(s_t)
        self.actions = build_actor(self.state_ph,
                                   trainable=True,
                                   scope='actor')

        # Compute Q(s_t, a_t)
        self.q_values_of_given_actions = build_critic(self.state_ph,
                                                      self.action_ph,
                                                      trainable=True,
                                                      reuse=False,
                                                      scope='critic')
        # Compute Q(s_t, A(s_t)) with the same network
        self.q_values_of_suggested_actions = build_critic(self.state_ph,
                                                          self.actions,
                                                          trainable=True,
                                                          reuse=True,
                                                          scope='critic')
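
These two Q tensors are what the training ops would normally be built from. The following is a minimal sketch of that step under stated assumptions: reward_ph, not_done_ph, q_target_next, GAMMA, CRITIC_LR and ACTOR_LR are illustrative names not defined in the original code, and get_vars is reused from Example No. 3.

    def build_train_operations(self):
        """
        Sketch only: assemble the critic and actor training ops from the
        tensors created in build_model(). reward_ph, not_done_ph,
        q_target_next, GAMMA, CRITIC_LR and ACTOR_LR are assumptions
        for illustration.
        """
        # TD target: r_t + gamma * Q_target(s_{t+1}, A(s_{t+1}))
        td_target = self.reward_ph + GAMMA * self.not_done_ph * self.q_target_next

        # Critic loss: mean squared TD error against Q(s_t, a_t)
        critic_loss = tf.reduce_mean(
            tf.square(td_target - self.q_values_of_given_actions))
        self.critic_train_op = tf.train.AdamOptimizer(CRITIC_LR).minimize(
            critic_loss, var_list=get_vars('critic', trainable=True))

        # Actor loss: ascend Q(s_t, A(s_t)) by minimizing its negative
        actor_loss = -tf.reduce_mean(self.q_values_of_suggested_actions)
        self.actor_train_op = tf.train.AdamOptimizer(ACTOR_LR).minimize(
            actor_loss, var_list=get_vars('actor', trainable=True))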
Example No. 2
    def build_model(self):
        """
        Build the main networks.
        
        To improve the critic network, we want to compute the cross-entropy loss
        between the projection onto the support z of the target
        y = r_t + gamma * Q_target( s_{t+1}, A(s_{t+1}) ) and the Q-value
        distribution at time t, Q(s_t, a_t) (where A(.) is the output of the
        actor network).

        To improve the actor network, we apply the policy gradient:
        Grad = grad( Q(s_t, A(s_t)) ) * grad( A(s_t) )
        """

        # Compute A(s_t)
        self.actions = build_actor(self.state_ph,
                                   trainable=True,
                                   scope='learner_actor')

        # Compute Q(s_t, a_t)
        self.Q_distrib_given_actions = build_critic(self.state_ph,
                                                    self.action_ph,
                                                    trainable=True,
                                                    reuse=False,
                                                    scope='learner_critic')

        # Compute Q(s_t, A(s_t)) with the same network
        self.Q_distrib_suggested_actions = build_critic(self.state_ph,
                                                        self.actions,
                                                        trainable=True,
                                                        reuse=True,
                                                        scope='learner_critic')

        # Turn the distribution into its expected value Qval(s_t, A(s_t))
        self.Q_values_suggested_actions = tf.reduce_sum(
            self.z * self.Q_distrib_suggested_actions, axis=1)
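
In this distributional variant the critic loss is a cross-entropy against the projected target distribution rather than a squared TD error. Below is a minimal sketch of both losses, assuming a tensor self.target_distrib that already holds the projection of y onto the support z (the projection step itself is not shown); CRITIC_LR and ACTOR_LR are illustrative hyperparameters, not part of the original code.

    def build_train_operations(self):
        """
        Sketch only: distributional critic loss and actor loss built from
        the tensors created in build_model(). self.target_distrib, CRITIC_LR
        and ACTOR_LR are assumptions.
        """
        # Cross-entropy between the projected target distribution and Q(s_t, a_t)
        critic_loss = -tf.reduce_mean(tf.reduce_sum(
            self.target_distrib * tf.log(self.Q_distrib_given_actions + 1e-8),
            axis=1))
        self.critic_train_op = tf.train.AdamOptimizer(CRITIC_LR).minimize(
            critic_loss, var_list=get_vars('learner_critic', trainable=True))

        # Actor loss: ascend the expected value Qval(s_t, A(s_t))
        actor_loss = -tf.reduce_mean(self.Q_values_suggested_actions)
        self.actor_train_op = tf.train.AdamOptimizer(ACTOR_LR).minimize(
            actor_loss, var_list=get_vars('learner_actor', trainable=True))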
Example No. 3
    def build_actor(self):
        """
        Build a copy of the learner's actor network to allow the agent to
        interact with the environment on its own.
        """
        scope = 'worker_agent_' + str(self.n_agent)
        self.state_ph = tf.placeholder(dtype=tf.float32,
                                       shape=[None, *Settings.STATE_SIZE],
                                       name='state_ph')

        # Get the policy prediction network
        self.policy = build_actor(self.state_ph, trainable=False, scope=scope)
        self.vars = get_vars(scope, trainable=False)
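
Because the worker's copy is built non-trainable, its weights have to be refreshed from the learner periodically. A minimal sketch of that copy operation, assuming the learner's actor lives in a 'learner_actor' scope as in the other examples:

    def build_update(self):
        """
        Sketch only: ops that copy the learner's actor weights into this
        worker's frozen copy. The 'learner_actor' scope name is an assumption
        carried over from the other examples.
        """
        learner_vars = get_vars('learner_actor', trainable=True)
        self.update_op = [worker_var.assign(learner_var)
                          for worker_var, learner_var in zip(self.vars, learner_vars)]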
Example No. 4
    def build_target(self):
        """
        Build the target networks.
        """
        # Compute A(s_{t+1})
        self.target_next_actions = build_actor(self.next_state_ph,
                                               trainable=False,
                                               scope='learner_target_actor')

        # Compute Q_target( s_{t+1}, A(s_{t+1}) )
        self.Q_distrib_next = build_critic(self.next_state_ph, self.target_next_actions,
                                           trainable=False, reuse=False,
                                           scope='learner_target_critic')
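
Target networks like these are usually kept close to the main networks with a soft (Polyak) update rather than retrained. A minimal sketch of such an update, where TAU is an assumed hyperparameter and the scope names follow the examples above:

    def build_target_update(self):
        """
        Sketch only: soft update of the target networks towards the main
        networks. TAU is an assumed hyperparameter; scope names follow the
        other examples.
        """
        main_vars = (get_vars('learner_actor', trainable=True)
                     + get_vars('learner_critic', trainable=True))
        target_vars = (get_vars('learner_target_actor', trainable=False)
                       + get_vars('learner_target_critic', trainable=False))
        self.target_update_op = [
            target.assign(TAU * main + (1 - TAU) * target)
            for main, target in zip(main_vars, target_vars)]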
Example No. 5
    def build_target(self):
        """
        Build the target networks.
        """
        # Compute A(s_{t+1})
        self.target_next_actions = build_actor(self.next_state_ph,
                                               trainable=False,
                                               scope='learner_target_actor')
        #print("3 intermediate action shape {}".format(self.target_next_actions.shape))

        self.target_next_actions = tf.expand_dims(self.target_next_actions, 1)
        #print("3 intermediate action shape {}".format(self.target_next_actions.shape))

        # Compute Q_target( s_{t+1}, A(s_{t+1}) )
        self.Q_distrib_next = build_critic(self.next_state_ph,
                                           self.target_next_actions,
                                           trainable=False,
                                           reuse=False,
                                           scope='learner_target_critic',
                                           sess=self.sess)
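
Downstream, the target distribution Q_distrib_next is combined with the observed reward to form the Bellman-shifted support before being projected back onto the fixed support z (as described in Example No. 2). A minimal sketch of that shift, assuming reward_ph / not_done_ph placeholders, GAMMA, support bounds V_MIN / V_MAX, and a support tensor self.z that are not shown in this snippet:

    def build_target_support(self):
        """
        Sketch only: the Bellman-shifted support associated with
        Q_distrib_next, prior to projection onto the fixed support self.z.
        reward_ph, not_done_ph, GAMMA, V_MIN and V_MAX are assumptions.
        """
        # z' = r_t + gamma * z, clipped to the support bounds
        shifted_z = (tf.expand_dims(self.reward_ph, 1)
                     + GAMMA * tf.expand_dims(self.not_done_ph, 1) * self.z)
        self.shifted_z = tf.clip_by_value(shifted_z, V_MIN, V_MAX)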