def build_model(self):
    """
    Build the main networks.

    To improve the critic network, we want to compute the classical TD-error
        TDerr = [ r_t + gamma * Q_target(s_{t+1}, A(s_{t+1})) - Q(s_t, a_t) ]²
    (with A(.) the output of the actor network).

    To improve the actor network, we apply the policy gradient:
        Grad = grad( Q(s_t, A(s_t)) ) * grad( A(s_t) )
    """
    # Compute A(s_t)
    self.actions = build_actor(self.state_ph, trainable=True, scope='actor')

    # Compute Q(s_t, a_t)
    self.q_values_of_given_actions = build_critic(
        self.state_ph, self.action_ph, trainable=True, reuse=False, scope='critic')

    # Compute Q(s_t, A(s_t)) with the same network
    self.q_values_of_suggested_actions = build_critic(
        self.state_ph, self.actions, trainable=True, reuse=True, scope='critic')
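The training operations themselves are not shown above. As a minimal sketch of how they could be wired onto these tensors at the end of build_model (assumptions, not the original code: the placeholders self.reward_ph and self.not_done_ph, the tensor self.q_values_next holding the target critic's output for s_{t+1}, and the Settings.DISCOUNT / Settings.CRITIC_LEARNING_RATE / Settings.ACTOR_LEARNING_RATE constants are all hypothetical names):

    # Sketch only (assumed names): reward_ph, not_done_ph, q_values_next and
    # the Settings constants below are hypothetical, not part of the original code.

    # Critic update: minimize the squared TD-error
    targets = self.reward_ph + Settings.DISCOUNT * self.not_done_ph * self.q_values_next
    critic_loss = tf.reduce_mean(tf.square(targets - self.q_values_of_given_actions))
    self.critic_train_op = tf.train.AdamOptimizer(Settings.CRITIC_LEARNING_RATE).minimize(
        critic_loss, var_list=get_vars('critic', trainable=True))

    # Actor update: ascend the gradient of Q(s_t, A(s_t)) w.r.t. the actor weights
    actor_loss = -tf.reduce_mean(self.q_values_of_suggested_actions)
    self.actor_train_op = tf.train.AdamOptimizer(Settings.ACTOR_LEARNING_RATE).minimize(
        actor_loss, var_list=get_vars('actor', trainable=True))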
def build_model(self):
    """
    Build the main networks.

    To improve the critic network, we want to compute the cross-entropy loss
    between the projection on the support z of the target
        y = r_t + gamma * Q_target( s_{t+1}, A(s_{t+1}) )
    and the distribution predicted at time t, Q(s_t, a_t)
    (with A(.) the output of the actor network).

    To improve the actor network, we apply the policy gradient:
        Grad = grad( Q(s_t, A(s_t)) ) * grad( A(s_t) )
    """
    # Compute A(s_t)
    self.actions = build_actor(self.state_ph, trainable=True, scope='learner_actor')

    # Compute Q(s_t, a_t)
    self.Q_distrib_given_actions = build_critic(
        self.state_ph, self.action_ph, trainable=True, reuse=False,
        scope='learner_critic')

    # Compute Q(s_t, A(s_t)) with the same network
    self.Q_distrib_suggested_actions = build_critic(
        self.state_ph, self.actions, trainable=True, reuse=True,
        scope='learner_critic')

    # Turn the distribution into the expected value Qval(s_t, A(s_t))
    self.Q_values_suggested_actions = tf.reduce_sum(
        self.z * self.Q_distrib_suggested_actions, axis=1)
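The distributional losses built on top of these tensors are not shown either. A hedged sketch, again as lines that could follow inside build_model: the critic minimizes the cross-entropy against the target distribution already projected on the support z, and the actor maximizes the expected value computed above. Here self.target_distrib (the projected target) and the Settings learning rates are hypothetical names.

    # Sketch only (assumed names): self.target_distrib stands for the target
    # y = r_t + gamma * Q_target(s_{t+1}, A(s_{t+1})) already projected on the
    # support z (categorical projection); it is not computed here.

    # Critic update: cross-entropy between the projected target and Q(s_t, a_t)
    critic_loss = -tf.reduce_mean(tf.reduce_sum(
        self.target_distrib * tf.log(self.Q_distrib_given_actions + 1e-8), axis=1))
    self.critic_train_op = tf.train.AdamOptimizer(Settings.CRITIC_LEARNING_RATE).minimize(
        critic_loss, var_list=get_vars('learner_critic', trainable=True))

    # Actor update: maximize the expected value of the suggested actions
    actor_loss = -tf.reduce_mean(self.Q_values_suggested_actions)
    self.actor_train_op = tf.train.AdamOptimizer(Settings.ACTOR_LEARNING_RATE).minimize(
        actor_loss, var_list=get_vars('learner_actor', trainable=True))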
def build_actor(self):
    """
    Build a copy of the learner's actor network to allow the agent to
    interact with the environment on its own.
    """
    scope = 'worker_agent_' + str(self.n_agent)

    self.state_ph = tf.placeholder(dtype=tf.float32,
                                   shape=[None, *Settings.STATE_SIZE],
                                   name='state_ph')

    # Get the policy prediction network
    self.policy = build_actor(self.state_ph, trainable=False, scope=scope)
    self.vars = get_vars(scope, trainable=False)
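A short usage sketch for this network, assuming the worker holds a TensorFlow session in self.sess and adds its exploration noise outside the graph:

    # Sketch only: pick an action for a single observation s (a numpy array of
    # shape Settings.STATE_SIZE); s[None] adds the batch dimension.
    action = self.sess.run(self.policy, feed_dict={self.state_ph: s[None]})[0]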
def build_target(self):
    """
    Build the target networks.
    """
    # Compute A(s_{t+1})
    self.target_next_actions = build_actor(self.next_state_ph,
                                           trainable=False,
                                           scope='learner_target_actor')

    # Compute Q_target( s_{t+1}, A(s_{t+1}) )
    self.Q_distrib_next = build_critic(self.next_state_ph,
                                       self.target_next_actions,
                                       trainable=False, reuse=False,
                                       scope='learner_target_critic')
def build_target(self):
    """
    Build the target networks.
    """
    # Compute A(s_{t+1})
    self.target_next_actions = build_actor(self.next_state_ph,
                                           trainable=False,
                                           scope='learner_target_actor')

    # Add a dimension so the actions match the shape the critic expects
    self.target_next_actions = tf.expand_dims(self.target_next_actions, 1)

    # Compute Q_target( s_{t+1}, A(s_{t+1}) )
    self.Q_distrib_next = build_critic(self.next_state_ph,
                                       self.target_next_actions,
                                       trainable=False, reuse=False,
                                       scope='learner_target_critic',
                                       sess=self.sess)
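Neither version shows how the target networks are kept in sync with the learner networks; a common choice is a soft update of the target variables toward the learner variables. The sketch below is an assumption, not the original code: it reuses get_vars and invents a Settings.UPDATE_TARGET_RATE constant for the mixing rate tau.

    # Sketch only (assumed names): soft update of the target networks,
    # v_target <- tau * v + (1 - tau) * v_target, tau = Settings.UPDATE_TARGET_RATE
    main_vars = get_vars('learner_actor', trainable=True) + \
                get_vars('learner_critic', trainable=True)
    target_vars = get_vars('learner_target_actor', trainable=False) + \
                  get_vars('learner_target_critic', trainable=False)
    self.update_target_op = tf.group(*[
        v_target.assign(Settings.UPDATE_TARGET_RATE * v +
                        (1 - Settings.UPDATE_TARGET_RATE) * v_target)
        for v, v_target in zip(main_vars, target_vars)])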