def _build(self, q_func):
    #####################
    # q values, created with the placeholder that holds the CURRENT obs (i.e., t)
    self.q_t_values = q_func(self.obs_t_ph, self.ac_dim, scope='q_func', reuse=False)
    # select Q(s_t, a_t) for the actions actually taken, via a one-hot mask
    self.q_t = tf.reduce_sum(self.q_t_values * tf.one_hot(self.act_t_ph, self.ac_dim), axis=1)

    #####################
    # target q values, created with the placeholder that holds the NEXT obs (i.e., t+1)
    q_tp1_values = q_func(self.obs_tp1_ph, self.ac_dim, scope='target_q_func', reuse=False)

    if self.double_q:
        # Q2 of the Q-learning portion of the homework.
        # In double Q-learning, the best action is selected using the Q-network that
        # is being updated, but the Q-value for this action is obtained from the
        # target Q-network. See page 5 of https://arxiv.org/pdf/1509.06461.pdf for details.
        # Note: the argmax must be taken over the ONLINE network evaluated at the NEXT
        # obs, so q_func is re-applied to obs_tp1_ph with reuse=True (using
        # self.q_t_values here would select the action from the current obs, a bug).
        q_t_values_tp1 = q_func(self.obs_tp1_ph, self.ac_dim, scope='q_func', reuse=True)
        q_tp1 = tf.reduce_sum(
            q_tp1_values * tf.one_hot(tf.argmax(q_t_values_tp1, axis=1), self.ac_dim),
            axis=1)
    else:
        # q values of the next timestep: max_a' Q_target(s', a')
        q_tp1 = tf.reduce_max(q_tp1_values, axis=1)

    #####################
    # targets for the Bellman error:
    #   r_t + gamma * q_tp1 * (1 - done)
    target_q_t = self.rew_t_ph + self.gamma * q_tp1 * (1 - self.done_mask_ph)
    # stop gradients so the target is treated as a constant during backprop
    target_q_t = tf.stop_gradient(target_q_t)

    #####################
    # Bellman error (i.e. TD error between q_t and target_q_t)
    # This scalar-valued tensor later gets passed into the optimizer, to be minimized.
    # huber_loss (from infrastructure/dqn_utils.py) is used instead of squared error.
    self.total_error = tf.reduce_mean(huber_loss(self.q_t - target_q_t))

    #####################
    # all trainable variables of the Q-function network and target network, respectively,
    # collected via tf.get_collection on the scopes used at the top of this function
    q_func_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q_func')
    target_q_func_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='target_q_func')

    #####################
    # train_fn will be called in order to train the critic (by minimizing the TD error)
    self.learning_rate = tf.placeholder(tf.float32, (), name="learning_rate")
    optimizer = self.optimizer_spec.constructor(
        learning_rate=self.learning_rate, **self.optimizer_spec.kwargs)
    self.train_fn = minimize_and_clip(optimizer,
                                      self.total_error,
                                      var_list=q_func_vars,
                                      clip_val=self.grad_norm_clipping)

    # update_target_fn will be called periodically to copy the Q network to the target Q network
    update_target_fn = []
    for var, var_target in zip(
            sorted(q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        update_target_fn.append(var_target.assign(var))
    self.update_target_fn = tf.group(*update_target_fn)
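# --------------------------------------------------------------------------
# _build relies on two helpers from infrastructure/dqn_utils.py that are not
# shown in this section. The sketch below is an assumption about their
# implementation, modeled on the OpenAI baselines versions this homework code
# is commonly derived from; the actual file may differ in details.

def huber_loss(x, delta=1.0):
    # quadratic for |x| < delta, linear beyond, so large TD errors
    # produce bounded gradients
    return tf.where(
        tf.abs(x) < delta,
        tf.square(x) * 0.5,
        delta * (tf.abs(x) - 0.5 * delta))

def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
    # compute gradients of the objective w.r.t. var_list, clip each
    # gradient's norm to clip_val, then apply the clipped gradients
    gradients = optimizer.compute_gradients(objective, var_list=var_list)
    for i, (grad, var) in enumerate(gradients):
        if grad is not None:
            gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
    return optimizer.apply_gradients(gradients)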
# Variant of _build: double Q implemented with tf.gather_nd, plus an optional
# Polyak (moving-average) target update.
def _build(self, q_func):
    #####################
    # q values, created with the placeholder that holds the CURRENT obs (i.e., t)
    # online network: Q_phi(s, a)
    self.q_t_values = q_func(
        self.obs_t_ph, self.ac_dim, scope='q_func',
        reuse=False)  # reuse=False creates an independent set of variables
    # select Q(s_t, a_t) for the actions actually taken, via a one-hot mask
    self.q_t = tf.reduce_sum(self.q_t_values * tf.one_hot(self.act_t_ph, self.ac_dim), axis=1)

    #####################
    # target q values, created with the placeholder that holds the NEXT obs (i.e., t+1)
    # vector over a': Q_phi'(s', a')
    q_tp1_values = q_func(self.obs_tp1_ph, self.ac_dim, scope='target_q_func', reuse=False)

    if self.double_q:
        # Q2 of the Q-learning portion of the homework.
        # In double Q-learning, the best action is selected using the Q-network that
        # is being updated, but the Q-value for this action is obtained from the
        # target Q-network. See page 5 of https://arxiv.org/pdf/1509.06461.pdf for details.
        # Q_phi'(s', argmax_a'(Q_phi(s', a')))
        q_t_values_for_tp1 = q_func(self.obs_tp1_ph, self.ac_dim, scope='q_func',
                                    reuse=True)  # reuse the online network's variables
        num_sample = tf.shape(self.obs_tp1_ph)[0]
        # build (row, argmax-action) index pairs, then gather the target net's values
        index = tf.stack([
            tf.range(num_sample),
            tf.cast(tf.argmax(q_t_values_for_tp1, axis=1), tf.int32)
        ], axis=1)
        q_tp1 = tf.gather_nd(q_tp1_values, index)
    else:
        # q values of the next timestep:
        # Q_phi'(s', argmax_a'(Q_phi'(s', a'))) = max_a' Q_phi'(s', a')
        q_tp1 = tf.reduce_max(q_tp1_values, axis=1)

    #####################
    # targets for the Bellman error:
    #   r_t + gamma * q_tp1 * (1 - done)
    # Here the target is defined as part of the computation graph; in the
    # actor-critic code, the target was instead computed directly as a numpy value.
    target_q_t = self.rew_t_ph + (1 - self.done_mask_ph) * self.gamma * q_tp1
    target_q_t = tf.stop_gradient(
        target_q_t
    )  # when computing (prediction - target), don't let gradients flow into the target

    #####################
    # Bellman error (i.e. TD error between q_t and target_q_t)
    # This scalar-valued tensor later gets passed into the optimizer, to be minimized.
    # huber_loss (from infrastructure/dqn_utils.py) is used rather than mean squared error.
    self.total_error = tf.reduce_mean(huber_loss(self.q_t - target_q_t))

    #####################
    # all trainable variables of the Q-function network and target network, respectively,
    # collected via tf.get_collection on the scopes used at the top of this function
    q_func_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q_func')
    target_q_func_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='target_q_func')

    #####################
    # train_fn will be called in order to train the critic (by minimizing the TD error)
    self.learning_rate = tf.placeholder(tf.float32, (), name="learning_rate")
    optimizer = self.optimizer_spec.constructor(
        learning_rate=self.learning_rate, **self.optimizer_spec.kwargs)
    self.train_fn = minimize_and_clip(optimizer,
                                      self.total_error,
                                      var_list=q_func_vars,
                                      clip_val=self.grad_norm_clipping)

    #####################
    # update_target_fn will be called periodically to copy the Q network to the target Q network
    update_target_fn = []
    for var, var_target in zip(
            sorted(q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        if not self.hparams['use_polyak']:
            # hard update: copy the online weights directly
            update_target_fn.append(var_target.assign(var))
        else:
            # Polyak (exponential moving average) update with tau = 0.0001
            update_target_fn.append(
                var_target.assign(0.0001 * var + 0.9999 * var_target))
    self.update_target_fn = tf.group(
        *update_target_fn)  # tf.group combines the individual assign ops into a single op
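# --------------------------------------------------------------------------
# A usage sketch for the ops built above. The placeholder names match the
# ones referenced in _build, but the session, feed values, and update
# schedule are hypothetical illustration, not the homework's actual loop.
#
# sess.run(critic.train_fn, feed_dict={
#     critic.obs_t_ph: obs,          # batch of current observations
#     critic.act_t_ph: acs,          # actions taken
#     critic.obs_tp1_ph: next_obs,   # next observations
#     critic.rew_t_ph: rews,         # rewards
#     critic.done_mask_ph: dones,    # 1.0 where the episode terminated
#     critic.learning_rate: lr,
# })
# sess.run(critic.update_target_fn)
#
# Design note: with a hard update, update_target_fn is typically run every
# N training steps; with the Polyak variant it would instead be run every
# step, since each call moves the target weights only a small fraction
# (tau = 0.0001) toward the online weights.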