def test_target_net_weight_init():
    """ Testing that the online & target net weights are the same after the agent is created """
    tf.reset_default_graph()
    with tf.Session() as sess:
        agent, batch, env = setup_agent(sess, double_q=True)

        obs = batch['next_observation']
        online_vals, target_vals = sess.run(
            [agent.online_q_values, agent.target_q_values],
            {agent.observation: obs,
             agent.next_observation: obs}
        )
        # equal because we initialize the target net weights in the init of DQN
        np.testing.assert_array_equal(online_vals, target_vals)

        online_vars = get_tf_params('online')
        target_vars = get_tf_params('target')

        o_vars, t_vars = sess.run([online_vars, target_vars])

        for o_v, t_v in zip(o_vars, t_vars):
            np.testing.assert_array_equal(o_v, t_v)
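# The tests here rely on a get_tf_params helper defined elsewhere in the repo.
# A minimal sketch of what it could look like - the name suffix and the
# sort-by-name detail are assumptions, used so that the online and target
# variables line up pairwise when compared or copied:
def get_tf_params_sketch(scope):
    """Hypothetical stand-in for get_tf_params - trainable variables under a scope."""
    params = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope
    )
    return sorted(params, key=lambda v: v.name)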
def test_copy_ops():
    """ Testing that different values of tau work correctly """
    tf.reset_default_graph()
    with tf.Session() as sess:
        agent, batch, env = setup_agent(sess, double_q=True)

        # at this point our target and online networks are the same
        # (this is tested above in test_target_net_weight_init)

        # do a train operation to change the online variables
        agent.learn()

        online_vars = get_tf_params('online')
        target_vars = get_tf_params('target')

        # get the variable values before we do the copy op
        old_o_vars, old_t_vars = sess.run([online_vars, target_vars])

        # do the copy operation with tau at 0.5
        _ = sess.run(agent.copy_ops, {agent.tau: 0.5})

        # get the new variable values
        new_o_vars, new_t_vars = sess.run([online_vars, target_vars])

        # check that the online variables are unchanged by the copy op
        for v1, v2 in zip(old_o_vars, new_o_vars):
            np.testing.assert_array_equal(v1, v2)

        # calculate what the new target net vars should be
        check_t_vars = []
        for v1, v2 in zip(old_o_vars, old_t_vars):
            check_t_vars.append(0.5 * v1 + 0.5 * v2)

        # check that the new target vars are what they should be
        for v1, v2 in zip(check_t_vars, new_t_vars):
            np.testing.assert_array_equal(v1, v2)

        # repeat the same logic with tau = 1
        _ = sess.run(agent.copy_ops, {agent.tau: 1.0})

        # get the new variable values
        new_o_vars, new_t_vars = sess.run([online_vars, target_vars])

        # with tau = 1 the target vars should exactly match the online vars
        for v1, v2 in zip(new_o_vars, new_t_vars):
            np.testing.assert_array_equal(v1, v2)
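# make_copy_ops is defined elsewhere in the repo.  A minimal sketch of the
# soft-update logic this test exercises (the name suffix and signature are
# assumptions): each op sets target = tau * online + (1 - tau) * target, so
# tau = 1.0 is a hard copy and tau = 0.5 averages the two parameter sets.
def make_copy_ops_sketch(online_params, target_params):
    """Hypothetical stand-in for make_copy_ops - builds tau-weighted assign ops."""
    tau = tf.placeholder(tf.float32, shape=(), name='tau')
    copy_ops = [
        tf.assign(target, tau * online + (1.0 - tau) * target)
        for online, target in zip(online_params, target_params)
    ]
    return copy_ops, tau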
def build_learning_graph(self):
    with tf.variable_scope('target', reuse=False):
        self.target_q_values = feed_forward(
            'target',
            self.next_observation,
            self.observation_space.shape,
            self.layers,
            self.num_actions,
        )

    self.online_params = get_tf_params('online')
    self.target_params = get_tf_params('target')

    self.copy_ops, self.tau = make_copy_ops(
        self.online_params,
        self.target_params
    )

    with tf.variable_scope('bellman_target'):
        self.q_selected_actions = tf.reduce_sum(
            self.online_q_values * tf.one_hot(
                self.selected_action_indicies,
                self.num_actions
            ),
            1
        )

        if self.double_q:
            online_actions = tf.argmax(self.online_next_obs_q, axis=1)

            unmasked_next_state_max_q = tf.reduce_sum(
                self.target_q_values * tf.one_hot(
                    online_actions,
                    self.num_actions
                ),
                axis=1,
                keepdims=True
            )

        else:
            unmasked_next_state_max_q = tf.reduce_max(
                self.target_q_values,
                axis=1,
                keepdims=True
            )

        self.next_state_max_q = tf.where(
            self.terminal,
            tf.zeros_like(unmasked_next_state_max_q),
            unmasked_next_state_max_q,
            name='terminal_mask'
        )

        self.bellman = self.reward + self.discount * self.next_state_max_q

        # batch norm requires some reshaping with a known rank
        # reshape the input into batch norm, then flatten in the loss
        # training=True to normalize each batch
        # training=False to use historical statistics
        bellman_norm = tf.layers.batch_normalization(
            tf.reshape(self.bellman, (-1, 1)),
            center=self.batch_norm_center,
            training=self.batch_norm_training,
            trainable=self.batch_norm_trainable
        )

    with tf.variable_scope('optimization'):
        error = tf.losses.huber_loss(
            tf.reshape(bellman_norm, (-1,)),
            self.q_selected_actions,
            scope='huber_loss'
        )

        loss = tf.reduce_mean(error)

        if self.learning_rate_decay:
            self.learning_rate = tf.train.exponential_decay(
                self.learning_rate,
                global_step=self.learn_step_tensor,
                decay_steps=self.total_steps,
                decay_rate=self.learning_rate_decay,
                staircase=False,
                name='learning_rate'
            )

        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

        with tf.variable_scope('gradient_clipping'):
            grads_and_vars = optimizer.compute_gradients(
                loss,
                var_list=self.online_params
            )

            for idx, (grad, var) in enumerate(grads_and_vars):
                if grad is not None:
                    grads_and_vars[idx] = (
                        tf.clip_by_norm(grad, self.gradient_norm_clip),
                        var
                    )
                    self.learn_summaries.append(tf.summary.histogram(
                        '{}_gradient'.format(var.name.replace(':', '_')),
                        grad
                    ))

            self.train_op = optimizer.apply_gradients(grads_and_vars)

    self.act_summaries.extend([
        tf.summary.scalar('learning_rate', self.learning_rate),
        tf.summary.scalar('epsilon', self.epsilon),
        tf.summary.scalar('explore_toggle', self.explore_toggle),
    ])

    self.act_summaries.extend([
        tf.summary.histogram(
            self.online_params[-1].name.replace(':', '_'),
            self.online_params[-1]),
        tf.summary.histogram(
            self.online_params[-2].name.replace(':', '_'),
            self.online_params[-2]),
        tf.summary.histogram(
            self.target_params[-1].name.replace(':', '_'),
            self.target_params[-1]),
        tf.summary.histogram(
            self.target_params[-2].name.replace(':', '_'),
            self.target_params[-2]),
    ])

    self.learn_summaries.extend([
        tf.summary.histogram('bellman', self.bellman),
        tf.summary.histogram('bellman_norm', bellman_norm),
        tf.summary.scalar('loss', loss),
        tf.summary.histogram('unmasked_next_state_max_q',
                             unmasked_next_state_max_q),
        tf.summary.histogram('next_state_max_q', self.next_state_max_q),
        tf.summary.histogram('target_q_values', self.target_q_values),
    ])

    self.act_summaries = tf.summary.merge(self.act_summaries)
    self.learn_summaries = tf.summary.merge(self.learn_summaries)

    self.sess.run(tf.global_variables_initializer())

    # initialize the target net weights with the online weights
    self.sess.run(self.copy_ops, {self.tau: 1.0})
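# The double-Q branch in build_learning_graph selects the greedy next-state
# action with the online network but evaluates it with the target network.
# A small NumPy sketch of that target calculation - the Q-values below are
# made up purely for illustration:
def double_q_target_sketch():
    """Hypothetical example - compares the double-Q and vanilla max targets."""
    import numpy as np

    online_next_q = np.array([[1.0, 3.0]])   # online net Q(s', a)
    target_next_q = np.array([[2.5, 0.8]])   # target net Q(s', a)

    online_action = online_next_q.argmax(axis=1)              # -> [1]
    double_q = target_next_q[np.arange(1), online_action]     # -> [0.8]
    vanilla_q = target_next_q.max(axis=1)                      # -> [2.5]

    # double Q-learning decouples action selection from evaluation,
    # which reduces the maximisation bias of the vanilla max target
    np.testing.assert_array_equal(online_action, [1])
    np.testing.assert_array_equal(double_q, [0.8])
    np.testing.assert_array_equal(vanilla_q, [2.5])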