Example #1
def test_target_net_weight_init():
    """
    Testing that the online & target net weights are the same after
    agent is created
    """
    tf.reset_default_graph()
    with tf.Session() as sess:
        agent, batch, env = setup_agent(sess, double_q=True)
        obs = batch['next_observation']

        online_vals, target_vals = sess.run(
            [agent.online_q_values, agent.target_q_values], {
                agent.observation: obs,
                agent.next_observation: obs
            })

        #  equal because we initialize target net weights in the init of DQN
        np.testing.assert_array_equal(online_vals, target_vals)

        online_vars = get_tf_params('online')
        target_vars = get_tf_params('target')

        o_vars, t_vars = sess.run([online_vars, target_vars])

        for o_v, t_v in zip(o_vars, t_vars):
            np.testing.assert_array_equal(o_v, t_v)
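
The test above leans on two project helpers that are not shown, setup_agent and get_tf_params. As a rough guide only, here is a minimal sketch of what get_tf_params could look like, assuming it simply collects the trainable variables under a scope and sorts them by name so that the online and target lists zip up layer by layer (the real helper may differ):

import tensorflow as tf


def get_tf_params(scope):
    """Return trainable variables under a scope, sorted for pairwise comparison."""
    params = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope=scope
    )
    #  sorting by name keeps zip(online_vars, target_vars) aligned layer by layer
    return sorted(params, key=lambda var: var.name)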
Example #2
def test_copy_ops():
    """
    Testing that different values of tau are working correctly
    """
    tf.reset_default_graph()
    with tf.Session() as sess:
        agent, batch, env = setup_agent(sess, double_q=True)

        #  at this point our target and online networks are the same
        #  (this is tested above in test_target_net_weight_init)

        #  do a train operation to change the online variables
        agent.learn()

        online_vars = get_tf_params('online')
        target_vars = get_tf_params('target')

        #  get the variable values before we do the copy op
        old_o_vars, old_t_vars = sess.run([online_vars, target_vars])

        #  do the copy operation with tau at 0.5
        _ = sess.run(agent.copy_ops, {agent.tau: 0.5})

        #  get the new variable values
        new_o_vars, new_t_vars = sess.run([online_vars, target_vars])

        #  check that the online variables were not changed by the copy op
        check_o_vars = old_o_vars
        for v1, v2 in zip(check_o_vars, new_o_vars):
            np.testing.assert_array_equal(v1, v2)

        #  calculate what the new target net vars should be
        check_t_vars = []
        for v1, v2 in zip(old_o_vars, old_t_vars):
            new_arr = 0.5 * v1 + 0.5 * v2
            check_t_vars.append(new_arr)

        #  check that the new target vars are what they should be
        for v1, v2 in zip(check_t_vars, new_t_vars):
            np.testing.assert_array_equal(v1, v2)

        #  repeat the same logic with tau = 1
        _ = sess.run(agent.copy_ops, {agent.tau: 1.0})

        #  get the new variable values
        new_o_vars, new_t_vars = sess.run([online_vars, target_vars])

        #  check that the new target vars are what they should be
        for v1, v2 in zip(new_o_vars, new_t_vars):
            np.testing.assert_array_equal(v1, v2)
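
test_copy_ops pins down the behaviour expected from make_copy_ops: with tau=0.5 the new target weights are the equal blend 0.5 * online + 0.5 * target, and with tau=1.0 they become an exact copy of the online weights. A minimal sketch of a make_copy_ops that would satisfy this test, assuming it returns one assign op per variable pair plus a tau placeholder (the project's actual implementation is not shown):

import tensorflow as tf


def make_copy_ops(online_params, target_params):
    """Soft update ops: target <- tau * online + (1 - tau) * target."""
    tau = tf.placeholder(tf.float32, shape=(), name='tau')

    copy_ops = []
    for online, target in zip(online_params, target_params):
        blended = tau * online + (1.0 - tau) * target
        copy_ops.append(target.assign(blended))

    return copy_ops, tau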
Example #3
    def build_learning_graph(self):
        with tf.variable_scope('target', reuse=False):
            self.target_q_values = feed_forward(
                'target',
                self.next_observation,
                self.observation_space.shape,
                self.layers,
                self.num_actions,
            )

        self.online_params = get_tf_params('online')
        self.target_params = get_tf_params('target')

        self.copy_ops, self.tau = make_copy_ops(
            self.online_params,
            self.target_params
        )

        with tf.variable_scope('bellman_target'):
            self.q_selected_actions = tf.reduce_sum(
                self.online_q_values * tf.one_hot(
                    self.selected_action_indicies,
                    self.num_actions
                ),
                1
            )

            if self.double_q:
                online_actions = tf.argmax(self.online_next_obs_q, axis=1)

                unmasked_next_state_max_q = tf.reduce_sum(
                    self.target_q_values * tf.one_hot(
                        online_actions,
                        self.num_actions
                    ),
                    axis=1,
                    keepdims=True
                )

            else:
                unmasked_next_state_max_q = tf.reduce_max(
                    self.target_q_values,
                    axis=1,
                    keepdims=True
                )

            self.next_state_max_q = tf.where(
                self.terminal,
                tf.zeros_like(unmasked_next_state_max_q),
                unmasked_next_state_max_q,
                name='terminal_mask'
            )

            self.bellman = self.reward + self.discount * self.next_state_max_q

            #  batch norm requires some reshaping with a known rank
            #  reshape the input into batch norm, then flatten in loss
            #  training=True to normalize each batch
            #  training=False to use historical statistics
            bellman_norm = tf.layers.batch_normalization(
                tf.reshape(self.bellman, (-1, 1)),
                center=self.batch_norm_center,
                training=self.batch_norm_training,
                trainable=self.batch_norm_trainable
            )

        with tf.variable_scope('optimization'):
            error = tf.losses.huber_loss(
                tf.reshape(bellman_norm, (-1,)),
                self.q_selected_actions,
                scope='huber_loss'
            )

            loss = tf.reduce_mean(error)

            if self.learning_rate_decay:
                self.learning_rate = tf.train.exponential_decay(
                    self.learning_rate,
                    global_step=self.learn_step_tensor,
                    decay_steps=self.total_steps,
                    decay_rate=self.learning_rate_decay,
                    staircase=False,
                    name='learning_rate'
                )

            optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

            with tf.variable_scope('gradient_clipping'):

                grads_and_vars = optimizer.compute_gradients(
                    loss,
                    var_list=self.online_params
                )

                for idx, (grad, var) in enumerate(grads_and_vars):
                    if grad is not None:
                        grads_and_vars[idx] = (
                            tf.clip_by_norm(grad, self.gradient_norm_clip),
                            var
                        )

                        self.learn_summaries.append(
                            tf.summary.histogram(
                                '{}_gradient'.format(var.name.replace(':', '_')),
                                grad
                            )
                        )

                self.train_op = optimizer.apply_gradients(grads_and_vars)

        self.act_summaries.extend([
            tf.summary.scalar('learning_rate', self.learning_rate),
            tf.summary.scalar('epsilon', self.epsilon),
            tf.summary.scalar('explore_toggle', self.explore_toggle),
        ])

        self.act_summaries.extend([
            tf.summary.histogram(
                self.online_params[-1].name.replace(':', '_'),
                self.online_params[-1]),
            tf.summary.histogram(
                self.online_params[-2].name.replace(':', '_'),
                self.online_params[-2]),

            tf.summary.histogram(
                self.target_params[-1].name.replace(':', '_'),
                self.target_params[-1]),
            tf.summary.histogram(
                self.target_params[-2].name.replace(':', '_'),
                self.target_params[-2]),
        ])

        self.learn_summaries.extend([
            tf.summary.histogram('bellman', self.bellman),
            tf.summary.histogram('bellman_norm', bellman_norm),
            tf.summary.scalar('loss', loss),
            tf.summary.histogram('unmasked_next_state_max_q', unmasked_next_state_max_q),
            tf.summary.histogram('next_state_max_q', self.next_state_max_q),
            tf.summary.histogram('target_q_values', self.target_q_values),
        ])

        self.act_summaries = tf.summary.merge(self.act_summaries)
        self.learn_summaries = tf.summary.merge(self.learn_summaries)

        self.sess.run(
            tf.global_variables_initializer()
        )

        #  initialize the target net weights with the online weights
        self.sess.run(
            self.copy_ops,
            {self.tau: 1.0}
        )
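
build_learning_graph also depends on a feed_forward network builder that is not included above. A minimal sketch, assuming it is a plain fully connected stack mapping an observation to one Q-value per discrete action; the layer sizes, activations and naming are illustrative guesses, not the project's actual architecture:

import tensorflow as tf


def feed_forward(scope, input_tensor, input_shape, layers, output_nodes):
    """Fully connected stack: observation -> one Q-value per action."""
    net = tf.reshape(input_tensor, (-1,) + tuple(input_shape))

    for idx, nodes in enumerate(layers):
        net = tf.layers.dense(
            net,
            units=nodes,
            activation=tf.nn.relu,
            name='{}_dense_{}'.format(scope, idx)
        )

    #  linear output layer - raw Q-values, no activation
    return tf.layers.dense(
        net,
        units=output_nodes,
        activation=None,
        name='{}_output'.format(scope)
    )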