Example #1
  def _compute_z_variance(self, z=None, logits=None, q=None, normalize=True):
    """Compute the return distribution variance. Only one of `z` and `logits` must be set
    Args:
      z: tf.Tensor, shape `[None, n_actions, N]`. Probabilities of the return atoms
      logits: tf.Tensor, shape `[None, n_actions, N]`. Logits of the return distribution
      q: tf.Tensor, shape `[None, n_actions]`. Optional precomputed Q-function values
      normalize: bool. If True, normalize the variance values such that the mean of the
        return variances of all actions in a given state is 1.
    Returns:
      tf.Tensor of shape `[None, n_actions]`
    """
    assert (z is None) != (logits is None), "Exactly one of 'z' and 'logits' must be set"

    if logits is not None:
      z = tf_ops.softmax(logits, axis=-1)
    if q is None:
      q = tf.reduce_sum(z * self.bins, axis=-1, keepdims=True)   # out: [None, n_actions, 1]
    else:
      q = tf.expand_dims(q, axis=-1)                             # out: [None, n_actions, 1]

    # Var(Z) = sum_j p_j * (z_j - E[Z])^2
    center  = self.bins - q                                   # out: [None, n_actions, N]
    z_var   = tf.square(center) * z                           # out: [None, n_actions, N]
    z_var   = tf.reduce_sum(z_var, axis=-1)                   # out: [None, n_actions]

    # Normalize the variance across the action axis
    if normalize:
      mean  = tf.reduce_mean(z_var, axis=-1, keepdims=True)   # out: [None, 1]
      z_var = z_var / mean                                    # out: [None, n_actions]

    return z_var
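
A minimal standalone sketch of the same variance computation, using the standard tf.nn.softmax in place of the project's tf_ops.softmax wrapper; the support values in `bins` and all tensor sizes below are made up for illustration.

import numpy as np
import tensorflow as tf

bins   = tf.constant(np.linspace(-10.0, 10.0, num=51), dtype=tf.float32)  # atom support, [N]
logits = tf.constant(np.random.randn(4, 6, 51), dtype=tf.float32)  # [batch=4, n_actions=6, N=51]
z      = tf.nn.softmax(logits, axis=-1)                            # atom probabilities

q      = tf.reduce_sum(z * bins, axis=-1, keepdims=True)           # E[Z],   [4, 6, 1]
z_var  = tf.reduce_sum(tf.square(bins - q) * z, axis=-1)           # Var[Z], [4, 6]
z_var  = z_var / tf.reduce_mean(z_var, axis=-1, keepdims=True)     # per-state mean -> 1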
Example #2
  def _act_train(self, agent_net, name):
    # Compute the Q-function as the expectation of Z; output shape [None, n_actions]
    z       = tf_ops.softmax(agent_net, axis=-1)
    q       = tf.reduce_sum(z * self.bins, axis=-1)
    action  = tf.argmax(q, axis=-1, output_type=tf.int32, name=name)

    # Add debug summaries for the variance of the return
    z_var   = self._compute_z_variance(z=z, q=q, normalize=True)  # [None, n_actions]
    tf.summary.scalar("debug/z_var", tf.reduce_mean(z_var))
    tf.summary.histogram("debug/a_rho2", z_var)

    return dict(action=action)
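
Because `normalize=True` is passed above, the logged histogram values are relative spreads: within every state, the variances average to 1 across actions. A quick NumPy check of that contract, with fake variance values and illustrative shapes:

import numpy as np

z_var = np.random.rand(4, 3) + 0.1                  # fake variances, [batch, n_actions]
z_var = z_var / z_var.mean(axis=-1, keepdims=True)  # same normalization as above
assert np.allclose(z_var.mean(axis=-1), 1.0)        # per-state mean over actions is 1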
Example #3
  def _select_target(self, target_net):
    """Select the C51 target distributions - use the greedy action from E[Z]
    Args:
      target_net: `tf.Tensor`, shape `[None, n_actions, N]. The tensor output from `self._nn_model()`
        for the target
    Returns:
      `tf.Tensor` of shape `[None, N]`
    """
    n_actions   = self.n_actions
    target_z    = tf_ops.softmax(target_net, axis=-1)

    # Select the target Z distribution for the greedy action; output shape [None, N]
    target_q    = tf.reduce_sum(target_z * self.bins, axis=-1)            # out: [None, n_actions]
    target_act  = tf.argmax(target_q, axis=-1, output_type=tf.int32)      # out: [None]
    target_mask = tf.one_hot(target_act, n_actions, dtype=tf.float32)     # out: [None, n_actions]
    target_mask = tf.expand_dims(target_mask, axis=-1)                    # out: [None, n_actions, 1]
    target_z    = tf.reduce_sum(target_z * target_mask, axis=1)           # out: [None, N]
    return target_z
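
The one-hot mask above selects one distribution per batch row without gather ops. An isolated sketch with made-up sizes (batch=4, n_actions=3, N=51), checked against the equivalent tf.gather_nd indexing; tf.nn.softmax again stands in for tf_ops.softmax.

import numpy as np
import tensorflow as tf

target_z   = tf.nn.softmax(tf.constant(np.random.randn(4, 3, 51), dtype=tf.float32), axis=-1)
target_act = tf.constant([0, 2, 1, 1], dtype=tf.int32)             # greedy actions, [batch]

# One-hot mask and sum over the action axis, as in _select_target
mask   = tf.expand_dims(tf.one_hot(target_act, 3, dtype=tf.float32), axis=-1)
masked = tf.reduce_sum(target_z * mask, axis=1)                    # [batch, N]

# Equivalent direct indexing
idx      = tf.stack([tf.range(4), target_act], axis=-1)            # [batch, 2]
gathered = tf.gather_nd(target_z, idx)                             # [batch, N]
# `masked` and `gathered` agree up to float rounding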