def _compute_z_variance(self, z=None, logits=None, q=None, normalize=True):
  """Compute the return distribution variance. Exactly one of `z` and `logits` must be set.

  Args:
    z: tf.Tensor, shape `[None, n_actions, N]`. Return atom probabilities
    logits: tf.Tensor, shape `[None, n_actions, N]`. Logits of the return distribution
    q: tf.Tensor, shape `[None, n_actions]`. Optionally provide a tensor for the Q-function
    normalize: bool. If True, normalize the variance values such that the mean of the
      return variances of all actions in a given state is 1
  Returns:
    tf.Tensor of shape `[None, n_actions]`
  """
  assert (z is None) != (logits is None), "Exactly one of 'z' and 'logits' must be set"

  if logits is not None:
    z = tf_ops.softmax(logits, axis=-1)
  if q is None:
    q = tf.reduce_sum(z * self.bins, axis=-1, keepdims=True)      # out: [None, n_actions, 1]
  else:
    # Append an atom axis so that q broadcasts against self.bins; out: [None, n_actions, 1]
    q = tf.expand_dims(q, axis=-1)

  # Var(X) = sum_x p(x) * (x - E[X])^2
  center = self.bins - q                        # out: [None, n_actions, N]
  z_var  = tf.square(center) * z                # out: [None, n_actions, N]
  z_var  = tf.reduce_sum(z_var, axis=-1)        # out: [None, n_actions]

  # Normalize the variance across the action axis
  if normalize:
    mean  = tf.reduce_mean(z_var, axis=-1, keepdims=True)         # out: [None, 1]
    z_var = z_var / mean                                          # out: [None, n_actions]

  return z_var
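
# Illustrative sketch (not part of the agent class): the variance computation above on a
# hypothetical support of N=3 atoms and 2 actions. All numbers below are made up for the
# example; the file's `import tensorflow as tf` is assumed.
_ex_bins  = tf.constant([-1.0, 0.0, 1.0])                          # hypothetical atom support, [N]
_ex_z     = tf.constant([[[0.2, 0.6, 0.2],                         # action 0: mass near 0
                          [0.5, 0.0, 0.5]]])                       # action 1: mass at the edges; [1, 2, 3]
_ex_q     = tf.reduce_sum(_ex_z * _ex_bins, axis=-1, keepdims=True)    # E[Z] per action -> [[[0.0], [0.0]]]
_ex_var   = tf.reduce_sum(tf.square(_ex_bins - _ex_q) * _ex_z, axis=-1)  # Var[Z] per action -> [[0.4, 1.0]]
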
def _act_train(self, agent_net, name):
  # Compute the Q-function as the expectation of Z; out: [None, n_actions]
  z      = tf_ops.softmax(agent_net, axis=-1)
  q      = tf.reduce_sum(z * self.bins, axis=-1)
  action = tf.argmax(q, axis=-1, output_type=tf.int32, name=name)

  # Add debugging plots for the variance of the return
  z_var = self._compute_z_variance(z=z, q=q, normalize=True)       # out: [None, n_actions]
  tf.summary.scalar("debug/z_var", tf.reduce_mean(z_var))
  tf.summary.histogram("debug/a_rho2", z_var)

  return dict(action=action)
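
# Illustrative sketch (not part of the agent class): greedy action selection from E[Z], as in
# _act_train. Hypothetical logits and support; tf.nn.softmax stands in for tf_ops.softmax here.
_ex_logits = tf.constant([[[2.0, 0.0, 0.0],                        # action 0: mass toward the lowest atom
                           [0.0, 0.0, 2.0]]])                      # action 1: mass toward the highest atom
_ex_bins   = tf.constant([-1.0, 0.0, 1.0])
_ex_z      = tf.nn.softmax(_ex_logits, axis=-1)                    # atom probabilities, [1, 2, 3]
_ex_q      = tf.reduce_sum(_ex_z * _ex_bins, axis=-1)              # E[Z] -> approx [[-0.68, 0.68]]
_ex_action = tf.argmax(_ex_q, axis=-1, output_type=tf.int32)       # -> [1]
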
def _select_target(self, target_net):
  """Select the C51 target distributions - use the greedy action w.r.t. E[Z]

  Args:
    target_net: `tf.Tensor`, shape `[None, n_actions, N]`. The tensor output from
      `self._nn_model()` for the target network
  Returns:
    `tf.Tensor` of shape `[None, N]`
  """
  n_actions = self.n_actions
  target_z  = tf_ops.softmax(target_net, axis=-1)

  # Select the target Z distribution of the greedy action w.r.t. E[Z]; out: [None, N]
  target_q    = tf.reduce_sum(target_z * self.bins, axis=-1)        # out: [None, n_actions]
  target_act  = tf.argmax(target_q, axis=-1, output_type=tf.int32)  # out: [None]
  target_mask = tf.one_hot(target_act, n_actions, dtype=tf.float32) # out: [None, n_actions]
  target_mask = tf.expand_dims(target_mask, axis=-1)                # out: [None, n_actions, 1]
  target_z    = tf.reduce_sum(target_z * target_mask, axis=1)       # out: [None, N]

  return target_z
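
# Illustrative sketch (not part of the agent class): the one-hot mask in _select_target picks
# out the distribution of the greedy action, i.e. it is equivalent to indexing
# target_z[i, target_act[i], :] for every batch entry i. All values are hypothetical.
_ex_target_z   = tf.constant([[[0.1, 0.8, 0.1],
                               [0.3, 0.4, 0.3]]])                  # [1, n_actions=2, N=3]
_ex_target_act = tf.constant([1], dtype=tf.int32)                  # greedy action per batch entry
_ex_mask       = tf.expand_dims(tf.one_hot(_ex_target_act, 2, dtype=tf.float32), axis=-1)  # [1, 2, 1]
_ex_selected   = tf.reduce_sum(_ex_target_z * _ex_mask, axis=1)    # -> [[0.3, 0.4, 0.3]]
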