def create_discrete_action_masking_layer(all_logits, action_masks, action_size):
    """
    Build the masked sampling head for multi-branch discrete actions.

    The concatenated logits and the mask are cut into per-branch column
    slices. Each branch is soft-maxed, offset by EPSILON, multiplied by its
    mask and renormalized so every row sums to one; one action per row is
    then sampled for each branch.

    :param all_logits: The unnormalized action probabilities of all branches,
        concatenated along axis 1
    :param action_masks: The mask multiplied into the probabilities. Must be of
        dimension [None x total_number_of_action]
    :param action_size: A list containing the number of possible actions for
        each branch
    :return: A tuple of three tensors: the sampled actions (one column per
        branch), the concatenated normalized probabilities, and the
        concatenated log of (normalized probabilities + EPSILON)
    """
    num_branches = len(action_size)
    # Column boundaries of each branch inside the concatenated tensors.
    boundaries = [0] + list(np.cumsum(action_size))

    def split_by_branch(tensor):
        # One column slice per branch, in branch order.
        return [
            tensor[:, boundaries[b]:boundaries[b + 1]]
            for b in range(num_branches)
        ]

    def masked_softmax(logits, mask):
        # EPSILON is added before masking, so an allowed action never ends up
        # with an exactly-zero probability.
        return tf.multiply(tf.nn.softmax(logits) + EPSILON, mask)

    per_branch_logits = split_by_branch(all_logits)
    per_branch_masks = split_by_branch(action_masks)

    masked_probs = [
        masked_softmax(logits, mask)
        for logits, mask in zip(per_branch_logits, per_branch_masks)
    ]

    # Renormalize each branch so the surviving probabilities sum to one per row.
    normalized_probs = []
    for masked in masked_probs:
        row_total = tf.reduce_sum(masked, axis=1, keepdims=True)
        normalized_probs.append(tf.divide(masked, row_total))

    # Draw one action per row and per branch from the renormalized distribution.
    samples = []
    for probs in normalized_probs:
        sampling_logits = tf.log(probs + EPSILON)
        samples.append(tf.multinomial(sampling_logits, 1))
    output = tf.concat(samples, axis=1)

    all_probs = tf.concat(normalized_probs, axis=1)
    all_log_probs = tf.concat(
        [tf.log(probs + EPSILON) for probs in normalized_probs], axis=1
    )
    return output, all_probs, all_log_probs
def create_discrete_action_masking_layer(
    branches_logits: List[tf.Tensor],
    action_masks: tf.Tensor,
    action_size: List[int],
) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
    """
    Build the masked sampling head for multi-branch discrete actions.

    For every branch the logits are soft-maxed, offset by EPSILON, multiplied
    by that branch's mask and renormalized so each row sums to one; one action
    per row is then sampled for each branch.

    :param branches_logits: A List of the unnormalized action probabilities
        for each branch
    :param action_masks: The mask multiplied into the probabilities. Must be of
        dimension [None x total_number_of_action]
    :param action_size: A list containing the number of possible actions for
        each branch
    :return: A tuple of three tensors: the sampled actions (one column per
        branch), the concatenated normalized probabilities, and the
        concatenated log of (normalized probabilities + EPSILON)
    """
    num_branches = len(action_size)
    # NOTE(review): the helper is expected to return one mask tensor per entry
    # of action_size -- confirm against ModelUtils.break_into_branches.
    per_branch_masks = ModelUtils.break_into_branches(action_masks, action_size)

    # EPSILON is added before masking, so an allowed action never ends up with
    # an exactly-zero probability.
    masked_probs = []
    for branch in range(num_branches):
        smoothed = tf.nn.softmax(branches_logits[branch]) + EPSILON
        masked_probs.append(tf.multiply(smoothed, per_branch_masks[branch]))

    # Renormalize each branch so the surviving probabilities sum to one per row.
    normalized_probs = []
    for masked in masked_probs:
        row_total = tf.reduce_sum(masked, axis=1, keepdims=True)
        normalized_probs.append(tf.divide(masked, row_total))

    # Draw one action per row and per branch from the renormalized distribution.
    samples = []
    for probs in normalized_probs:
        sampling_logits = tf.log(probs + EPSILON)
        samples.append(tf.multinomial(sampling_logits, 1))
    output = tf.concat(samples, axis=1)

    all_probs = tf.concat(normalized_probs, axis=1)
    all_log_probs = tf.concat(
        [tf.log(probs + EPSILON) for probs in normalized_probs], axis=1
    )
    return output, all_probs, all_log_probs