def call(self, input_tensor, labels=None, training=None):
  """Computes classification log-probabilities and, if labels are given, loss.

  Args:
    input_tensor: float Tensor whose last dimension is the feature size.
    labels: (optional) int32 Tensor of class ids in [0, num_labels).
    training: Boolean indicating whether the call is training or inference.

  Returns:
    A (loss, log_probs) tuple. `loss` is a scalar (0.0 when `labels` is
    None); `log_probs` has `num_labels` entries along the last dimension.
  """
  last_dim = utils.get_shape_list(input_tensor)[-1]
  input_tensor = utils.dropout(input_tensor, self.dropout_prob, training)

  # Lazily create the kernel on the first call, then drop the initializer so
  # the variable cannot be re-initialized later.
  if self.w is None:
    self.w = tf.compat.v1.get_variable(
        name="kernel",
        shape=[last_dim, self.num_labels],
        initializer=self.initializer)
    self.initializer = None
    self._trainable_weights.append(self.w)
  logits = tf.matmul(input_tensor, self.w)

  if self.use_bias:
    if self.b is None:
      self.b = tf.compat.v1.get_variable(
          name="bias",
          shape=[self.num_labels],
          initializer=tf.zeros_initializer)
      self._trainable_weights.append(self.b)
    logits = tf.nn.bias_add(logits, self.b)

  log_probs = tf.nn.log_softmax(logits, axis=-1)

  if labels is not None:
    # Standard negative log-likelihood: select the log-probability of the
    # gold label via a one-hot mask and average over the batch.
    one_hot_labels = tf.one_hot(labels, depth=self.num_labels,
                                dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
  else:
    loss = tf.constant(0.0)

  return loss, log_probs
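
# Illustrative check, not part of the original model code: the hand-rolled
# negative log-likelihood in `call` above (one-hot * log_softmax) matches
# TF's built-in sparse softmax cross-entropy. All values below are made up.
def _nll_equivalence_demo():
  import tensorflow as tf
  logits = tf.constant([[2.0, 0.5, -1.0], [0.1, 0.2, 0.3]])
  labels = tf.constant([0, 2])
  log_probs = tf.nn.log_softmax(logits, axis=-1)
  one_hot_labels = tf.one_hot(labels, depth=3, dtype=tf.float32)
  manual = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
  builtin = tf.nn.sparse_softmax_cross_entropy_with_logits(
      labels=labels, logits=logits)
  # Both routes compute the same per-example loss.
  tf.debugging.assert_near(manual, builtin)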
def original_full_attention(query_layer, key_layer, value_layer,
                            attention_mask, size_per_head,
                            attention_probs_dropout_prob):
  """Full quadratic attention calculation.

  Args:
    query_layer: float Tensor of shape [batch_size, num_attention_heads,
      from_seq_length, size_per_head]
    key_layer: float Tensor of shape [batch_size, num_attention_heads,
      to_seq_length, size_per_head]
    value_layer: float Tensor of shape [batch_size, num_attention_heads,
      to_seq_length, size_per_head]
    attention_mask: (optional) int32 Tensor of shape [batch_size,
      from_seq_length, to_seq_length]. The values should be 1 or 0. The
      attention scores will effectively be set to -infinity for any positions
      in the mask that are 0, and will be unchanged for positions that are 1.
    size_per_head: int. Size of each attention head.
    attention_probs_dropout_prob: float. Dropout probability of the attention
      probabilities.

  Returns:
    float Tensor of shape [batch_size, from_seq_length, num_attention_heads,
      size_per_head].
  """

  # Directly take n^2 dot product between "query" and "key".
  attention_scores = tf.einsum("BNFH,BNTH->BNFT", query_layer, key_layer)
  attention_scores = tf.multiply(attention_scores,
                                 1.0 / np.sqrt(float(size_per_head)))

  if attention_mask is not None:
    # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
    # masked positions, this operation will create a tensor which is 0.0 for
    # positions we want to attend and -10000.0 for masked positions.
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

    # Since we are adding it to the raw scores before the softmax, this is
    # effectively the same as removing these entirely.
    attention_scores += adder

  # Normalize the attention scores to probabilities.
  # `attention_probs` = [B, N, F, T]
  attention_probs = tf.nn.softmax(attention_scores)

  # This is actually dropping out entire tokens to attend to, which might
  # seem a bit unusual, but is taken from the original Transformer paper.
  attention_probs = utils.dropout(attention_probs,
                                  attention_probs_dropout_prob)

  # `context_layer` = [B, F, N, H]
  context_layer = tf.einsum("BNFT,BNTH->BFNH", attention_probs, value_layer)

  return context_layer
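
# A minimal usage sketch, not part of the original module (assumption:
# `utils.dropout` is a no-op at a 0.0 rate). The all-ones mask is given a
# singleton heads axis so it broadcasts against the [B, N, F, T] scores;
# all sizes below are arbitrary.
def _full_attention_shape_demo():
  import tensorflow as tf
  b, n, f, t, h = 2, 4, 8, 8, 16
  query = tf.random.normal([b, n, f, h])
  key = tf.random.normal([b, n, t, h])
  value = tf.random.normal([b, n, t, h])
  mask = tf.ones([b, 1, f, t], dtype=tf.int32)
  context = original_full_attention(
      query, key, value, mask, size_per_head=h,
      attention_probs_dropout_prob=0.0)
  # Output comes back as [batch, from_seq, heads, head_size].
  assert context.shape == (b, f, n, h)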
def call(self,
         layer_input,
         encoder_outputs,
         self_attention_mask,
         attention_mask,
         cache=None,
         decode_i=None,
         training=None):
  """Implements a decoder layer of a transformer in BERT style.

  The layer_norm is taken after self-attention.

  Args:
    layer_input: float Tensor of shape [batch_size, seq_length, hidden_size].
    encoder_outputs: tensors with shape [batch_size, input_length,
      num_hidden_layers, hidden_size]
    self_attention_mask: bias for decoder self-attention layer. [1, 1,
      target_length, target_length]
    attention_mask: bias for encoder-decoder attention layer. [batch_size, 1,
      1, input_length]
    cache: (Used during prediction) A dictionary with tensors containing
      results of previous attentions. The dictionary must have the items:
          {"k": tensor with shape
                [batch_size, max_len, num_attention_heads, size_per_head],
           "v": tensor with shape
                [batch_size, max_len, num_attention_heads, size_per_head]}
    decode_i: (Used during prediction) current location of decoding.
    training: Boolean indicating whether the call is training or inference.

  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size].

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.
    NotImplementedError: For unknown attention type.
  """
  with tf.compat.v1.variable_scope("attention"):
    with tf.compat.v1.variable_scope("self") as sc:
      self_attention_output = self.self_attn_layer(
          layer_input, layer_input, self_attention_mask,
          cache=cache, decode_i=decode_i, training=training, scope=sc)

    # Run a linear projection of `hidden_size` then add a residual
    # with `layer_input`.
    with tf.compat.v1.variable_scope("output"):
      self_attention_output = self.self_proj_layer(self_attention_output)
      self_attention_output = utils.dropout(self_attention_output,
                                            self.hidden_dropout_prob,
                                            training)
      self_attention_output = self.first_layer_norm(self_attention_output +
                                                    layer_input)

    with tf.compat.v1.variable_scope("encdec") as sc:
      attention_output = self.cross_attn_layer(
          self_attention_output, encoder_outputs, attention_mask,
          training=training, scope=sc)

    # Run a linear projection of `hidden_size` then add a residual
    # with `self_attention_output`.
    with tf.compat.v1.variable_scope("encdec_output"):
      attention_output = self.cross_proj_layer(attention_output)
      attention_output = utils.dropout(attention_output,
                                       self.hidden_dropout_prob,
                                       training)
      attention_output = self.second_layer_norm(attention_output +
                                                self_attention_output)

  # The activation is only applied to the "intermediate" hidden layer.
  with tf.compat.v1.variable_scope("intermediate"):
    intermediate_output = self.expand_layer(attention_output)

  # Down-project back to `hidden_size` then add the residual.
  with tf.compat.v1.variable_scope("output"):
    layer_output = self.contract_layer(intermediate_output)
    layer_output = utils.dropout(layer_output, self.hidden_dropout_prob,
                                 training)
    layer_output = self.third_layer_norm(layer_output + attention_output)

  return layer_output
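
# Hedged sketch, not a helper from this repo: one way to initialize the
# per-layer decoding cache in the {"k", "v"} layout documented above. Zeros
# stand in for "nothing attended yet"; the attention layer overwrites
# position `decode_i` at each step. The outer "layer_%d" keying and all
# parameter names are illustrative assumptions.
def _init_decode_cache(batch_size, max_decode_len, num_attention_heads,
                       size_per_head, num_layers):
  import tensorflow as tf
  shape = [batch_size, max_decode_len, num_attention_heads, size_per_head]
  return {
      "layer_%d" % i: {"k": tf.zeros(shape), "v": tf.zeros(shape)}
      for i in range(num_layers)
  }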
def operation(self,
              layer_input,
              attention_mask=None,
              band_mask=None,
              from_mask=None,
              to_mask=None,
              input_blocked_mask=None,
              training=None):
  """Implements an encoder layer of a transformer in Pegasus style.

  Args:
    layer_input: float Tensor of shape [batch_size, seq_length, hidden_size].
    attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
      seq_length]. The values should be 1 or 0. The attention scores will
      effectively be set to -infinity for any positions in the mask that
      are 0, and will be unchanged for positions that are 1.
    band_mask: (optional) int32 Tensor of shape [batch_size, 1,
      seq_length//block_size-4, block_size, 3*block_size]. The values should
      be 1 or 0. The attention scores will effectively be set to -infinity
      for any positions in the mask that are 0, and will be unchanged for
      positions that are 1.
    from_mask: (optional) int32 Tensor of shape [batch_size, 1, seq_length,
      1]. The values should be 1 or 0. The attention scores will effectively
      be set to -infinity for any positions in the mask that are 0, and will
      be unchanged for positions that are 1.
    to_mask: (optional) int32 Tensor of shape [batch_size, 1, 1, seq_length].
      The values should be 1 or 0. The attention scores will effectively be
      set to -infinity for any positions in the mask that are 0, and will be
      unchanged for positions that are 1.
    input_blocked_mask: (optional) int32 Tensor of shape [batch_size,
      seq_length//block_size, block_size]. Same as from/to_mask, just
      reshaped.
    training: Boolean indicating whether the call is training or inference.

  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size].

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.
    NotImplementedError: For unknown attention type.
  """
  with tf.compat.v1.variable_scope("attention"):
    with tf.compat.v1.variable_scope("self") as sc:
      # Pre-norm style: normalize before attention (`.operation` used
      # consistently, matching the other sub-layers below).
      normalized_layer_input = self.first_layer_norm.operation(layer_input)
      attention_output = self.attn_layer.operation(
          normalized_layer_input, normalized_layer_input, attention_mask,
          band_mask, from_mask, to_mask, input_blocked_mask,
          input_blocked_mask, training, scope=sc)

    # Run a linear projection of `hidden_size` then add a residual
    # with `layer_input`.
    with tf.compat.v1.variable_scope("output"):
      attention_output = self.projection_layer.operation(attention_output)
      attention_output = utils.dropout(attention_output,
                                       self.hidden_dropout_prob,
                                       training)
      attention_output = attention_output + layer_input

  # The activation is only applied to the "intermediate" hidden layer.
  with tf.compat.v1.variable_scope("intermediate"):
    normalized_attention_output = self.second_layer_norm.operation(
        attention_output)
    intermediate_output = self.expand_layer.operation(
        normalized_attention_output)

  # Down-project back to `hidden_size` then add the residual.
  with tf.compat.v1.variable_scope("output"):
    layer_output = self.contract_layer.operation(intermediate_output)
    layer_output = utils.dropout(layer_output, self.hidden_dropout_prob,
                                 training)
    layer_output = layer_output + attention_output

  return layer_output
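
# Hedged sketch mirroring the mask shapes documented above; this is not the
# repo's own mask-construction helper. Derives from/to/blocked masks, plus a
# sliding-window band mask, from a 0/1 input mask of shape
# [batch_size, seq_length]. Assumes `block_size` divides seq_length and
# there are at least five blocks.
def _make_sparse_masks(input_mask, block_size):
  import tensorflow as tf
  mask = tf.cast(input_mask, tf.float32)
  batch_size, seq_length = mask.shape
  num_blocks = seq_length // block_size
  from_mask = tf.reshape(mask, [batch_size, 1, seq_length, 1])
  to_mask = tf.reshape(mask, [batch_size, 1, 1, seq_length])
  blocked_mask = tf.reshape(mask, [batch_size, num_blocks, block_size])
  # Each interior query block i (the first/last two blocks are handled
  # separately as global attention) may see key blocks i-1, i, i+1:
  # concatenating three shifted views gives [batch, num_blocks-4,
  # 3*block_size], and the outer product with the query-side mask yields the
  # documented band mask shape
  # [batch, 1, num_blocks-4, block_size, 3*block_size].
  band_to = tf.concat([blocked_mask[:, 1:-3], blocked_mask[:, 2:-2],
                       blocked_mask[:, 3:-1]], axis=2)
  band_mask = tf.einsum("blq,blk->blqk", blocked_mask[:, 2:-2], band_to)
  band_mask = tf.expand_dims(band_mask, 1)
  return from_mask, to_mask, blocked_mask, band_mask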