def dense_layer_3d_proj(input_tensor,
                        hidden_size,
                        head_size,
                        initializer,
                        activation,
                        name=None):
  """A dense layer with 3D kernel for projection.

  Args:
    input_tensor: float Tensor of shape [batch, from_seq_length,
      num_attention_heads, size_per_head].
    hidden_size: The size of the hidden layer.
    head_size: The size of each attention head.
    initializer: Kernel initializer.
    activation: Activation function.
    name: The name scope of this layer.

  Returns:
    float logits Tensor.
  """
  input_shape = albert_utils_official.get_shape_list(input_tensor)
  num_attention_heads = input_shape[2]
  with tf.variable_scope(name):
    w = tf.get_variable(
        name="kernel",
        shape=[num_attention_heads * head_size, hidden_size],
        initializer=initializer)
    w = tf.reshape(w, [num_attention_heads, head_size, hidden_size])
    b = tf.get_variable(
        name="bias", shape=[hidden_size], initializer=tf.zeros_initializer)
    ret = tf.einsum("BFND,NDH->BFH", input_tensor, w)
    ret += b
    if activation is not None:
      return activation(ret)
    return ret
def dense_layer_2d(input_tensor,
                   output_size,
                   initializer,
                   activation,
                   num_attention_heads=1,
                   name=None):
  """A dense layer with 2D kernel.

  Args:
    input_tensor: Float tensor with rank 3.
    output_size: The size of the output dimension.
    initializer: Kernel initializer.
    activation: Activation function.
    num_attention_heads: Number of attention heads in the attention layer.
    name: The name scope of this layer.

  Returns:
    float logits Tensor.
  """
  del num_attention_heads  # unused
  input_shape = albert_utils_official.get_shape_list(input_tensor)
  hidden_size = input_shape[2]
  with tf.variable_scope(name):
    w = tf.get_variable(
        name="kernel",
        shape=[hidden_size, output_size],
        initializer=initializer)
    b = tf.get_variable(
        name="bias", shape=[output_size], initializer=tf.zeros_initializer)
    ret = tf.einsum("BFH,HO->BFO", input_tensor, w)
    ret += b
    if activation is not None:
      return activation(ret)
    return ret
def dot_product_attention(q, k, v, bias, dropout_rate=0.0):
  """Dot-product attention.

  Args:
    q: Tensor with shape [..., length_q, depth_k].
    k: Tensor with shape [..., length_kv, depth_k]. Leading dimensions must
      match those of q.
    v: Tensor with shape [..., length_kv, depth_v]. Leading dimensions must
      match those of q.
    bias: bias Tensor (see attention_bias()).
    dropout_rate: a float.

  Returns:
    Tensor with shape [..., length_q, depth_v].
  """
  logits = tf.matmul(q, k, transpose_b=True)  # [..., length_q, length_kv]
  logits = tf.multiply(
      logits,
      1.0 / math.sqrt(float(albert_utils_official.get_shape_list(q)[-1])))
  if bias is not None:
    # Build a mask that broadcasts across heads, e.g. [B, 1, F, T] for the
    # rank-4 case, from the incoming bias tensor.
    from_shape = albert_utils_official.get_shape_list(q)
    if len(from_shape) == 4:
      broadcast_ones = tf.ones([from_shape[0], 1, from_shape[2], 1],
                               tf.float32)
    elif len(from_shape) == 5:
      # from_shape = [B, N, block_num, block_size, depth]
      broadcast_ones = tf.ones(
          [from_shape[0], 1, from_shape[2], from_shape[3], 1], tf.float32)
    else:
      raise ValueError("`q` must have rank 4 or 5 when a bias is provided.")
    bias = tf.matmul(broadcast_ones, tf.cast(bias, tf.float32),
                     transpose_b=True)

    # Since the attention mask is 1.0 for positions we want to attend to and
    # 0.0 for masked positions, this operation creates a tensor which is 0.0
    # for positions we want to attend to and -10000.0 for masked positions.
    adder = (1.0 - bias) * -10000.0

    # Since we are adding it to the raw scores before the softmax, this is
    # effectively the same as removing the masked positions entirely.
    logits += adder

  attention_probs = tf.nn.softmax(logits, name="attention_probs")
  attention_probs = dropout(attention_probs, dropout_rate)
  return tf.matmul(attention_probs, v)
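# A minimal usage sketch for `dot_product_attention`, assuming TF1 graph mode
# like the rest of this module. The 4-D bias shape [B, 1, T, 1] mirrors what
# `attention_layer` below passes in; all sizes here are illustrative only.
def _dot_product_attention_example():
  batch, heads, seq_len, head_size = 2, 4, 8, 16
  q = tf.random_normal([batch, heads, seq_len, head_size])
  k = tf.random_normal([batch, heads, seq_len, head_size])
  v = tf.random_normal([batch, heads, seq_len, head_size])
  # Attend to the first five positions of each sequence; mask the rest.
  mask = tf.concat(
      [tf.ones([batch, 5]), tf.zeros([batch, seq_len - 5])], axis=-1)
  mask = tf.reshape(mask, [batch, 1, seq_len, 1])
  # Output shape: [batch, heads, seq_len, head_size].
  return dot_product_attention(q, k, v, mask, dropout_rate=0.0)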
def build_encoder(self, input_ids, input_mask, hidden_dropout_prob,
                  attention_probs_dropout_prob, **kargs):
  reuse = kargs["reuse"]
  input_shape = albert_utils_official.get_shape_list(
      input_ids, expected_rank=[2, 3])
  batch_size = input_shape[0]
  seq_length = input_shape[1]

  if input_mask is None:
    input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

  with tf.variable_scope(self.config.get("scope", "bert"), reuse=reuse):
    with tf.variable_scope("encoder"):
      # The 2D mask of shape [batch_size, seq_length] is passed through
      # unchanged; it is broadcast against the attention scores inside the
      # attention layer.
      # Run the stacked transformer.
      # `sequence_output` shape = [batch_size, seq_length, hidden_size].
      if kargs.get("attention_type",
                   "efficient_attention") == "normal_attention":
        tf.logging.info("****** normal attention *******")
        transformer_model = albert_modules_official.transformer_model
      else:
        tf.logging.info("****** requested attention type not available, "
                        "falling back to normal attention *******")
        transformer_model = albert_modules_official.transformer_model
      [self.all_encoder_layers,
       self.all_attention_scores] = transformer_model(
           input_tensor=self.embedding_output,
           attention_mask=input_mask,
           hidden_size=self.config.hidden_size,
           num_hidden_layers=self.config.num_hidden_layers,
           num_hidden_groups=self.config.num_hidden_groups,
           num_attention_heads=self.config.num_attention_heads,
           intermediate_size=self.config.intermediate_size,
           inner_group_num=self.config.inner_group_num,
           intermediate_act_fn=albert_modules_official.get_activation(
               self.config.hidden_act),
           hidden_dropout_prob=self.config.hidden_dropout_prob,
           attention_probs_dropout_prob=self.config
           .attention_probs_dropout_prob,
           initializer_range=self.config.initializer_range,
           do_return_all_layers=True)
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
  """Looks up word embeddings for an id tensor.

  Args:
    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
      ids.
    vocab_size: int. Size of the embedding vocabulary.
    embedding_size: int. Width of the word embeddings.
    initializer_range: float. Embedding initialization range.
    word_embedding_name: string. Name of the embedding table.
    use_one_hot_embeddings: bool. If True, use the one-hot method for word
      embeddings. If False, use `tf.nn.embedding_lookup()`.

  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size], and the
    embedding table variable.
  """
  # This function assumes that the input is of shape [batch_size, seq_length,
  # num_inputs].
  #
  # If the input is a 2D tensor of shape [batch_size, seq_length], we
  # reshape it to [batch_size, seq_length, 1].
  if input_ids.shape.ndims == 2:
    input_ids = tf.expand_dims(input_ids, axis=[-1])

  embedding_table = tf.get_variable(
      name=word_embedding_name,
      shape=[vocab_size, embedding_size],
      initializer=create_initializer(initializer_range))

  if use_one_hot_embeddings:
    flat_input_ids = tf.reshape(input_ids, [-1])
    one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
    output = tf.matmul(one_hot_input_ids, embedding_table)
  else:
    output = tf.nn.embedding_lookup(embedding_table, input_ids)

  input_shape = albert_utils_official.get_shape_list(input_ids)

  output = tf.reshape(output,
                      input_shape[0:-1] + [input_shape[-1] * embedding_size])
  return (output, embedding_table)
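# A minimal usage sketch for `embedding_lookup`, assuming TF1 graph mode. The
# variable scope name and toy sizes are illustrative only.
def _embedding_lookup_example():
  input_ids = tf.constant([[1, 2, 3, 0], [4, 5, 0, 0]], dtype=tf.int32)
  with tf.variable_scope("example_embeddings"):
    embeddings, table = embedding_lookup(
        input_ids=input_ids,
        vocab_size=100,
        embedding_size=16,
        use_one_hot_embeddings=False)
  # `embeddings` has shape [2, 4, 16]; `table` has shape [100, 16].
  return embeddings, table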
def gumbel_embedding_lookup(input_ids,
                            vocab_size,
                            embedding_size=128,
                            initializer_range=0.02,
                            word_embedding_name="word_embeddings",
                            use_one_hot_embeddings=False):
  """Looks up word embeddings for a (soft) one-hot id tensor.

  Args:
    input_ids: float Tensor of shape [batch_size, seq_length, vocab_size]
      containing a distribution over the vocabulary for each position (e.g.,
      Gumbel-Softmax samples).
    vocab_size: int. Size of the embedding vocabulary.
    embedding_size: int. Width of the word embeddings.
    initializer_range: float. Embedding initialization range.
    word_embedding_name: string. Name of the embedding table.
    use_one_hot_embeddings: bool. Unused; kept for API compatibility with
      `embedding_lookup`. One hot is better for TPUs.

  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size], and the
    embedding table variable.
  """
  del use_one_hot_embeddings  # unused
  # The input must be rank 3: a distribution over the vocabulary per position.
  albert_utils_official.get_shape_list(input_ids, expected_rank=[3])

  embedding_table = tf.get_variable(
      name=word_embedding_name,
      shape=[vocab_size, embedding_size],
      initializer=create_initializer(initializer_range))

  # Soft lookup: a weighted average of embedding rows per position.
  output = tf.einsum("abc,cd->abd", tf.cast(input_ids, tf.float32),
                     embedding_table)
  return (output, embedding_table)
def dense_layer_3d(input_tensor,
                   num_attention_heads,
                   head_size,
                   initializer,
                   activation,
                   name=None):
  """A dense layer with 3D kernel.

  Args:
    input_tensor: float Tensor of shape [batch, seq_length, hidden_size].
    num_attention_heads: Number of attention heads.
    head_size: The size per attention head.
    initializer: Kernel initializer.
    activation: Activation function.
    name: The name scope of this layer.

  Returns:
    float logits Tensor.
  """
  input_shape = albert_utils_official.get_shape_list(input_tensor)
  hidden_size = input_shape[2]
  with tf.variable_scope(name):
    w = tf.get_variable(
        name="kernel",
        shape=[hidden_size, num_attention_heads * head_size],
        initializer=initializer)
    w = tf.reshape(w, [hidden_size, num_attention_heads, head_size])
    b = tf.get_variable(
        name="bias",
        shape=[num_attention_heads * head_size],
        initializer=tf.zeros_initializer)
    b = tf.reshape(b, [num_attention_heads, head_size])
    ret = tf.einsum("BFH,HND->BFND", input_tensor, w)
    ret += b
    if activation is not None:
      return activation(ret)
    return ret
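# A minimal shape round-trip sketch: `dense_layer_3d` splits the hidden axis
# into heads ([B, F, H] -> [B, F, N, H/N]) and `dense_layer_3d_proj` merges it
# back ([B, F, N, H/N] -> [B, F, H]). Assumes TF1 graph mode; the scope names
# and toy sizes are illustrative only.
def _dense_layer_3d_round_trip_example():
  batch, seq_len, hidden, heads = 2, 8, 64, 4
  x = tf.random_normal([batch, seq_len, hidden])
  init = create_initializer(0.02)
  # [2, 8, 64] -> [2, 8, 4, 16]
  split = dense_layer_3d(x, heads, hidden // heads, init, None, name="split")
  # [2, 8, 4, 16] -> [2, 8, 64]
  merged = dense_layer_3d_proj(
      split, hidden, hidden // heads, init, None, name="merge")
  return merged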
def build_embedder(self, input_ids, token_type_ids, hidden_dropout_prob,
                   attention_probs_dropout_prob, **kargs):
  reuse = kargs["reuse"]
  input_shape = albert_utils_official.get_shape_list(
      input_ids, expected_rank=[2, 3])
  batch_size = input_shape[0]
  seq_length = input_shape[1]

  if token_type_ids is None:
    token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

  if self.config.get("embedding_scope", None):
    embedding_scope = self.config["embedding_scope"]
    other_embedding_scope = self.config.get("scope", "bert")
    tf.logging.info(
        "==using embedding scope of original model_config.embedding_scope: %s==",
        embedding_scope)
  else:
    embedding_scope = self.config.get("scope", "bert")
    other_embedding_scope = self.config.get("scope", "bert")
    tf.logging.info(
        "==using embedding scope of original model_config.scope: %s==",
        embedding_scope)

  with tf.variable_scope(embedding_scope, reuse=reuse):
    with tf.variable_scope("embeddings"):
      # Perform embedding lookup on the word ids.
      (self.embedding_output_word,
       self.embedding_table) = albert_modules_official.embedding_lookup(
           input_ids=input_ids,
           vocab_size=self.config.vocab_size,
           embedding_size=self.config.embedding_size,
           initializer_range=self.config.initializer_range,
           word_embedding_name="word_embeddings",
           use_one_hot_embeddings=self.config.use_one_hot_embeddings)
      if kargs.get("perturbation", None) is not None:
        self.embedding_output_word += kargs["perturbation"]
        tf.logging.info(" add word perturbation for robust learning ")

  with tf.variable_scope(other_embedding_scope, reuse=reuse):
    with tf.variable_scope("embeddings"):
      # Add positional embeddings and token type embeddings, then layer
      # normalize and perform dropout.
      self.embedding_output = albert_modules_official.embedding_postprocessor(
          input_tensor=self.embedding_output_word,
          use_token_type=True,
          token_type_ids=token_type_ids,
          token_type_vocab_size=self.config.type_vocab_size,
          token_type_embedding_name="token_type_embeddings",
          use_position_embeddings=True,
          position_embedding_name="position_embeddings",
          initializer_range=self.config.initializer_range,
          max_position_embeddings=self.config.max_position_embeddings,
          dropout_prob=hidden_dropout_prob,
          token_type_ratio=self.config.get("token_type_ratio", 1.0))
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_hidden_groups=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      inner_group_num=1,
                      intermediate_act_fn="gelu",
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
  """Multi-headed, multi-layer Transformer from "Attention is All You Need".

  This is almost an exact implementation of the original Transformer encoder.

  See the original paper:
  https://arxiv.org/abs/1706.03762

  Also see:
  https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
    attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
      seq_length], with 1 for positions that can be attended to and 0 in
      positions that should not be.
    hidden_size: int. Hidden size of the Transformer.
    num_hidden_layers: int. Number of layers (blocks) in the Transformer.
    num_hidden_groups: int. Number of groups for the hidden layers; parameters
      in the same group are shared.
    num_attention_heads: int. Number of attention heads in the Transformer.
    intermediate_size: int. The size of the "intermediate" (a.k.a., feed
      forward) layer.
    inner_group_num: int. Number of inner repetitions of attention and FFN.
    intermediate_act_fn: function. The non-linear activation function to apply
      to the output of the intermediate/feed-forward layer.
    hidden_dropout_prob: float. Dropout probability for the hidden layers.
    attention_probs_dropout_prob: float. Dropout probability of the attention
      probabilities.
    initializer_range: float. Range of the initializer (stddev of truncated
      normal).
    do_return_all_layers: Whether to also return all layers or just the final
      layer.

  Returns:
    If `do_return_all_layers` is True, a pair (all layer outputs, all
    attention scores); otherwise, the final hidden layer of shape
    [batch_size, seq_length, hidden_size] and its attention scores.

  Raises:
    ValueError: A Tensor shape or parameter is invalid.
  """
  if hidden_size % num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, num_attention_heads))

  attention_head_size = hidden_size // num_attention_heads
  input_shape = albert_utils_official.get_shape_list(
      input_tensor, expected_rank=3)
  input_width = input_shape[2]

  all_layer_outputs = []
  all_attention_scores = []
  if input_width != hidden_size:
    # Project the embeddings into the hidden space when the factorized
    # embedding size differs from the hidden size.
    prev_output = dense_layer_2d(
        input_tensor,
        hidden_size,
        create_initializer(initializer_range),
        None,
        name="embedding_hidden_mapping_in")
  else:
    prev_output = input_tensor
  with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE):
    for layer_idx in range(num_hidden_layers):
      # Layers in the same group share parameters via variable scoping.
      group_idx = int(layer_idx / num_hidden_layers * num_hidden_groups)
      with tf.variable_scope("group_%d" % group_idx):
        with tf.name_scope("layer_%d" % layer_idx):
          layer_output = prev_output
          for inner_group_idx in range(inner_group_num):
            with tf.variable_scope("inner_group_%d" % inner_group_idx):
              [layer_output, attention_scores] = attention_ffn_block(
                  layer_output, hidden_size, attention_mask,
                  num_attention_heads, attention_head_size,
                  attention_probs_dropout_prob, intermediate_size,
                  intermediate_act_fn, initializer_range, hidden_dropout_prob)
              prev_output = layer_output
              all_layer_outputs.append(layer_output)
              all_attention_scores.append(attention_scores)
  if do_return_all_layers:
    return all_layer_outputs, all_attention_scores
  return all_layer_outputs[-1], all_attention_scores[-1]
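# A minimal end-to-end sketch of the encoder stack, assuming TF1 graph mode.
# The toy sizes are illustrative; `attention_ffn_block` and `get_activation`
# come from elsewhere in this module.
def _transformer_model_example():
  batch, seq_len, embed = 2, 8, 32
  embeddings = tf.random_normal([batch, seq_len, embed])
  with tf.variable_scope("example_transformer"):
    all_layers, all_scores = transformer_model(
        input_tensor=embeddings,
        attention_mask=tf.ones([batch, seq_len], tf.int32),
        hidden_size=64,  # != embed, so the factorized projection kicks in
        num_hidden_layers=2,
        num_hidden_groups=1,  # both layers share one group's parameters
        num_attention_heads=4,
        intermediate_size=128,
        intermediate_act_fn=get_activation("gelu"),
        do_return_all_layers=True)
  # `all_layers[-1]` has shape [2, 8, 64].
  return all_layers, all_scores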
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
  """Performs multi-headed attention from `from_tensor` to `to_tensor`.

  Args:
    from_tensor: float Tensor of shape [batch_size, from_seq_length,
      from_width].
    to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
    attention_mask: (optional) int32 Tensor of shape [batch_size,
      to_seq_length]. The values should be 1 or 0. The attention scores will
      effectively be set to -infinity for any positions in the mask that are
      0, and will be unchanged for positions that are 1.
    num_attention_heads: int. Number of attention heads.
    query_act: (optional) Activation function for the query transform.
    key_act: (optional) Activation function for the key transform.
    value_act: (optional) Activation function for the value transform.
    attention_probs_dropout_prob: (optional) float. Dropout probability of the
      attention probabilities.
    initializer_range: float. Range of the weight initializer.
    batch_size: (Optional) int. If the input is 2D, this might be the batch
      size of the 3D version of the `from_tensor` and `to_tensor`.
    from_seq_length: (Optional) If the input is 2D, this might be the seq
      length of the 3D version of the `from_tensor`.
    to_seq_length: (Optional) If the input is 2D, this might be the seq length
      of the 3D version of the `to_tensor`.

  Returns:
    float Tensor of shape [batch_size, from_seq_length, num_attention_heads,
    size_per_head].

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.
  """
  from_shape = albert_utils_official.get_shape_list(
      from_tensor, expected_rank=[2, 3])
  to_shape = albert_utils_official.get_shape_list(
      to_tensor, expected_rank=[2, 3])

  if len(from_shape) != len(to_shape):
    raise ValueError(
        "The rank of `from_tensor` must match the rank of `to_tensor`.")

  if len(from_shape) == 3:
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]
    to_seq_length = to_shape[1]
  elif len(from_shape) == 2:
    if (batch_size is None or from_seq_length is None or
        to_seq_length is None):
      raise ValueError(
          "When passing in rank 2 tensors to attention_layer, the values "
          "for `batch_size`, `from_seq_length`, and `to_seq_length` "
          "must all be specified.")

  size_per_head = int(from_shape[-1] / num_attention_heads)

  # Scalar dimensions referenced here:
  #   B = batch size (number of sequences)
  #   F = `from_tensor` sequence length
  #   T = `to_tensor` sequence length
  #   N = `num_attention_heads`
  #   H = `size_per_head`

  # `query_layer` = [B, F, N, H]
  q = dense_layer_3d(from_tensor, num_attention_heads, size_per_head,
                     create_initializer(initializer_range), query_act, "query")

  # `key_layer` = [B, T, N, H]
  k = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                     create_initializer(initializer_range), key_act, "key")

  # `value_layer` = [B, T, N, H]
  v = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                     create_initializer(initializer_range), value_act, "value")
  q = tf.transpose(q, [0, 2, 1, 3])
  k = tf.transpose(k, [0, 2, 1, 3])
  v = tf.transpose(v, [0, 2, 1, 3])
  if attention_mask is not None:
    attention_mask = tf.reshape(attention_mask,
                                [batch_size, 1, to_seq_length, 1])
  # `new_embeddings` = [B, N, F, H]
  new_embeddings = dot_product_attention(q, k, v, attention_mask,
                                         attention_probs_dropout_prob)

  return tf.transpose(new_embeddings, [0, 2, 1, 3])
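# A minimal usage sketch for `attention_layer`, assuming TF1 graph mode. The
# 2-D mask of shape [batch, to_seq_length] matches the reshape above; sizes
# are illustrative only.
def _attention_layer_example():
  batch, seq_len, width = 2, 8, 64
  x = tf.random_normal([batch, seq_len, width])
  with tf.variable_scope("example_attention"):
    # Self-attention: `from_tensor` and `to_tensor` are the same.
    context = attention_layer(
        from_tensor=x,
        to_tensor=x,
        attention_mask=tf.ones([batch, seq_len], tf.int32),
        num_attention_heads=4)
  # `context` has shape [batch, seq_len, 4, 16].
  return context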
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1,
                            token_type_ratio=1.0):
  """Performs various post-processing on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length,
      embedding_size].
    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Must be specified if `use_token_type` is True.
    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
    token_type_embedding_name: string. The name of the embedding table
      variable for token type ids.
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output
      tensor.
    token_type_ratio: float. Scaling factor applied to the token type
      embeddings before they are added to the output.

  Returns:
    float tensor with same shape as `input_tensor`.

  Raises:
    ValueError: One of the tensor shapes or input values is invalid.
  """
  input_shape = albert_utils_official.get_shape_list(
      input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor

  if use_token_type:
    if token_type_ids is None:
      raise ValueError("`token_type_ids` must be specified if "
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))
    # This vocab will be small so we always do one-hot here, since it is
    # always faster for a small vocabulary.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_ratio * token_type_embeddings

  if use_position_embeddings:
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
    with tf.control_dependencies([assert_op]):
      full_position_embeddings = tf.get_variable(
          name=position_embedding_name,
          shape=[max_position_embeddings, width],
          initializer=create_initializer(initializer_range))
      # Since the position embedding table is a learned variable, we create it
      # using a (long) sequence length `max_position_embeddings`. The actual
      # sequence length might be shorter than this, for faster training of
      # tasks that do not have long sequences.
      #
      # So `full_position_embeddings` is effectively an embedding table for
      # positions [0, 1, 2, ..., max_position_embeddings-1], and the current
      # sequence has positions [0, 1, 2, ..., seq_length-1]. A slice would
      # suffice here, but the one-hot matmul below is better for TPUs:
      # position_embeddings = tf.slice(full_position_embeddings, [0, 0],
      #                                [seq_length, -1])
      flat_pos_ids = tf.range(seq_length, dtype=tf.int32)
      one_hot_pos_ids = tf.one_hot(flat_pos_ids, depth=max_position_embeddings)
      position_embeddings = tf.matmul(one_hot_pos_ids,
                                      full_position_embeddings)
      num_dims = len(output.shape.as_list())

      # Only the last two dimensions are relevant (`seq_length` and `width`),
      # so we broadcast among the first dimensions, which is typically just
      # the batch size.
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output
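# A minimal sketch tying the embedding pipeline together, assuming TF1 graph
# mode: token ids -> word embeddings -> token type + position embeddings ->
# layer norm and dropout. Scope names and toy sizes are illustrative only.
def _embedding_pipeline_example():
  input_ids = tf.constant([[1, 2, 3, 0], [4, 5, 0, 0]], dtype=tf.int32)
  token_type_ids = tf.zeros_like(input_ids)
  with tf.variable_scope("example_embedding_pipeline"):
    word_embeddings, _ = embedding_lookup(
        input_ids=input_ids, vocab_size=100, embedding_size=16)
    output = embedding_postprocessor(
        input_tensor=word_embeddings,
        use_token_type=True,
        token_type_ids=token_type_ids,
        token_type_vocab_size=2,
        max_position_embeddings=32,
        dropout_prob=0.0)
  # `output` has shape [2, 4, 16], same as `word_embeddings`.
  return output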