def call(self, inputs, return_all_layers=True, **kwargs):
  """Runs the stack of shared transformer layers over packed inputs.

  Args:
    inputs: packed inputs (input tensor, attention mask).
    return_all_layers: bool, whether to return outputs of all layers inside
      encoders.

  Returns:
    A pair (outputs, attention scores) of the last layer, or a pair of lists
    with one entry per layer when `return_all_layers` is True.
  """
  unpacked = tf_utils.unpack_inputs(inputs)
  hidden = unpacked[0]
  attention_mask = unpacked[1]

  per_layer_outputs = []
  per_layer_attentions = []
  for idx in range(self.num_hidden_layers):
    hidden, scores = self.shared_layers[idx](hidden, attention_mask, **kwargs)
    per_layer_outputs.append(hidden)
    per_layer_attentions.append(scores)

  if return_all_layers:
    return per_layer_outputs, per_layer_attentions
  return per_layer_outputs[-1], per_layer_attentions[-1]
def call(self, inputs):
  """Computes the combined masked-LM and next-sentence pretraining loss.

  The two per-example losses are reduced to scalars, summed, and then
  broadcast back to batch shape via `tf.fill` (a workaround noted in the
  TODO below).
  """
  K = tf.keras.backend
  unpacked = tf_utils.unpack_inputs(inputs)
  lm_output = unpacked[0]
  sentence_output = unpacked[1]

  # Masked-LM loss: weighted negative log-likelihood over masked positions.
  # NOTE(review): assumes lm_output holds log-probabilities — confirm against
  # the prediction head.
  lm_label_ids = K.reshape(unpacked[2], [-1])
  lm_label_ids_one_hot = K.one_hot(lm_label_ids, self.config.vocab_size)
  lm_label_weights = K.reshape(K.cast(unpacked[3], tf.float32), [-1])
  lm_per_example_loss = -K.sum(lm_output * lm_label_ids_one_hot, axis=[-1])
  # 1e-5 guards against division by zero when all weights are zero.
  mask_label_loss = (K.sum(lm_label_weights * lm_per_example_loss) /
                     (K.sum(lm_label_weights) + 1e-5))

  # Next-sentence loss: mean NLL over the two-class output.
  sentence_labels = K.reshape(unpacked[4], [-1])
  sentence_label_one_hot = K.one_hot(sentence_labels, 2)
  per_example_loss_sentence = -K.sum(
      sentence_label_one_hot * sentence_output, axis=-1)
  sentence_loss = K.mean(per_example_loss_sentence)

  loss = mask_label_loss + sentence_loss
  # TODO(hongkuny): Avoids the hack and switches add_loss.
  final_loss = tf.fill(K.shape(per_example_loss_sentence), loss)
  self._add_metrics(lm_output, lm_label_ids, lm_label_weights,
                    lm_per_example_loss, sentence_output, sentence_labels,
                    per_example_loss_sentence)
  return final_loss
def call(self, inputs, **kwargs):
  """Runs one transformer layer: self-attention then the feed-forward block.

  Returns:
    A pair (layer output, attention scores).
  """
  (input_tensor, attention_mask) = tf_utils.unpack_inputs(inputs)
  training = kwargs.get('training', False)

  attention_output, attention_score = self.attention_layer(
      from_tensor=input_tensor,
      to_tensor=input_tensor,
      attention_mask=attention_mask,
      **kwargs)
  attention_output = self.attention_output_dense(attention_output)
  attention_output = self.attention_dropout(attention_output,
                                            training=training)
  # Use float32 in keras layer norm and the gelu activation in the
  # intermediate dense layer for numeric stability; cast back to fp16 after
  # each fp32 sub-layer when running in mixed precision.
  attention_output = self.attention_layer_norm(input_tensor +
                                               attention_output)
  if self.float_type == tf.float16:
    attention_output = tf.cast(attention_output, tf.float16)

  intermediate_output = self.intermediate_dense(attention_output)
  if self.float_type == tf.float16:
    intermediate_output = tf.cast(intermediate_output, tf.float16)

  layer_output = self.output_dense(intermediate_output)
  layer_output = self.output_dropout(layer_output, training=training)
  # Use float32 in keras layer norm for numeric stability
  layer_output = self.output_layer_norm(layer_output + attention_output)
  if self.float_type == tf.float16:
    layer_output = tf.cast(layer_output, tf.float16)
  return layer_output, attention_score
def call(self, inputs, mode="bert", **kwargs):
  """Implements call() for the layer.

  Args:
    inputs: packed input tensors (input_word_ids, input_mask, input_type_ids).
    mode: string, unused. Retained for backward compatibility with callers
      passing `bert`/`encoder`; the `encoder` short-circuit documented in an
      earlier revision was dead (commented-out) code and has been removed —
      every call now returns the full tuple below.

  Returns:
    A tuple (pooled_output, sequence_output, attention_scores,
    embedding_tensor). `sequence_output` and `attention_scores` come from the
    encoder with `return_all_layers=True` (one entry per layer), and
    `pooled_output` is the transformed first-token representation of the last
    layer.
  """
  del mode  # Unused; see docstring.
  unpacked_inputs = tf_utils.unpack_inputs(inputs)
  input_word_ids = unpacked_inputs[0]
  input_mask = unpacked_inputs[1]
  input_type_ids = unpacked_inputs[2]

  word_embeddings = self.embedding_lookup(input_word_ids)
  embedding_tensor = self.embedding_postprocessor(
      word_embeddings=word_embeddings, token_type_ids=input_type_ids)
  # Embeddings are computed in fp32; cast down once before the encoder when
  # running in mixed precision.
  if self.float_type == tf.float16:
    embedding_tensor = tf.cast(embedding_tensor, tf.float16)

  attention_mask = None
  if input_mask is not None:
    attention_mask = create_attention_mask_from_input_mask(
        input_word_ids, input_mask)

  sequence_output, attention_scores = self.encoder(
      embedding_tensor, attention_mask, return_all_layers=True)
  # Pool on the first token of the last layer's output.
  first_token_tensor = tf.squeeze(sequence_output[-1][:, 0:1, :], axis=1)
  pooled_output = self.pooler_transform(first_token_tensor)
  return (pooled_output, sequence_output, attention_scores, embedding_tensor)
def call(self, inputs, **kwargs):
  """Adds token-type and position embeddings, then layer-norm and dropout."""
  unpacked = tf_utils.unpack_inputs(inputs)
  word_embeddings = unpacked[0]
  token_type_ids = unpacked[1]
  batch_size, seq_length, width = tf_utils.get_shape_list(
      word_embeddings, expected_rank=3)

  output = word_embeddings
  if self.use_type_embeddings:
    # Token-type lookup via one-hot matmul rather than gather.
    flat_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_ids,
                             depth=self.token_type_vocab_size,
                             dtype=self.dtype)
    type_embeds = tf.reshape(tf.matmul(one_hot_ids, self.type_embeddings),
                             [batch_size, seq_length, width])
    output += type_embeds

  if self.use_position_embeddings:
    # Slice the position table to the actual sequence length; the leading
    # expand_dims broadcasts it over the batch dimension.
    output += tf.expand_dims(
        tf.slice(self.position_embeddings, [0, 0], [seq_length, width]),
        axis=0)

  output = self.output_layer_norm(output)
  return self.output_dropout(output, training=kwargs.get('training', False))
def call(self, inputs):
  """Computes the pretraining loss using the shared `losses` helpers.

  The masked-LM and next-sentence losses are combined into one scalar and
  then broadcast to batch shape (see TODO below).
  """
  unpacked = tf_utils.unpack_inputs(inputs)
  lm_output = unpacked[0]
  sentence_output = unpacked[1]
  lm_label_ids = unpacked[2]
  lm_label_weights = tf.keras.backend.cast(unpacked[3], tf.float32)
  sentence_labels = unpacked[4]

  mask_label_loss = losses.weighted_sparse_categorical_crossentropy_loss(
      labels=lm_label_ids, predictions=lm_output, weights=lm_label_weights)
  sentence_loss = losses.weighted_sparse_categorical_crossentropy_loss(
      labels=sentence_labels, predictions=sentence_output)
  total_loss = mask_label_loss + sentence_loss

  # TODO(hongkuny): Avoids the hack and switches add_loss.
  batch_shape = tf.slice(tf.keras.backend.shape(sentence_labels), [0], [1])
  final_loss = tf.fill(batch_shape, total_loss)
  self._add_metrics(lm_output, lm_label_ids, lm_label_weights,
                    mask_label_loss, sentence_output, sentence_labels,
                    sentence_loss)
  return final_loss
def call(self, inputs, **kwargs):
  """Adds token-type and position embeddings, then layer-norm and dropout.

  Args:
    inputs: packed (word_embeddings, token_type_ids).
    **kwargs: accepts `training` and forwards it to the dropout layer, so
      callers can control dropout explicitly. Previously the flag was
      silently discarded (the signature took no kwargs) and dropout fell
      back to the Keras learning phase; the default of None preserves that
      behavior when no flag is supplied.

  Returns:
    Float tensor of shape [batch_size, seq_length, width].
  """
  unpacked_inputs = tf_utils.unpack_inputs(inputs)
  word_embeddings = unpacked_inputs[0]
  token_type_ids = unpacked_inputs[1]
  input_shape = tf_utils.get_shape_list(word_embeddings, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = word_embeddings
  if self.use_type_embeddings:
    # Direct gather lookup (contrast with the one-hot-matmul variant used by
    # the sibling postprocessor).
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    token_type_embeddings = tf.gather(self.type_embeddings,
                                      flat_token_type_ids)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings

  if self.use_position_embeddings:
    # Slice the position table to seq_length and broadcast over the batch.
    position_embeddings = tf.expand_dims(
        tf.slice(self.position_embeddings, [0, 0], [seq_length, width]),
        axis=0)
    output += position_embeddings

  output = self.output_layer_norm(output)
  output = self.output_dropout(output, training=kwargs.get('training'))
  return output
def call(self, inputs, **kwargs):
  """Multi-head scaled dot-product attention.

  Dimension glossary: B = batch size, F = `from_tensor` sequence length,
  T = `to_tensor` sequence length, N = `num_attention_heads`,
  H = `size_per_head`.

  Returns:
    A pair (context tensor [B, F, N, H], attention scores flattened to
    [B*N, F, T]).
  """
  (from_tensor, to_tensor, attention_mask) = tf_utils.unpack_inputs(inputs)

  query = self.query_dense(from_tensor)  # [B, F, N, H]
  key = self.key_dense(to_tensor)        # [B, T, N, H]
  value = self.value_dense(to_tensor)    # [B, T, N, H]

  # Raw attention scores (query . key), scaled by 1/sqrt(H).  [B, N, F, T]
  attention_scores = tf.einsum("BTNH,BFNH->BNFT", key, query)
  attention_scores = tf.multiply(
      attention_scores, 1.0 / math.sqrt(float(self.size_per_head)))

  if attention_mask is not None:
    # `attention_mask` = [B, 1, F, T]. The mask is 1.0 for positions to
    # attend and 0.0 for masked ones; adding -10000.0 to masked slots before
    # the softmax effectively removes them.
    attention_mask = tf.expand_dims(attention_mask, axis=[1])
    adder = (1.0 -
             tf.cast(attention_mask, attention_scores.dtype)) * -10000.0
    attention_scores += adder

  # `attention_probs` = [B, N, F, T]
  attention_probs = tf.nn.softmax(attention_scores)

  # The returned (pre-softmax) scores are reshaped to [B*N, F, T].
  score_shape = attention_scores.shape
  attention_scores = tf.reshape(attention_scores,
                                [-1, score_shape[2], score_shape[3]])

  # This is actually dropping out entire tokens to attend to, which might
  # seem a bit unusual, but is taken from the original Transformer paper.
  attention_probs = self.attention_probs_dropout(
      attention_probs, training=kwargs.get('training', False))

  # `context_tensor` = [B, F, N, H]
  context_tensor = tf.einsum("BNFT,BTNH->BFNH", attention_probs, value)
  return context_tensor, attention_scores
def call(self, inputs):
  """Computes the combined masked-LM and next-sentence pretraining loss.

  The two losses are reduced to a single scalar and broadcast back to batch
  shape via `tf.fill` (see TODO below). A stale, fully commented-out
  alternative implementation (using `losses.loss` helpers and a debug print)
  has been removed from this body.
  """
  unpacked_inputs = tf_utils.unpack_inputs(inputs)
  lm_output = unpacked_inputs[0]        # masked-LM output; assumed to be
                                        # log-probabilities — TODO confirm.
  sentence_output = unpacked_inputs[1]  # two-class next-sentence output.

  # Masked-LM loss: weighted negative log-likelihood over masked positions.
  lm_label_ids = tf.keras.backend.reshape(unpacked_inputs[2], [-1])
  lm_label_ids_one_hot = tf.keras.backend.one_hot(lm_label_ids,
                                                  self.config.vocab_size)
  lm_label_weights = tf.keras.backend.cast(unpacked_inputs[3], tf.float32)
  lm_label_weights = tf.keras.backend.reshape(lm_label_weights, [-1])
  lm_per_example_loss = -tf.keras.backend.sum(
      lm_output * lm_label_ids_one_hot, axis=[-1])
  numerator = tf.keras.backend.sum(lm_label_weights * lm_per_example_loss)
  # 1e-5 guards against division by zero when every weight is zero.
  denominator = tf.keras.backend.sum(lm_label_weights) + 1e-5
  mask_label_loss = numerator / denominator

  # Next-sentence loss: mean NLL over the two classes.
  sentence_labels = tf.keras.backend.reshape(unpacked_inputs[4], [-1])
  sentence_label_one_hot = tf.keras.backend.one_hot(sentence_labels, 2)
  per_example_loss_sentence = -tf.keras.backend.sum(
      sentence_label_one_hot * sentence_output, axis=-1)
  sentence_loss = tf.keras.backend.mean(per_example_loss_sentence)

  loss = mask_label_loss + sentence_loss
  # TODO(hongkuny): Avoids the hack and switches add_loss.
  final_loss = tf.fill(tf.keras.backend.shape(per_example_loss_sentence),
                       loss)
  self._add_metrics(lm_output, lm_label_ids, lm_label_weights,
                    lm_per_example_loss, sentence_output, sentence_labels,
                    per_example_loss_sentence)
  return final_loss
def call(self, inputs):
  """Produces masked-LM and next-sentence log-probabilities.

  Returns:
    A tuple (lm_output, sentence_output, logits) where the first two are
    log-softmax outputs and `logits` are the raw next-sentence logits.
  """
  unpacked = tf_utils.unpack_inputs(inputs)
  pooled_output = unpacked[0]
  sequence_output = unpacked[1]
  masked_lm_positions = unpacked[2]

  # Masked-LM head: gather the masked positions, project and normalize, then
  # decode by matmul against the (transposed) embedding table.
  masked_input = tf_utils.gather_indexes(sequence_output,
                                         masked_lm_positions)
  lm_hidden = self.lm_layer_norm(self.lm_dense(masked_input))
  lm_logits = tf.nn.bias_add(
      tf.matmul(lm_hidden, self.embedding_table, transpose_b=True),
      self.output_bias)
  lm_output = tf.nn.log_softmax(lm_logits, axis=-1)

  # Next-sentence head: linear projection of the pooled representation.
  logits = tf.nn.bias_add(
      tf.matmul(pooled_output, self.next_seq_weights, transpose_b=True),
      self.next_seq_bias)
  sentence_output = tf.nn.log_softmax(logits, axis=-1)
  return (lm_output, sentence_output, logits)
def call(self, inputs, return_all_layers=False, **kwargs):
  """Runs the transformer layer stack over packed inputs.

  Args:
    inputs: packed inputs.
    return_all_layers: bool, whether to return outputs of all layers inside
      encoders.

  Returns:
    Output tensor of the last layer or a list of output tensors.
  """
  unpacked = tf_utils.unpack_inputs(inputs)
  hidden = unpacked[0]
  attention_mask = unpacked[1]

  outputs = []
  for layer in self.layers:
    hidden = layer(hidden, attention_mask, **kwargs)
    outputs.append(hidden)

  return outputs if return_all_layers else outputs[-1]
def call(self, inputs, **kwargs):
  """Span question-answering head: start/end logits plus answerability.

  Unpacks (sequence_output, p_mask, cls_index, start_positions). During
  training (`kwargs['training']` truthy) end logits are conditioned on the
  ground-truth start position and per-position log-probs are returned;
  otherwise a top-k search over start candidates (`self.start_n_top` /
  `self.end_n_top`) is performed and the top indices/log-probs are returned.
  NOTE(review): structure resembles the XLNet SQuAD head — confirm origin.
  """
  unpacked_inputs = tf_utils.unpack_inputs(inputs)
  sequence_output = unpacked_inputs[0]
  p_mask = unpacked_inputs[1]           # 1.0 for positions excluded from answers
  cls_index = unpacked_inputs[2]
  start_positions = unpacked_inputs[3]  # only used on the training path
  _, seq_len, _ = sequence_output.shape.as_list()
  # Work in time-major layout: [seq_len, batch, hidden] ('lbh' in einsums).
  sequence_output = tf.transpose(sequence_output, [1, 0, 2])

  start_logits = self.start_logits_proj_layer(sequence_output)
  start_logits = tf.transpose(tf.squeeze(start_logits, -1), [1, 0])
  # Masked positions get -1e30 so they vanish after the softmax.
  start_logits_masked = start_logits * (1 - p_mask) - 1e30 * p_mask
  start_log_probs = tf.nn.log_softmax(start_logits_masked, -1)

  if kwargs.get("training", False):
    # during training, compute the end logits based on the
    # ground truth of the start position
    start_positions = tf.reshape(start_positions, [-1])
    start_index = tf.one_hot(start_positions, depth=seq_len, axis=-1,
                             dtype=tf.float32)
    # Select the hidden state at the gold start position: [batch, hidden].
    start_features = tf.einsum('lbh,bl->bh', sequence_output, start_index)
    # Tile it across all positions so each position sees the start feature.
    start_features = tf.tile(start_features[None], [seq_len, 1, 1])
    end_logits = self.end_logits_proj_layer0(
        tf.concat([sequence_output, start_features], axis=-1))
    end_logits = self.end_logits_layer_norm(end_logits)
    end_logits = self.end_logits_proj_layer1(end_logits)
    end_logits = tf.transpose(tf.squeeze(end_logits, -1), [1, 0])
    end_logits_masked = end_logits * (1 - p_mask) - 1e30 * p_mask
    end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
  else:
    # Inference: keep the top-k start candidates and score ends for each.
    start_top_log_probs, start_top_index = tf.nn.top_k(
        start_log_probs, k=self.start_n_top)
    start_index = tf.one_hot(
        start_top_index, depth=seq_len, axis=-1, dtype=tf.float32)
    # Features of the k start candidates: [batch, k, hidden].
    start_features = tf.einsum('lbh,bkl->bkh', sequence_output, start_index)
    # Pair every position with every start candidate.
    end_input = tf.tile(sequence_output[:, :, None],
                        [1, 1, self.start_n_top, 1])
    start_features = tf.tile(start_features[None], [seq_len, 1, 1, 1])
    end_input = tf.concat([end_input, start_features], axis=-1)
    end_logits = self.end_logits_proj_layer0(end_input)
    # Flatten batch and candidate dims around the layer norm, then restore.
    end_logits = tf.reshape(end_logits, [seq_len, -1, self.hidden_size])
    end_logits = self.end_logits_layer_norm(end_logits)
    end_logits = tf.reshape(end_logits,
                            [seq_len, -1, self.start_n_top, self.hidden_size])
    end_logits = self.end_logits_proj_layer1(end_logits)
    end_logits = tf.reshape(end_logits, [seq_len, -1, self.start_n_top])
    end_logits = tf.transpose(end_logits, [1, 2, 0])
    end_logits_masked = end_logits * (
        1 - p_mask[:, None]) - 1e30 * p_mask[:, None]
    end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
    end_top_log_probs, end_top_index = tf.nn.top_k(
        end_log_probs, k=self.end_n_top)
    # Flatten the (start candidate, end candidate) grid.
    end_top_log_probs = tf.reshape(end_top_log_probs,
                                   [-1, self.start_n_top * self.end_n_top])
    end_top_index = tf.reshape(end_top_index,
                               [-1, self.start_n_top * self.end_n_top])

  # an additional layer to predict answerability
  # get the representation of CLS
  cls_index = tf.one_hot(cls_index, seq_len, axis=-1, dtype=tf.float32)
  cls_feature = tf.einsum('lbh,bl->bh', sequence_output, cls_index)
  # get the representation of START (softmax-weighted average over positions)
  start_p = tf.nn.softmax(start_logits_masked, axis=-1,
                          name='softmax_start')
  start_feature = tf.einsum('lbh,bl->bh', sequence_output, start_p)
  ans_feature = tf.concat([start_feature, cls_feature], -1)
  ans_feature = self.answer_class_proj_layer0(ans_feature)
  ans_feature = self.ans_feature_dropout(
      ans_feature, training=kwargs.get('training', False))
  cls_logits = self.answer_class_proj_layer1(ans_feature)
  cls_logits = tf.squeeze(cls_logits, -1)

  # Training returns dense log-probs; inference returns top-k summaries.
  if kwargs.get("training", False):
    return (start_log_probs, end_log_probs, cls_logits)
  else:
    return (start_top_log_probs, start_top_index, end_top_log_probs,
            end_top_index, cls_logits)