def build_key(self):
    with tf.compat.v1.variable_scope("embeddings"):
        input_tensor = self.get_embeddings(self.input_ids, self.segment_ids)
        self.input_shape = bc.get_shape_list(input_tensor, expected_rank=3)

    with tf.compat.v1.variable_scope("encoder"):
        self.attention_mask = bc.create_attention_mask_from_input_mask(
            input_tensor, self.input_mask)
        prev_output = bc.reshape_to_matrix(input_tensor)
        # Run only the layers below the key-pooling point.
        for layer_idx in range(self.layers_before_key_pooling):
            with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
                intermediate_output, prev_output = self.forward_layer(prev_output)
                intermediate_output = tf.reshape(intermediate_output, [
                    self.batch_size * self.seq_length,
                    self.config.intermediate_size
                ])
                final_output = bc.reshape_from_matrix(prev_output,
                                                      self.input_shape)
                self.all_layer_outputs.append(final_output)

    self.last_intermediate_output = intermediate_output
    self.last_key_layer = prev_output

    with tf.compat.v1.variable_scope("mr_key"):
        # Project each token's intermediate activation into key space, then
        # pool over the sequence to get one key per example.
        key_vectors = bc.dense(self.key_dimension,
                               self.initializer)(intermediate_output)
        self.debug1 = key_vectors
        key_vectors = tf.reshape(
            key_vectors,
            [self.batch_size, self.seq_length, self.key_dimension])
        key_output = self.key_pooling(key_vectors)
    return key_output
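# self.key_pooling is referenced above but not defined in this snippet. A
# minimal sketch of one plausible pooling, assuming masked max-pooling over the
# sequence axis (hypothetical helper, not the confirmed implementation; `tf`
# is TensorFlow as imported by this module):
def key_pooling_sketch(key_vectors, input_mask):
    # key_vectors: [batch_size, seq_length, key_dimension]
    # input_mask:  [batch_size, seq_length], 1 for real tokens, 0 for padding.
    mask = tf.cast(tf.expand_dims(input_mask, axis=-1), tf.float32)
    # Push padded positions to a large negative value so they never win the max.
    masked = key_vectors * mask + (1.0 - mask) * -1e9
    return tf.reduce_max(masked, axis=1)  # [batch_size, key_dimension]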
def build_by_attention(self, key):
    hidden_size = self.config.hidden_size
    with tf.compat.v1.variable_scope("embeddings"):
        lexical_tensor = self.get_lexical_lookup()
        self.embedding_output = self.embedding_postprocessor(
            d_input_ids=self.input_ids,
            input_tensor=lexical_tensor,
            use_token_type=True,
            token_type_ids=self.segment_ids,
            token_type_vocab_size=self.config.type_vocab_size,
            token_type_embedding_name="token_type_embeddings",
            use_position_embeddings=True,
            position_embedding_name="position_embeddings",
            initializer_range=self.config.initializer_range,
            max_position_embeddings=self.config.max_position_embeddings,
            dropout_prob=self.config.hidden_dropout_prob)
        input_tensor = self.embedding_output  # [def_per_batch, seq_length, hidden_size]

    with tf.compat.v1.variable_scope("encoder"):
        # Project the key vector into `num_key_tokens` pseudo-token embeddings
        # and prepend them to the input sequence.
        num_key_tokens = self.ssdr_config.num_key_tokens
        project_dim = hidden_size * num_key_tokens
        raw_key = bc.dense(project_dim, self.initializer)(key)
        key_tokens = tf.reshape(raw_key,
                                [self.batch_size, num_key_tokens, hidden_size])
        input_tensor = tf.concat([key_tokens, input_tensor], axis=1)
        input_shape = bc.get_shape_list(input_tensor, expected_rank=3)

        # Extend the padding mask so the key tokens are always attended to.
        mask_for_key = tf.ones([self.batch_size, num_key_tokens],
                               dtype=tf.int64)
        self.input_mask = tf.cast(self.input_mask, tf.int64)
        self.input_mask = tf.concat([mask_for_key, self.input_mask], axis=1)
        self.seq_length = self.seq_length + num_key_tokens
        self.attention_mask = bc.create_attention_mask_from_input_mask(
            input_tensor, self.input_mask)

        prev_output = bc.reshape_to_matrix(input_tensor)
        for layer_idx in range(self.ssdr_config.num_hidden_layers):
            with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
                intermediate_output, prev_output = self.forward_layer(prev_output)
                self.all_layer_outputs.append(prev_output)

        final_output = bc.reshape_from_matrix(prev_output, input_shape)
        self.scores = bc.dense(1, self.initializer)(final_output[:, 0, :])

        if self.ssdr_config.info_pooling_method == "first_tokens":
            self.info_output = final_output[:, :num_key_tokens, :]
        elif self.ssdr_config.info_pooling_method == "max_pooling":
            self.info_output = tf.reduce_max(final_output, axis=1)
    return self.scores, self.info_output
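# A self-contained toy run of the key-token injection performed above, with
# made-up sizes (batch=2, seq=4, hidden=8, num_key_tokens=2, key_dim=16);
# tf.keras.layers.Dense stands in for bc.dense here:
def inject_key_tokens_demo():
    batch, seq, hidden, num_key_tokens, key_dim = 2, 4, 8, 2, 16
    key = tf.random.normal([batch, key_dim])              # pooled key vector
    project = tf.keras.layers.Dense(hidden * num_key_tokens)
    raw_key = project(key)                                # [2, 16]
    key_tokens = tf.reshape(raw_key, [batch, num_key_tokens, hidden])
    token_embeddings = tf.random.normal([batch, seq, hidden])
    combined = tf.concat([key_tokens, token_embeddings], axis=1)
    return combined                                       # [2, 6, 8]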
def __init__(self, config, input_ids, input_mask, segment_ids,
             use_one_hot_embeddings):
    self.config = config
    self.use_one_hot_embeddings = use_one_hot_embeddings
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.segment_ids = segment_ids
    self.batch_size, self.seq_length = get_batch_and_seq_length(input_ids, 2)
    self.initializer = base.create_initializer(config.initializer_range)
    self.attention_mask = bc.create_attention_mask_from_input_mask(
        input_ids, self.input_mask)
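# get_batch_and_seq_length is not defined in this snippet. A plausible sketch,
# assuming it simply unpacks the shape of a rank-2 [batch_size, seq_length]
# tensor via the same shape helper used elsewhere in this file (hypothetical,
# for illustration only):
def get_batch_and_seq_length_sketch(input_ids, expected_rank):
    shape = bc.get_shape_list(input_ids, expected_rank=expected_rank)
    return shape[0], shape[1]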
def build(self, value_out, locations):
    with tf.compat.v1.variable_scope("embeddings"):
        input_tensor = self.get_embeddings(self.input_ids, self.segment_ids)
        self.input_shape = bc.get_shape_list(input_tensor, expected_rank=3)

    with tf.compat.v1.variable_scope("encoder"):
        self.attention_mask = bc.create_attention_mask_from_input_mask(
            input_tensor, self.input_mask)
        prev_output = bc.reshape_to_matrix(input_tensor)
        # Overwrite the embeddings at `locations` with externally computed
        # value vectors before running the transformer stack.
        prev_output = tf.tensor_scatter_nd_update(prev_output, locations,
                                                  value_out)
        for layer_idx in range(self.config.num_hidden_layers):
            with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
                intermediate_output, prev_output = self.forward_layer(prev_output)
                final_output = bc.reshape_from_matrix(prev_output,
                                                      self.input_shape)
                self.all_layer_outputs.append(final_output)
    return self.all_layer_outputs
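# The scatter above indexes rows of the flattened
# [batch_size * seq_length, hidden] matrix, so `locations` must contain flat
# row indices (batch_idx * seq_length + seq_idx). A self-contained toy example
# of the same tf.tensor_scatter_nd_update call:
def scatter_value_demo():
    seq_length, hidden = 4, 3
    prev_output = tf.zeros([2 * seq_length, hidden])  # two flattened sequences
    # Overwrite token 1 of sequence 0 and token 2 of sequence 1.
    locations = tf.constant([[0 * seq_length + 1],
                             [1 * seq_length + 2]])
    value_out = tf.ones([2, hidden])
    return tf.tensor_scatter_nd_update(prev_output, locations, value_out)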
def build(self):
    with tf.compat.v1.variable_scope("dict"):
        with tf.compat.v1.variable_scope("embeddings"):
            input_tensor = self.get_embeddings(self.input_ids, self.segment_ids)

        with tf.compat.v1.variable_scope("encoder"):
            num_key_tokens = self.ssdr_config.num_key_tokens
            input_shape = bc.get_shape_list(input_tensor, expected_rank=3)

            # Extend the padding mask to cover the key tokens. Note the input
            # tensor itself is not extended here, so the resulting attention
            # mask is rectangular:
            # [batch, seq_length, seq_length + num_key_tokens].
            mask_for_key = tf.ones([self.batch_size, num_key_tokens],
                                   dtype=tf.int64)
            self.input_mask = tf.cast(self.input_mask, tf.int64)
            self.input_mask = tf.concat([mask_for_key, self.input_mask],
                                        axis=1)
            self.seq_length = self.seq_length + num_key_tokens
            self.attention_mask = bc.create_attention_mask_from_input_mask(
                input_tensor, self.input_mask)

            prev_output = bc.reshape_to_matrix(input_tensor)
            for layer_idx in range(self.ssdr_config.num_hidden_layers):
                with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
                    intermediate_output, prev_output = self.forward_layer(prev_output)
                    self.all_layer_outputs.append(prev_output)

            final_output = bc.reshape_from_matrix(prev_output, input_shape)
            self.scores = bc.dense(1, self.initializer)(final_output[:, 0, :])

            if self.ssdr_config.info_pooling_method == "first_tokens":
                self.info_output = final_output[:, :num_key_tokens, :]
            elif self.ssdr_config.info_pooling_method == "max_pooling":
                self.info_output = tf.reduce_max(final_output, axis=1)
    return self.scores, self.info_output
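# For reference, bc.create_attention_mask_from_input_mask follows the standard
# BERT helper: it broadcasts a [batch, to_seq] padding mask into a
# [batch, from_seq, to_seq] attention mask, which is why the rectangular mask
# above (to_seq = seq_length + num_key_tokens) is representable. A
# re-derivation matching the original BERT modeling code:
def attention_mask_sketch(from_tensor, to_mask):
    from_shape = bc.get_shape_list(from_tensor, expected_rank=[2, 3])
    batch_size, from_seq_length = from_shape[0], from_shape[1]
    to_seq_length = bc.get_shape_list(to_mask, expected_rank=2)[1]
    to_mask = tf.cast(tf.reshape(to_mask, [batch_size, 1, to_seq_length]),
                      tf.float32)
    # Every from-position may attend to every non-padding to-position.
    broadcast_ones = tf.ones([batch_size, from_seq_length, 1], tf.float32)
    return broadcast_ones * to_mask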
def __init__(self,
             config,
             is_training,
             input_ids,
             input_mask=None,
             token_type_ids=None,
             use_one_hot_embeddings=True,
             scope=None):
    """Constructor for BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. True for training model, false for eval model.
        Controls whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings. On the
        TPU, it is much faster if this is True; on the CPU or GPU, it is
        faster if this is False.
      scope: (optional) variable scope. Defaults to "bert".

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    config = copy.deepcopy(config)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    input_shape = get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:
        input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

    if token_type_ids is None:
        token_type_ids = tf.zeros(shape=[batch_size, seq_length],
                                  dtype=tf.int32)

    with tf.compat.v1.variable_scope(scope, default_name="bert"):
        with tf.compat.v1.variable_scope("embeddings"):
            # Perform embedding lookup on the word ids.
            (self.embedding_output, self.embedding_table) = embedding_lookup(
                input_ids=input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=use_one_hot_embeddings)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.embedding_output = embedding_postprocessor(
                input_tensor=self.embedding_output,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)

        with tf.compat.v1.variable_scope("encoder"):
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            attention_mask = create_attention_mask_from_input_mask(
                input_ids, input_mask)

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            self.all_encoder_layers, key = transformer_model(
                input_tensor=self.embedding_output,
                attention_mask=attention_mask,
                input_mask=input_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                is_training=is_training,
                # mr_layer=config.mr_layer,
                mr_num_route=config.mr_num_route,
                # mr_key_layer=config.mr_key_layer,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

        self.key = key
        self.sequence_output = self.all_encoder_layers[-1]

        # The "pooler" converts the encoded sequence tensor of shape
        # [batch_size, seq_length, hidden_size] to a tensor of shape
        # [batch_size, hidden_size]. This is necessary for segment-level
        # (or segment-pair-level) classification tasks where we need a fixed
        # dimensional representation of the segment.
        with tf.compat.v1.variable_scope("pooler"):
            # We "pool" the model by simply taking the hidden state
            # corresponding to the first token. We assume that this has been
            # pre-trained.
            first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :],
                                            axis=1)
            self.pooled_output = tf.keras.layers.Dense(
                config.hidden_size,
                activation=tf.keras.activations.tanh,
                kernel_initializer=create_initializer(
                    config.initializer_range))(first_token_tensor)
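# Example usage, adapted from the original BERT modeling.py docstring. The
# config values are placeholders, `BertConfig` is the standard config class
# from that file, and this variant's config is assumed to also carry the
# mr_num_route field consumed above:
def bert_model_usage_example():
    input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
    input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
    token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])
    config = BertConfig(vocab_size=32000, hidden_size=512,
                        num_hidden_layers=8, num_attention_heads=8,
                        intermediate_size=2048)
    model = BertModel(config=config, is_training=True, input_ids=input_ids,
                      input_mask=input_mask, token_type_ids=token_type_ids)
    # pooled_output: [batch_size, hidden_size];
    # sequence_output: [batch_size, seq_length, hidden_size];
    # key: the extra output of this modified transformer_model.
    return model.pooled_output, model.sequence_output, model.key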