def __init__(self, config, input_ids, token_type_ids=None):
    input_shape = modeling.get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if token_type_ids is None:
        token_type_ids = tf.zeros(shape=[batch_size, seq_length],
                                  dtype=tf.int32)

    # Keep variable names the same as BERT.
    with tf.variable_scope("bert"):
        with tf.variable_scope("embeddings"):
            (embedding_output, self.embedding_table) = modeling.embedding_lookup(
                input_ids=input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=False)

            self.embedding_output = modeling.embedding_postprocessor(
                input_tensor=embedding_output,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)
def build_bert_model(self, input_ids, input_mask, token_type_ids):
    with tf.variable_scope("bert"):
        with tf.variable_scope("embeddings"):
            # Perform embedding lookup on the word ids.
            (embedding_output, _) = modeling.embedding_lookup(
                input_ids=input_ids,
                vocab_size=self.bert_config.vocab_size,
                embedding_size=self.bert_config.hidden_size,
                initializer_range=self.bert_config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=False)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            embedding_output = modeling.embedding_postprocessor(
                input_tensor=embedding_output,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=self.bert_config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=self.bert_config.initializer_range,
                max_position_embeddings=self.bert_config.max_position_embeddings,
                dropout_prob=self.bert_config.hidden_dropout_prob)

        with tf.variable_scope("encoder"):
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            attention_mask = modeling.create_attention_mask_from_input_mask(
                input_ids, input_mask)

            # Run the stacked transformer, only fetching the final layer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            self.all_encoder_layers = modeling.transformer_model(
                input_tensor=embedding_output,
                attention_mask=attention_mask,
                hidden_size=self.bert_config.hidden_size,
                num_hidden_layers=self.bert_config.num_hidden_layers,
                num_attention_heads=self.bert_config.num_attention_heads,
                intermediate_size=self.bert_config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(
                    self.bert_config.hidden_act),
                hidden_dropout_prob=self.bert_config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    self.bert_config.attention_probs_dropout_prob),
                initializer_range=self.bert_config.initializer_range,
                do_return_all_layers=True)

    self.sequence_output = self.all_encoder_layers[-1]
def run_bert_embeddings(input_ids, config):
    """Extract only the word embeddings of the original BERT model."""
    with tf.variable_scope("bert", reuse=tf.compat.v1.AUTO_REUSE):
        with tf.variable_scope("embeddings"):
            # Perform embedding lookup on the word ids.
            embedding_output, embedding_var = modeling.embedding_lookup(
                input_ids=input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=False)
            return embedding_output, embedding_var
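# Illustrative usage sketch (not part of the original code): it assumes the same
# `tensorflow as tf` / BERT `modeling` imports as the snippets above; the
# placeholder name and sizes below are made up for the example.
example_config = modeling.BertConfig(vocab_size=30522, hidden_size=768)
example_input_ids = tf.placeholder(tf.int32, shape=(None, 128), name="input_ids")
example_embeddings, example_table = run_bert_embeddings(example_input_ids,
                                                        example_config)
# `example_embeddings` has shape [batch_size, 128, 768]; `example_table` has
# shape [vocab_size, hidden_size] and can be fetched again elsewhere because
# the "bert" scope is opened with AUTO_REUSE.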
def __init__(self,
             config,
             is_training,
             input_ids,
             image_embeddings,
             input_mask=None,
             token_type_ids=None,
             use_one_hot_embeddings=False,
             scope=None):
    """Constructor for a visually grounded BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      image_embeddings: float32 Tensor of shape [batch_size, seq_length, depth].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings.
      scope: (optional) variable scope. Defaults to "bert".

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    config = copy.deepcopy(config)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    text_input_shape = modeling.get_shape_list(input_ids, expected_rank=2)
    batch_size = text_input_shape[0]
    text_seq_length = text_input_shape[1]

    if input_mask is None:
        input_mask = tf.ones(shape=[batch_size, text_seq_length], dtype=tf.int32)

    if token_type_ids is None:
        token_type_ids = tf.zeros(shape=[batch_size, text_seq_length],
                                  dtype=tf.int32)

    with tf.variable_scope(scope, default_name="bert"):
        with tf.variable_scope("embeddings"):
            # Perform embedding lookup on the word ids.
            (self.embedding_output,
             self.embedding_table) = modeling.embedding_lookup(
                 input_ids=input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.embedding_output = modeling.embedding_postprocessor(
                input_tensor=self.embedding_output,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)

            # Add image embeddings to the rest of the input embeddings.
            self.embedding_output += tf.layers.dense(
                image_embeddings,
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=modeling.create_initializer(
                    config.initializer_range))

        with tf.variable_scope("encoder"):
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            attention_mask = modeling.create_attention_mask_from_input_mask(
                self.embedding_output, input_mask)

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            self.all_encoder_layers = modeling.transformer_model(
                input_tensor=self.embedding_output,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

        self.sequence_output = self.all_encoder_layers[-1]

        # The "pooler" converts the encoded sequence tensor of shape
        # [batch_size, seq_length, hidden_size] to a tensor of shape
        # [batch_size, hidden_size]. This is necessary for segment-level
        # (or segment-pair-level) classification tasks where we need a fixed
        # dimensional representation of the segment.
        with tf.variable_scope("pooler"):
            # We "pool" the model by simply taking the hidden state corresponding
            # to the first token. We assume that this has been pre-trained.
            first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :],
                                            axis=1)
            self.pooled_output = tf.layers.dense(
                first_token_tensor,
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=modeling.create_initializer(
                    config.initializer_range))
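# Illustrative usage sketch (not part of the original code). The enclosing class
# name is not shown in the snippet above, so `VisuallyGroundedBertModel` below is
# a hypothetical name; shapes and sizes are also made up.
vg_input_ids = tf.placeholder(tf.int32, shape=(None, 32))
vg_image_embeddings = tf.placeholder(tf.float32, shape=(None, 32, 2048))
vg_model = VisuallyGroundedBertModel(
    config=modeling.BertConfig(vocab_size=30522),
    is_training=True,
    input_ids=vg_input_ids,
    image_embeddings=vg_image_embeddings)
# Per the constructor above, the image embeddings are projected to hidden_size
# with a tanh dense layer and added to the word/position/type embeddings.
vg_sequence_output = vg_model.sequence_output  # [batch_size, 32, hidden_size]
vg_pooled_output = vg_model.pooled_output      # [batch_size, hidden_size]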
def __init__(self,
             config,
             use_one_hot_embeddings=True,
             num_labels=2,
             max_seq_length=128):
    """Constructor for BertModel.

    Args:
      config: `BertConfig` instance.
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings. On the TPU,
        it is much faster if this is True; on the CPU or GPU, it is faster if
        this is False.
      num_labels: (optional) int. Number of classification labels.
      max_seq_length: (optional) int. Sequence length of the input placeholders.

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    self.input_ids = tf.placeholder(dtype=tf.int32,
                                    shape=(None, max_seq_length))
    self.input_mask = tf.placeholder(dtype=tf.int8,
                                     shape=(None, max_seq_length))

    config = copy.deepcopy(config)

    input_shape = modeling.get_shape_list(self.input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    with tf.variable_scope("bert", reuse=tf.AUTO_REUSE):
        with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
            # Perform embedding lookup on the word ids.
            (self.embedding_output,
             self.embedding_table) = modeling.embedding_lookup(
                 input_ids=self.input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.embedding_output = modeling.embedding_postprocessor(
                input_tensor=self.embedding_output,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)

        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            attention_mask = modeling.create_attention_mask_from_input_mask(
                self.input_ids, self.input_mask)

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            self.all_encoder_layers = modeling.transformer_model(
                input_tensor=self.embedding_output,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

        self.sequence_output = self.all_encoder_layers[-1]

        # The "pooler" converts the encoded sequence tensor of shape
        # [batch_size, seq_length, hidden_size] to a tensor of shape
        # [batch_size, hidden_size]. This is necessary for segment-level
        # (or segment-pair-level) classification tasks where we need a fixed
        # dimensional representation of the segment.
        with tf.variable_scope("pooler", reuse=tf.AUTO_REUSE):
            # We "pool" the model by simply taking the hidden state corresponding
            # to the first token. We assume that this has been pre-trained.
            first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :],
                                            axis=1)
            self.pooled_output = tf.layers.dense(
                first_token_tensor,
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=modeling.create_initializer(
                    config.initializer_range))

    # Define output_weights and output_bias for the classification head.
    hidden_size = self.pooled_output.shape[-1].value
    with tf.variable_scope("", reuse=tf.AUTO_REUSE):
        self.output_weights = tf.get_variable(
            "output_weights", [num_labels, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        self.output_bias = tf.get_variable(
            "output_bias", [num_labels],
            initializer=tf.zeros_initializer())
def create_bilstm_classification_model(bert_config,
                                       is_training,
                                       response_input_ids,
                                       response_input_mask,
                                       response_segment_ids,
                                       response_text_len,
                                       response_labels,
                                       random_forward_input_ids,
                                       random_forward_input_mask,
                                       random_forward_segment_ids,
                                       random_forward_text_len,
                                       random_backward_input_ids,
                                       random_backward_input_mask,
                                       random_backward_segment_ids,
                                       random_backward_text_len,
                                       random_labels,
                                       swap_forward_input_ids,
                                       swap_forward_input_mask,
                                       swap_forward_segment_ids,
                                       swap_forward_text_len,
                                       swap_backward_input_ids,
                                       swap_backward_input_mask,
                                       swap_backward_segment_ids,
                                       swap_backward_text_len,
                                       swap_labels,
                                       nli_forward_input_ids,
                                       nli_forward_input_mask,
                                       nli_forward_segment_ids,
                                       nli_forward_text_len,
                                       nli_backward_input_ids,
                                       nli_backward_input_mask,
                                       nli_backward_segment_ids,
                                       nli_backward_text_len,
                                       nli_labels,
                                       num_nli_labels,
                                       use_one_hot_embeddings,
                                       l2_reg_lambda=0.1,
                                       dropout_rate=1.0,
                                       lstm_size=None,
                                       num_layers=1):
    config = copy.deepcopy(bert_config)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    with tf.variable_scope("bert", reuse=tf.AUTO_REUSE):
        with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
            (response_embedding_output,
             response_embedding_table) = modeling.embedding_lookup(
                 input_ids=response_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            response_embedding_output = modeling.embedding_postprocessor(
                input_tensor=response_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=response_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # Random detection.
            # Perform embedding lookup on the word ids.
            (random_foward_embedding_output,
             random_forward_embedding_table) = modeling.embedding_lookup(
                 input_ids=random_forward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            # Perform embedding lookup on the word ids.
            (random_backward_embedding_output,
             random_backward_embedding_table) = modeling.embedding_lookup(
                 input_ids=random_backward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            random_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=random_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=random_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            random_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=random_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=random_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # Swap detection.
            (swap_foward_embedding_output,
             swap_forward_embedding_table) = modeling.embedding_lookup(
                 input_ids=swap_forward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            (swap_backward_embedding_output,
             swap_backward_embedding_table) = modeling.embedding_lookup(
                 input_ids=swap_backward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            swap_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=swap_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=swap_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            swap_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=swap_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=swap_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # Generic detection (disabled).
            # (generic_foward_embedding_output,
            #  generic_forward_embedding_table) = modeling.embedding_lookup(
            #      input_ids=generic_forward_input_ids,
            #      vocab_size=config.vocab_size,
            #      embedding_size=config.hidden_size,
            #      initializer_range=config.initializer_range,
            #      word_embedding_name="word_embeddings",
            #      use_one_hot_embeddings=use_one_hot_embeddings)
            # (generic_backward_embedding_output,
            #  generic_backward_embedding_table) = modeling.embedding_lookup(
            #      input_ids=generic_backward_input_ids,
            #      vocab_size=config.vocab_size,
            #      embedding_size=config.hidden_size,
            #      initializer_range=config.initializer_range,
            #      word_embedding_name="word_embeddings",
            #      use_one_hot_embeddings=use_one_hot_embeddings)
            # generic_foward_embedding_output = modeling.embedding_postprocessor(
            #     input_tensor=generic_foward_embedding_output,
            #     use_token_type=not config.roberta,
            #     token_type_ids=generic_forward_segment_ids,
            #     token_type_vocab_size=config.type_vocab_size,
            #     token_type_embedding_name="token_type_embeddings",
            #     use_position_embeddings=True,
            #     position_embedding_name="position_embeddings",
            #     initializer_range=config.initializer_range,
            #     max_position_embeddings=config.max_position_embeddings,
            #     dropout_prob=config.hidden_dropout_prob,
            #     roberta=config.roberta)
            # generic_backward_embedding_output = modeling.embedding_postprocessor(
            #     input_tensor=generic_backward_embedding_output,
            #     use_token_type=not config.roberta,
            #     token_type_ids=generic_backward_segment_ids,
            #     token_type_vocab_size=config.type_vocab_size,
            #     token_type_embedding_name="token_type_embeddings",
            #     use_position_embeddings=True,
            #     position_embedding_name="position_embeddings",
            #     initializer_range=config.initializer_range,
            #     max_position_embeddings=config.max_position_embeddings,
            #     dropout_prob=config.hidden_dropout_prob,
            #     roberta=config.roberta)

            # NLI detection.
            (nli_foward_embedding_output,
             nli_forward_embedding_table) = modeling.embedding_lookup(
                 input_ids=nli_forward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            (nli_backward_embedding_output,
             nli_backward_embedding_table) = modeling.embedding_lookup(
                 input_ids=nli_backward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            nli_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=nli_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=nli_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            nli_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=nli_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=nli_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            # [batch_size, from_seq_length, to_seq_length]
            response_attention_mask = modeling.create_attention_mask_from_input_mask(
                response_input_ids, response_input_mask)

            # Mask future tokens (a standalone sketch of this masking appears
            # after this function).
            diag_vals = tf.ones_like(response_attention_mask[0, :, :])
            tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()
            future_masks = tf.tile(tf.expand_dims(tril, 0),
                                   [tf.shape(response_attention_mask)[0], 1, 1])
            response_attention_mask = tf.math.multiply(response_attention_mask,
                                                       future_masks)

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            response_all_encoder_layers = modeling.transformer_model(
                input_tensor=response_embedding_output,
                attention_mask=response_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # Random detection.
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            random_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                random_forward_input_ids, random_forward_input_mask)
            random_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                random_backward_input_ids, random_backward_input_mask)

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            random_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=random_foward_embedding_output,
                attention_mask=random_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            random_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=random_backward_embedding_output,
                attention_mask=random_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # Swap detection.
            swap_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                swap_forward_input_ids, swap_forward_input_mask)
            swap_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                swap_backward_input_ids, swap_backward_input_mask)

            swap_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=swap_foward_embedding_output,
                attention_mask=swap_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            swap_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=swap_backward_embedding_output,
                attention_mask=swap_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # Generic detection (disabled).
            # generic_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
            #     generic_forward_input_ids, generic_forward_input_mask)
            # generic_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
            #     generic_backward_input_ids, generic_backward_input_mask)
            # generic_forward_all_encoder_layers = modeling.transformer_model(
            #     input_tensor=generic_foward_embedding_output,
            #     attention_mask=generic_forward_attention_mask,
            #     hidden_size=config.hidden_size,
            #     num_hidden_layers=config.num_hidden_layers,
            #     num_attention_heads=config.num_attention_heads,
            #     intermediate_size=config.intermediate_size,
            #     intermediate_act_fn=modeling.get_activation(config.hidden_act),
            #     hidden_dropout_prob=config.hidden_dropout_prob,
            #     attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            #     initializer_range=config.initializer_range,
            #     do_return_all_layers=True)
            # generic_backward_all_encoder_layers = modeling.transformer_model(
            #     input_tensor=generic_backward_embedding_output,
            #     attention_mask=generic_backward_attention_mask,
            #     hidden_size=config.hidden_size,
            #     num_hidden_layers=config.num_hidden_layers,
            #     num_attention_heads=config.num_attention_heads,
            #     intermediate_size=config.intermediate_size,
            #     intermediate_act_fn=modeling.get_activation(config.hidden_act),
            #     hidden_dropout_prob=config.hidden_dropout_prob,
            #     attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            #     initializer_range=config.initializer_range,
            #     do_return_all_layers=True)

            # NLI detection.
            nli_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                nli_forward_input_ids, nli_forward_input_mask)
            nli_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                nli_backward_input_ids, nli_backward_input_mask)

            nli_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=nli_foward_embedding_output,
                attention_mask=nli_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            nli_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=nli_backward_embedding_output,
                attention_mask=nli_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

    random_forward_embedding = random_forward_all_encoder_layers[-2]
    random_backward_embedding = random_backward_all_encoder_layers[-2]
    swap_forward_embedding = swap_forward_all_encoder_layers[-2]
    swap_backward_embedding = swap_backward_all_encoder_layers[-2]
    # generic_forward_embedding = generic_forward_all_encoder_layers[-2]
    # generic_backward_embedding = generic_backward_all_encoder_layers[-2]
    nli_forward_embedding = nli_forward_all_encoder_layers[-2]
    nli_backward_embedding = nli_backward_all_encoder_layers[-2]
    response_embedding = response_all_encoder_layers[-2]

    response_embedding_shape = modeling.get_shape_list(response_embedding,
                                                       expected_rank=3)
    with tf.variable_scope("lm_head", reuse=tf.AUTO_REUSE):
        response_logits = tf.layers.dense(response_embedding,
                                          config.hidden_size,
                                          activation=None)
        response_logits = modeling.gelu(response_logits)
        response_logits = modeling.layer_norm(response_logits)
        response_outputs = tf.layers.dense(
            response_logits,
            config.vocab_size,
            activation=None,
            use_bias=True,
            bias_initializer=tf.zeros_initializer())

        response_one_hot = tf.one_hot(response_labels,
                                      depth=config.vocab_size,
                                      dtype=tf.float32)
        lm_cost = tf.nn.softmax_cross_entropy_with_logits(
            labels=response_one_hot, logits=response_outputs)
        sequence_mask = tf.sequence_mask(response_text_len,
                                         maxlen=response_embedding_shape[1],
                                         dtype=tf.float32)
        masked_lm_cost = tf.math.multiply(lm_cost, sequence_mask)
        final_lm_loss = tf.reduce_mean(
            tf.math.divide(tf.reduce_sum(masked_lm_cost, axis=1),
                           tf.cast(response_text_len, dtype=tf.float32)))
        perplexity = tf.exp(
            tf.math.divide(tf.reduce_sum(masked_lm_cost, axis=1),
                           tf.cast(response_text_len, dtype=tf.float32)))

    random_forward_embedding_shape = modeling.get_shape_list(
        random_forward_embedding, expected_rank=3)
    random_backward_embedding_shape = modeling.get_shape_list(
        random_backward_embedding, expected_rank=3)
    assert random_forward_embedding_shape[2] == random_backward_embedding_shape[2]
    random_forward_embedding = tf.transpose(random_forward_embedding, [1, 0, 2])
    random_backward_embedding = tf.transpose(random_backward_embedding, [1, 0, 2])
    random_forward_input_mask = tf.cast(
        tf.transpose(random_forward_input_mask, [1, 0]), tf.float32)
    random_backward_input_mask = tf.cast(
        tf.transpose(random_backward_input_mask, [1, 0]), tf.float32)

    swap_forward_embedding_shape = modeling.get_shape_list(
        swap_forward_embedding, expected_rank=3)
    swap_backward_embedding_shape = modeling.get_shape_list(
        swap_backward_embedding, expected_rank=3)
    assert swap_forward_embedding_shape[2] == swap_backward_embedding_shape[2]
    swap_forward_embedding = tf.transpose(swap_forward_embedding, [1, 0, 2])
    swap_backward_embedding = tf.transpose(swap_backward_embedding, [1, 0, 2])
    swap_forward_input_mask = tf.cast(
        tf.transpose(swap_forward_input_mask, [1, 0]), tf.float32)
    swap_backward_input_mask = tf.cast(
        tf.transpose(swap_backward_input_mask, [1, 0]), tf.float32)

    # generic_forward_embedding_shape = modeling.get_shape_list(
    #     generic_forward_embedding, expected_rank=3)
    # generic_backward_embedding_shape = modeling.get_shape_list(
    #     generic_backward_embedding, expected_rank=3)
    # assert generic_forward_embedding_shape[2] == generic_backward_embedding_shape[2]
    # generic_forward_embedding = tf.transpose(generic_forward_embedding, [1, 0, 2])
    # generic_backward_embedding = tf.transpose(generic_backward_embedding, [1, 0, 2])
    # generic_forward_input_mask = tf.cast(
    #     tf.transpose(generic_forward_input_mask, [1, 0]), tf.float32)
    # generic_backward_input_mask = tf.cast(
    #     tf.transpose(generic_backward_input_mask, [1, 0]), tf.float32)

    nli_forward_embedding_shape = modeling.get_shape_list(
        nli_forward_embedding, expected_rank=3)
    nli_backward_embedding_shape = modeling.get_shape_list(
        nli_backward_embedding, expected_rank=3)
    assert nli_forward_embedding_shape[2] == nli_backward_embedding_shape[2]
    nli_forward_embedding = tf.transpose(nli_forward_embedding, [1, 0, 2])
    nli_backward_embedding = tf.transpose(nli_backward_embedding, [1, 0, 2])
    nli_forward_input_mask = tf.cast(
        tf.transpose(nli_forward_input_mask, [1, 0]), tf.float32)
    nli_backward_input_mask = tf.cast(
        tf.transpose(nli_backward_input_mask, [1, 0]), tf.float32)

    model = HadeModel(
        x_random_forward=random_forward_embedding,
        x_random_mask_forward=random_forward_input_mask,
        x_random_length_forward=random_forward_text_len,
        x_random_backward=random_backward_embedding,
        x_random_mask_backward=random_backward_input_mask,
        x_random_length_backward=random_backward_text_len,
        y_random=random_labels,
        x_swap_forward=swap_forward_embedding,
        x_swap_mask_forward=swap_forward_input_mask,
        x_swap_length_forward=swap_forward_text_len,
        x_swap_backward=swap_backward_embedding,
        x_swap_mask_backward=swap_backward_input_mask,
        x_swap_length_backward=swap_backward_text_len,
        y_swap=swap_labels,
        # x_generic_forward=generic_forward_embedding,
        # x_generic_mask_forward=generic_forward_input_mask,
        # x_generic_length_forward=generic_forward_text_len,
        # x_generic_backward=generic_backward_embedding,
        # x_generic_mask_backward=generic_backward_input_mask,
        # x_generic_length_backward=generic_backward_text_len,
        # y_generic=generic_labels,  # disabled along with the other generic inputs
        x_nli_forward=nli_forward_embedding,
        x_nli_mask_forward=nli_forward_input_mask,
        x_nli_length_forward=nli_forward_text_len,
        x_nli_backward=nli_backward_embedding,
        x_nli_mask_backward=nli_backward_input_mask,
        x_nli_length_backward=nli_backward_text_len,
        y_nli=nli_labels,
        embedding_dim=random_forward_embedding_shape[2],
        num_nli_labels=num_nli_labels,
        hidden_size=lstm_size,
        l2_reg_lambda=l2_reg_lambda,
        num_layers=num_layers,
        dropout_rate=dropout_rate,
        is_training=is_training)

    random_prob, swap_prob, nli_prob, total_cost = model.create_model()
    return (random_prob, swap_prob, nli_prob, total_cost, final_lm_loss,
            perplexity)
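# Standalone sketch of the future-token (causal) masking used in the encoder
# scope of create_bilstm_classification_model above (not part of the original
# code; the toy shapes below are made up). `toy_mask` plays the role of the
# [batch_size, from_seq_length, to_seq_length] mask returned by
# modeling.create_attention_mask_from_input_mask.
toy_mask = tf.ones([4, 6, 6], dtype=tf.float32)  # toy batch of 4, length 6
toy_diag = tf.ones_like(toy_mask[0, :, :])
toy_tril = tf.linalg.LinearOperatorLowerTriangular(toy_diag).to_dense()
toy_future_masks = tf.tile(tf.expand_dims(toy_tril, 0),
                           [tf.shape(toy_mask)[0], 1, 1])
# After the multiply, position i can only attend to positions <= i, which is
# what lets the response branch be trained with a language-modeling loss.
toy_causal_mask = tf.math.multiply(toy_mask, toy_future_masks)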
def create_model(
    bert_config,
    is_training,
    input_ids,
    input_mask,
    segment_ids,
    labels,
    num_labels,
    use_one_hot_embeddings,
    field_input_ids,
):
    """Creates a classification model."""
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings,
    )

    # In the demo, we are doing a simple classification task on the entire
    # segment.
    #
    # If you want to use the token-level output, use model.get_sequence_output()
    # instead.
    output_layer = model.get_pooled_output()  # [CLS] output.
    hidden_size = output_layer.shape[-1].value

    field_input_ids_embedding = modeling.embedding_lookup(
        input_ids=field_input_ids,
        vocab_size=bert_config.vocab_size,
        embedding_size=bert_config.hidden_size,
        initializer_range=bert_config.initializer_range,
        word_embedding_name="word_embeddings",
        use_one_hot_embeddings=use_one_hot_embeddings,
    )[0]

    # Three different types of non-linear layer:
    # output_layer = methods.NN(output_layer, field_input_ids_embedding)
    # output_layer = methods.CNN(output_layer, field_input_ids_embedding)
    output_layer = methods.Bi_GRU(output_layer, field_input_ids_embedding)

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02),
    )
    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer()
    )

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout.
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, probabilities)
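# Illustrative usage sketch (not part of the original code): wiring create_model
# to TF1 placeholders. It assumes a BERT config JSON on disk and the custom
# `methods` module being importable; all names, paths and sizes are made up.
demo_bert_config = modeling.BertConfig.from_json_file("bert_config.json")
demo_input_ids = tf.placeholder(tf.int32, [None, 128], name="input_ids")
demo_input_mask = tf.placeholder(tf.int32, [None, 128], name="input_mask")
demo_segment_ids = tf.placeholder(tf.int32, [None, 128], name="segment_ids")
demo_field_ids = tf.placeholder(tf.int32, [None, 128], name="field_input_ids")
demo_labels = tf.placeholder(tf.int32, [None], name="labels")
(demo_loss, demo_per_example_loss,
 demo_logits, demo_probabilities) = create_model(
     demo_bert_config, True, demo_input_ids, demo_input_mask, demo_segment_ids,
     demo_labels, num_labels=2, use_one_hot_embeddings=False,
     field_input_ids=demo_field_ids)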