def build_net(reshape_list_vec, need_pos=True):
    # Note: `_rank_size` and `hidden_size` are expected to be defined in the
    # enclosing scope. The parameter is used below as `reshape_list_vec`.
    pos_embedding = tf.get_variable(
        shape=[_rank_size, hidden_size],
        dtype=tf.float32,
        initializer=tf.initializers.truncated_normal(mean=0.0, stddev=0.01),
        trainable=True,
        name="pos_embedding")
    hidden_reshape_list_vec = tf.layers.dense(reshape_list_vec, hidden_size)
    hidden_list_vec = tf.reshape(hidden_reshape_list_vec, [-1, 20, hidden_size])
    if need_pos:
        hidden_list_vec = hidden_list_vec + pos_embedding

    # Run the stacked transformer.
    # `sequence_output` shape = [batch_size, seq_length, hidden_size].
    outputs = modeling.transformer_model(
        input_tensor=hidden_list_vec,
        hidden_size=hidden_size,
        num_hidden_layers=2,
        num_attention_heads=1,
        intermediate_size=hidden_size * 4,
        do_return_all_layers=False)
    outputs = tf.squeeze(tf.layers.dense(outputs, 1, activation=None), axis=-1)
    return outputs
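# --- Usage sketch (illustrative, not from the original source) ---
# Assumptions: `hidden_size` and `_rank_size` are module-level globals consumed
# by build_net above, the hard-coded list length of 20 must equal `_rank_size`,
# and the input is the flattened list with batch_size * _rank_size rows. The
# feature width of 24 is arbitrary.
hidden_size = 128
_rank_size = 20
reshape_list_vec = tf.placeholder(tf.float32, shape=[None, 24])
list_scores = build_net(reshape_list_vec, need_pos=True)  # shape [batch_size, 20]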
def __init__(self, config, input_embedding, input_mask=None):
    input_shape = modeling.get_shape_list(input_embedding, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    if input_mask is None:
        input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

    # Keep variable names the same as BERT.
    with tf.variable_scope("bert"):
        with tf.variable_scope("encoder"):
            attention_mask = modeling.create_attention_mask_from_input_mask(
                input_embedding, input_mask)
            all_encoder_layers = modeling.transformer_model(
                input_tensor=input_embedding,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            self.sequence_output = all_encoder_layers[-1]
def build_bert_model(self, input_ids, input_mask, token_type_ids):
    with tf.variable_scope('bert'):
        with tf.variable_scope("embeddings"):
            # Perform embedding lookup on the word ids.
            (embedding_output, _) = modeling.embedding_lookup(
                input_ids=input_ids,
                vocab_size=self.bert_config.vocab_size,
                embedding_size=self.bert_config.hidden_size,
                initializer_range=self.bert_config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=False)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            embedding_output = modeling.embedding_postprocessor(
                input_tensor=embedding_output,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=self.bert_config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=self.bert_config.initializer_range,
                max_position_embeddings=self.bert_config.max_position_embeddings,
                dropout_prob=self.bert_config.hidden_dropout_prob)

        with tf.variable_scope("encoder"):
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            attention_mask = modeling.create_attention_mask_from_input_mask(
                input_ids, input_mask)

            # Run the stacked transformer, only fetching the final layer.
            # `final_layer` shape = [batch_size, seq_length, hidden_size].
            self.all_encoder_layers = modeling.transformer_model(
                input_tensor=embedding_output,
                attention_mask=attention_mask,
                hidden_size=self.bert_config.hidden_size,
                num_hidden_layers=self.bert_config.num_hidden_layers,
                num_attention_heads=self.bert_config.num_attention_heads,
                intermediate_size=self.bert_config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(
                    self.bert_config.hidden_act),
                hidden_dropout_prob=self.bert_config.hidden_dropout_prob,
                attention_probs_dropout_prob=self.bert_config.attention_probs_dropout_prob,
                initializer_range=self.bert_config.initializer_range,
                do_return_all_layers=True)
            self.sequence_output = self.all_encoder_layers[-1]
def __init__(self,
             config,
             is_training,
             input_ids,
             image_embeddings,
             input_mask=None,
             token_type_ids=None,
             use_one_hot_embeddings=False,
             scope=None):
    """Constructor for a visually grounded BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      image_embeddings: float32 Tensor of shape [batch_size, seq_length, depth].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings.
      scope: (optional) variable scope. Defaults to "bert".

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    config = copy.deepcopy(config)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    text_input_shape = modeling.get_shape_list(input_ids, expected_rank=2)
    batch_size = text_input_shape[0]
    text_seq_length = text_input_shape[1]

    if input_mask is None:
        input_mask = tf.ones(shape=[batch_size, text_seq_length], dtype=tf.int32)
    if token_type_ids is None:
        token_type_ids = tf.zeros(shape=[batch_size, text_seq_length], dtype=tf.int32)

    with tf.variable_scope(scope, default_name="bert"):
        with tf.variable_scope("embeddings"):
            # Perform embedding lookup on the word ids.
            (self.embedding_output, self.embedding_table) = modeling.embedding_lookup(
                input_ids=input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=use_one_hot_embeddings)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.embedding_output = modeling.embedding_postprocessor(
                input_tensor=self.embedding_output,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)

            # Add image embeddings to the rest of the input embeddings.
            self.embedding_output += tf.layers.dense(
                image_embeddings,
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=modeling.create_initializer(
                    config.initializer_range))

        with tf.variable_scope("encoder"):
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            attention_mask = modeling.create_attention_mask_from_input_mask(
                self.embedding_output, input_mask)

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            self.all_encoder_layers = modeling.transformer_model(
                input_tensor=self.embedding_output,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            self.sequence_output = self.all_encoder_layers[-1]

        # The "pooler" converts the encoded sequence tensor of shape
        # [batch_size, seq_length, hidden_size] to a tensor of shape
        # [batch_size, hidden_size]. This is necessary for segment-level
        # (or segment-pair-level) classification tasks where we need a fixed
        # dimensional representation of the segment.
        with tf.variable_scope("pooler"):
            # We "pool" the model by simply taking the hidden state corresponding
            # to the first token. We assume that this has been pre-trained.
            first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
            self.pooled_output = tf.layers.dense(
                first_token_tensor,
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=modeling.create_initializer(
                    config.initializer_range))
def __init__(self, max_entity_num, max_sentence_num, max_relation_num,
             max_seq_length, entity_types, class_num, bert_config, hidden_size,
             hidden_layers, attention_heads, intermediate_size,
             hidden_dropout_prob, attention_probs_dropout_prob,
             graph_hidden_dropout_prob, graph_attention_probs_dropout_prob):
    max_node_num = max_sentence_num + max_entity_num

    self.input_ids = tf.placeholder(shape=[None, max_seq_length],
                                    dtype=tf.int32, name="input_ids")
    self.input_mask = tf.placeholder(shape=[None, max_seq_length],
                                     dtype=tf.int32, name="input_mask")
    self.segment_ids = tf.placeholder(shape=[None, max_seq_length],
                                      dtype=tf.int32, name="segment_ids")
    self.entity_types = tf.placeholder(shape=[None, max_node_num],
                                       dtype=tf.int32, name="entity_types")
    self.entity_mask = tf.placeholder(
        shape=[None, max_entity_num, max_seq_length],
        dtype=tf.float32, name="entity_mask")
    self.sentence_mask = tf.placeholder(
        shape=[None, max_sentence_num, max_seq_length],
        dtype=tf.float32, name="sentence_mask")
    self.relation_mask = tf.placeholder(shape=[None, max_relation_num],
                                        dtype=tf.float32, name="relation_mask")
    self.attention_mask = tf.placeholder(
        shape=[None, max_node_num, max_node_num],
        dtype=tf.float32, name="graph_mask")
    self.head_mask = tf.placeholder(
        shape=[None, max_relation_num, max_entity_num],
        dtype=tf.float32, name="head_mask")
    self.tail_mask = tf.placeholder(
        shape=[None, max_relation_num, max_entity_num],
        dtype=tf.float32, name="tail_mask")
    self.multi_labels = tf.placeholder(
        shape=[None, max_relation_num, class_num],
        dtype=tf.int32, name="multi_labels")
    self.is_training = tf.placeholder(dtype=tf.bool, name="is_training")

    self.hidden_dropout_prob = tf.cond(self.is_training,
                                       lambda: hidden_dropout_prob,
                                       lambda: 0.0)
    self.attention_probs_dropout_prob = tf.cond(
        self.is_training, lambda: attention_probs_dropout_prob, lambda: 0.0)
    self.graph_hidden_dropout_prob = tf.cond(
        self.is_training, lambda: graph_hidden_dropout_prob, lambda: 0.0)
    self.graph_attention_probs_dropout_prob = tf.cond(
        self.is_training, lambda: graph_attention_probs_dropout_prob, lambda: 0.0)

    # `entity_types` (the constructor argument) is the size of the entity type
    # vocabulary; `self.entity_types` is the per-node type id placeholder.
    self.entity_type_embedding = tf.get_variable(
        shape=[entity_types, 32], dtype=tf.float32,
        name="entity_type_embedding")
    self.entity_type_rep = tf.nn.embedding_lookup(self.entity_type_embedding,
                                                  self.entity_types)

    self.seq_rep = self.bert_encoder(bert_config, self.hidden_dropout_prob,
                                     self.attention_probs_dropout_prob,
                                     self.input_ids, self.input_mask,
                                     self.segment_ids)
    self.entity_rep = tf.matmul(self.entity_mask, self.seq_rep)
    self.sentence_rep = tf.matmul(self.sentence_mask, self.seq_rep)
    self.graph_rep = tf.concat([self.entity_rep, self.sentence_rep], axis=1)
    self.graph_rep = tf.concat([self.graph_rep, self.entity_type_rep], axis=-1)
    self.graph_rep = tf.layers.dense(self.graph_rep, hidden_size, tf.nn.relu)

    self.final_rep = modeling.transformer_model(
        input_tensor=self.graph_rep,
        attention_mask=self.attention_mask,
        hidden_size=hidden_size,
        num_hidden_layers=hidden_layers,
        num_attention_heads=attention_heads,
        intermediate_size=intermediate_size,
        hidden_dropout_prob=self.graph_hidden_dropout_prob,
        attention_probs_dropout_prob=self.graph_attention_probs_dropout_prob)

    self.entity_rep = self.final_rep[:, :max_entity_num]
    self.head_rep = tf.matmul(self.head_mask, self.entity_rep)
    self.tail_rep = tf.matmul(self.tail_mask, self.entity_rep)
    bi_hidden_size = self.head_rep.get_shape().as_list()[-1]
    self.logits = self.bilinear_function(self.head_rep, self.tail_rep,
                                         bi_hidden_size, class_num)
    self.sigmoid = tf.sigmoid(self.logits, name='sigmoid')
    self.entropy = tf.losses.sigmoid_cross_entropy(
        self.multi_labels, self.logits, reduction=tf.losses.Reduction.NONE)
    self.loss = tf.reduce_sum(
        tf.multiply(self.entropy,
                    tf.expand_dims(self.relation_mask, axis=-1))) / tf.reduce_sum(
                        self.relation_mask)
def __init__(self, config, is_training, input_tensor, input_mask, token_type_ids):
    """Constructor for BertFlexEmbeddingModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_tensor: float32 Tensor of shape [batch_size, seq_length, hidden_size].
      input_mask: int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: int32 Tensor of shape [batch_size, seq_length].

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    config = copy.deepcopy(config)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    with tf.variable_scope("bert", reuse=tf.compat.v1.AUTO_REUSE):
        with tf.variable_scope("embeddings"):
            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.embedding_output = modeling.embedding_postprocessor(
                input_tensor=input_tensor,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)

        with tf.variable_scope("encoder"):
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            attention_mask = modeling.create_attention_mask_from_input_mask(
                input_tensor, input_mask)

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            self.all_encoder_layers = modeling.transformer_model(
                input_tensor=self.embedding_output,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            self.sequence_output = self.all_encoder_layers[-1]

        # The "pooler" converts the encoded sequence tensor of shape
        # [batch_size, seq_length, hidden_size] to a tensor of shape
        # [batch_size, hidden_size]. This is necessary for segment-level
        # (or segment-pair-level) classification tasks where we need a fixed
        # dimensional representation of the segment.
        with tf.variable_scope("pooler"):
            # We "pool" the model by simply taking the hidden state corresponding
            # to the first token. We assume that this has been pre-trained.
            first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
            self.pooled_output = tf.layers.dense(
                first_token_tensor,
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=modeling.create_initializer(
                    config.initializer_range))
def create_mask_model(bert_config, is_training, input_ids, input_mask,
                      segment_ids, mask_positions, use_one_hot_embeddings):
    """Creates a classification model."""
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)

    # Get the logits for the start and end predictions.
    final_hidden = model.get_sequence_output()
    final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
    batch_size = final_hidden_shape[0]
    seq_length = final_hidden_shape[1]
    hidden_size = final_hidden_shape[2]

    output_weights = tf.get_variable(
        "cls/nq/output_weights", [2, hidden_size + 12],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable("cls/nq/output_bias", [2],
                                  initializer=tf.zeros_initializer())

    final_hidden_matrix = tf.reshape(final_hidden,
                                     [batch_size * seq_length, hidden_size])
    # Append the mask-position indicator, zero-padded from width 1 to width 12,
    # so the extended hidden size becomes hidden_size + 12.
    mask_positions_matrix = tf.cast(
        tf.reshape(mask_positions, [batch_size * seq_length, 1]),
        dtype=tf.float32)
    padding = tf.zeros([batch_size * seq_length, 11], dtype=tf.float32)
    mask_positions_matrix = tf.concat([mask_positions_matrix, padding], axis=-1)
    final_hidden_matrix = tf.concat([final_hidden_matrix, mask_positions_matrix],
                                    axis=-1)
    final_hidden_matrix = tf.reshape(final_hidden_matrix,
                                     [batch_size, seq_length, hidden_size + 12])

    attention_mask = modeling.create_attention_mask_from_input_mask(
        input_ids, input_mask)
    config = bert_config
    all_encoder_layers = modeling.transformer_model(
        input_tensor=final_hidden_matrix,
        attention_mask=attention_mask,
        hidden_size=config.hidden_size + 12,  # input hidden size
        num_hidden_layers=1,  # config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(config.hidden_act),
        hidden_dropout_prob=config.hidden_dropout_prob,
        attention_probs_dropout_prob=config.attention_probs_dropout_prob,
        initializer_range=config.initializer_range,
        do_return_all_layers=True)

    transformer_output_matrix = all_encoder_layers[-1]
    transformer_output_matrix = tf.reshape(
        transformer_output_matrix, [batch_size * seq_length, hidden_size + 12])

    logits = tf.matmul(transformer_output_matrix, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    logits = tf.reshape(logits, [batch_size, seq_length, 2])
    logits = tf.transpose(logits, [2, 0, 1])
    unstacked_logits = tf.unstack(logits, axis=0)
    (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1])

    # Get the logits for the answer type prediction.
    answer_type_output_layer = model.get_pooled_output()
    answer_type_hidden_size = answer_type_output_layer.shape[-1].value

    num_answer_types = 5  # YES, NO, UNKNOWN, SHORT, LONG
    answer_type_output_weights = tf.get_variable(
        "answer_type_output_weights",
        [num_answer_types, answer_type_hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    answer_type_output_bias = tf.get_variable(
        "answer_type_output_bias", [num_answer_types],
        initializer=tf.zeros_initializer())
    answer_type_logits = tf.matmul(answer_type_output_layer,
                                   answer_type_output_weights,
                                   transpose_b=True)
    answer_type_logits = tf.nn.bias_add(answer_type_logits,
                                        answer_type_output_bias)

    return (start_logits, end_logits, answer_type_logits)
def __init__(self, config, features, _rank_size, trainable=True, scope="train",
             batch_size=None, training=True):
    rnn_hidden_size = 128
    hidden_size = 128
    go_emb = tf.get_variable(
        shape=[rnn_hidden_size],
        dtype=tf.float32,
        initializer=tf.initializers.truncated_normal(mean=0.0, stddev=0.01),
        name="go_embedding")

    encoder_gru_fn = tf.keras.layers.GRU(rnn_hidden_size,
                                         return_state=True,
                                         return_sequences=True)
    cand_vec = features['features']
    cand_len = tf.string_to_number(features['features_mask'], out_type=tf.int32)
    cand_mask = tf.sequence_mask(cand_len, utils.seq_max_len(config, 'features'))
    encoder_vec, _ = encoder_gru_fn(cand_vec, mask=cand_mask)

    pos_embedding = tf.get_variable(
        shape=[_rank_size, hidden_size],
        dtype=tf.float32,
        initializer=tf.initializers.truncated_normal(mean=0.0, stddev=0.01),
        trainable=True,
        name="pos_embedding")
    hidden_cand_vec = tf.layers.dense(cand_vec, hidden_size)
    hidden_cand_vec = hidden_cand_vec + pos_embedding

    # Run the stacked transformer.
    # `sequence_output` shape = [batch_size, seq_length, hidden_size].
    encoder_vec = modeling.transformer_model(
        input_tensor=hidden_cand_vec,
        hidden_size=hidden_size,
        num_hidden_layers=2,
        num_attention_heads=1,
        intermediate_size=hidden_size * 4,
        do_return_all_layers=False)
    self._point_vec = encoder_vec

    decoder_gru_fn = tf.keras.layers.GRU(rnn_hidden_size, return_state=True)
    list_vec = features['selected_vec']  # [B, seq_len, dense_dim]
    list_vec = tf.layers.dense(list_vec, hidden_size, activation=tf.nn.relu,
                               name='dense1', reuse=tf.AUTO_REUSE)
    list_vec = tf.layers.dense(list_vec, hidden_size, activation=None,
                               name='dense2', reuse=tf.AUTO_REUSE)
    seq_len = tf.string_to_number(features['selected_vec_mask'], out_type=tf.int32)
    seq_mask = tf.sequence_mask(seq_len, utils.seq_max_len(config, 'features'))
    selected_len = tf.string_to_number(features['selected_vec_mask'],
                                       out_type=tf.float32)
    not_first = tf.minimum(selected_len, 1.0)

    # outputs: [B, seq_len, embedding_size]
    _, state = decoder_gru_fn(list_vec, mask=seq_mask)
    go_vec = tf.matmul(tf.expand_dims(1 - not_first, axis=1),
                       tf.expand_dims(go_emb, axis=0))
    query_vec = go_vec + tf.expand_dims(not_first, axis=1) * state

    def output_fn(_query_vec, _point_vec):
        atten_vec = tf.expand_dims(_query_vec, axis=1) * _point_vec
        logits = tf.reduce_sum(atten_vec, axis=-1)
        rank_mask = features["rank_mask"]
        # Mask logits of invalid positions with a large negative offset.
        neg_mask = rank_mask - tf.ones(shape=[1, _rank_size], dtype=tf.float32)
        neg_mask = neg_mask * 1000
        action_distribution = tf.nn.softmax(logits + neg_mask)
        return action_distribution

    self._action_distribution = output_fn(query_vec, encoder_vec)

    if 'point_vec' in features and 'last_state' in features and 'last_vec' in features:
        inc_seq_len = tf.minimum(seq_len, 1)
        inc_seq_mask = tf.sequence_mask(inc_seq_len, 1)
        last_vec = features['last_vec']
        last_vec = tf.layers.dense(last_vec, hidden_size, activation=tf.nn.relu,
                                   name='dense1', reuse=tf.AUTO_REUSE)
        last_vec = tf.layers.dense(last_vec, hidden_size, activation=None,
                                   name='dense2', reuse=tf.AUTO_REUSE)
        _, next_state = decoder_gru_fn(tf.expand_dims(last_vec, axis=1),
                                       initial_state=features['last_state'],
                                       mask=inc_seq_mask)
        inc_query_vec = go_vec + tf.expand_dims(not_first, axis=1) * next_state
        self._inc_action_distribution = output_fn(inc_query_vec,
                                                  features['point_vec'])
        self._next_state = next_state
def main(args):
    bert_config = modeling.BertConfig.from_json_file(args.config)
    bert_config.hidden_dropout_prob = 0.0
    bert_config.attention_probs_dropout_prob = 0.0

    batch_size = args.batch_size
    avg_seq_len = args.avg_seq_length
    max_seq_len = args.max_seq_length
    tf_dtype = tf.float16 if args.precision == 'fp16' else tf.float32

    # fake input array length
    input_len = np.random.randint(low=2 * avg_seq_len - max_seq_len,
                                  high=max_seq_len + 1,
                                  size=(batch_size),
                                  dtype=np.int32)
    valid_word_num = sum(input_len)

    # fake input id and mask
    input_ids = np.random.randint(low=0,
                                  high=bert_config.vocab_size,
                                  size=(batch_size, max_seq_len),
                                  dtype=np.int32)
    input_mask = np.zeros((batch_size, max_seq_len), dtype=np.int32)
    for b_idx, s_len in enumerate(input_len):
        input_mask[b_idx][:s_len] = 1

    input_ids_tensor = tf.convert_to_tensor(input_ids, dtype=tf.int32)
    input_mask_tensor = tf.convert_to_tensor(input_mask, dtype=tf.int32)

    # fake embedding output
    embed_output = np.random.randn(batch_size, max_seq_len, bert_config.hidden_size)
    input_tensor = tf.convert_to_tensor(embed_output, dtype=tf_dtype)

    # keep attention_mask for compatibility
    att_mask = np.tile(input_mask, max_seq_len)
    att_mask = att_mask.reshape(batch_size, max_seq_len, max_seq_len)
    attention_mask = tf.convert_to_tensor(att_mask, dtype=tf_dtype)

    # input info
    print("Valid word num : {}/{}, avg sequence length : {:.6} ".format(
        valid_word_num, batch_size * max_seq_len, valid_word_num / batch_size))

    # bert with standard transformer
    std_bert = modeling.transformer_model(
        input_tensor=input_tensor,
        attention_mask=attention_mask,
        hidden_size=bert_config.hidden_size,
        num_hidden_layers=bert_config.num_hidden_layers,
        num_attention_heads=bert_config.num_attention_heads,
        intermediate_size=bert_config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(bert_config.hidden_act),
        hidden_dropout_prob=bert_config.hidden_dropout_prob,
        attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
        initializer_range=bert_config.initializer_range,
        do_return_all_layers=False)

    config = tf.ConfigProto()
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    with tf.Session(config=config) as sess:
        # init weights
        sess.run(tf.global_variables_initializer())

        # get transformer weights
        all_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        transformer_vars = [v for v in all_vars if v.name.startswith('layer')]
        weights_value = sess.run(transformer_vars)

        # bert with effective transformer
        et_bert = effective_transformer.get_sequence_output(
            max_batch_size=batch_size,
            max_seq_length=max_seq_len,
            config=bert_config,
            attention_mask=attention_mask,
            input_mask=input_mask_tensor,
            from_tensor=input_tensor,
            weights_value=weights_value,
        )

        # diff between the two implementations on the valid (non-padding) words
        val1 = sess.run(std_bert).reshape(-1, bert_config.hidden_size)
        val2 = sess.run(et_bert).reshape(-1, bert_config.hidden_size)
        diff = []
        for b_idx, s_len in enumerate(input_len):
            for w_idx in range(s_len):
                idx = b_idx * args.max_seq_length + w_idx
                diff.append(np.fabs(val1[idx] - val2[idx]).max())
        print("max diff : {:.6}, avg diff : {:.6}.".format(
            max(diff), sum(diff) / len(diff)))

        def time_inference(output_tensor):
            iter_num = 128
            # warm up
            for i in range(10):
                sess.run(output_tensor)
            beg = datetime.now()
            for i in range(iter_num):
                sess.run(output_tensor)
            end = datetime.now()
            return (end - beg).total_seconds() * 1000 / iter_num  # ms

        print("xla cost : {:.6} ms".format(time_inference(std_bert)))
        print("et cost : {:.6} ms".format(time_inference(et_bert)))
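# --- Illustrative CLI wiring for main() (not part of the original snippet) ---
# Flag names simply mirror the attributes read from `args` above; the defaults
# are arbitrary.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, required=True)
    parser.add_argument('--batch_size', type=int, default=8)
    parser.add_argument('--avg_seq_length', type=int, default=32)
    parser.add_argument('--max_seq_length', type=int, default=64)
    parser.add_argument('--precision', type=str, default='fp32',
                        choices=['fp32', 'fp16'])
    main(parser.parse_args())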
def __init__(self, config, use_one_hot_embeddings=True, num_labels=2,
             max_seq_length=128):
    """Constructor for BertModel.

    Args:
      config: `BertConfig` instance.
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings. On the TPU,
        it is much faster if this is True, on the CPU or GPU, it is faster if
        this is False.
      num_labels: (optional) int. Number of classification labels.
      max_seq_length: (optional) int. Sequence length of the input placeholders.

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    self.input_ids = tf.placeholder(dtype=tf.int32, shape=(None, max_seq_length))
    self.input_mask = tf.placeholder(dtype=tf.int8, shape=(None, max_seq_length))

    config = copy.deepcopy(config)
    input_shape = modeling.get_shape_list(self.input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    with tf.variable_scope("bert", reuse=tf.AUTO_REUSE):
        with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
            # Perform embedding lookup on the word ids.
            (self.embedding_output, self.embedding_table) = modeling.embedding_lookup(
                input_ids=self.input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=use_one_hot_embeddings)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.embedding_output = modeling.embedding_postprocessor(
                input_tensor=self.embedding_output,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)

        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            attention_mask = modeling.create_attention_mask_from_input_mask(
                self.input_ids, self.input_mask)

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            self.all_encoder_layers = modeling.transformer_model(
                input_tensor=self.embedding_output,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            self.sequence_output = self.all_encoder_layers[-1]

        # The "pooler" converts the encoded sequence tensor of shape
        # [batch_size, seq_length, hidden_size] to a tensor of shape
        # [batch_size, hidden_size]. This is necessary for segment-level
        # (or segment-pair-level) classification tasks where we need a fixed
        # dimensional representation of the segment.
        with tf.variable_scope("pooler", reuse=tf.AUTO_REUSE):
            # We "pool" the model by simply taking the hidden state corresponding
            # to the first token. We assume that this has been pre-trained.
            first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
            self.pooled_output = tf.layers.dense(
                first_token_tensor,
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=modeling.create_initializer(
                    config.initializer_range))

    # define output_weights and output_bias
    hidden_size = self.pooled_output.shape[-1].value
    with tf.variable_scope("", reuse=tf.AUTO_REUSE):
        self.output_weights = tf.get_variable(
            "output_weights", [num_labels, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        self.output_bias = tf.get_variable(
            "output_bias", [num_labels],
            initializer=tf.zeros_initializer())
def create_bilstm_classification_model(
        bert_config, is_training, response_input_ids, response_input_mask,
        response_segment_ids, response_text_len, response_labels,
        random_forward_input_ids, random_forward_input_mask,
        random_forward_segment_ids, random_forward_text_len,
        random_backward_input_ids, random_backward_input_mask,
        random_backward_segment_ids, random_backward_text_len, random_labels,
        swap_forward_input_ids, swap_forward_input_mask,
        swap_forward_segment_ids, swap_forward_text_len,
        swap_backward_input_ids, swap_backward_input_mask,
        swap_backward_segment_ids, swap_backward_text_len, swap_labels,
        nli_forward_input_ids, nli_forward_input_mask, nli_forward_segment_ids,
        nli_forward_text_len, nli_backward_input_ids, nli_backward_input_mask,
        nli_backward_segment_ids, nli_backward_text_len, nli_labels,
        num_nli_labels, use_one_hot_embeddings, l2_reg_lambda=0.1,
        dropout_rate=1.0, lstm_size=None, num_layers=1):
    config = copy.deepcopy(bert_config)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    with tf.variable_scope("bert", reuse=tf.AUTO_REUSE):
        with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
            (response_embedding_output, response_embedding_table) = modeling.embedding_lookup(
                input_ids=response_input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=use_one_hot_embeddings)
            response_embedding_output = modeling.embedding_postprocessor(
                input_tensor=response_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=response_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # random detection
            # Perform embedding lookup on the word ids.
            (random_foward_embedding_output, random_forward_embedding_table) = modeling.embedding_lookup(
                input_ids=random_forward_input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=use_one_hot_embeddings)
            (random_backward_embedding_output, random_backward_embedding_table) = modeling.embedding_lookup(
                input_ids=random_backward_input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=use_one_hot_embeddings)
            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            random_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=random_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=random_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)
            random_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=random_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=random_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # swap detection
            (swap_foward_embedding_output, swap_forward_embedding_table) = modeling.embedding_lookup(
                input_ids=swap_forward_input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=use_one_hot_embeddings)
            (swap_backward_embedding_output, swap_backward_embedding_table) = modeling.embedding_lookup(
                input_ids=swap_backward_input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=use_one_hot_embeddings)
            swap_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=swap_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=swap_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)
            swap_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=swap_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=swap_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # generic detection (disabled): the original code carries a
            # commented-out branch that mirrors the random/swap lookups and
            # postprocessors above with generic_* inputs.

            # nli detection
            (nli_foward_embedding_output, nli_forward_embedding_table) = modeling.embedding_lookup(
                input_ids=nli_forward_input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=use_one_hot_embeddings)
            (nli_backward_embedding_output, nli_backward_embedding_table) = modeling.embedding_lookup(
                input_ids=nli_backward_input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=use_one_hot_embeddings)
            nli_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=nli_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=nli_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)
            nli_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=nli_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=nli_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            response_attention_mask = modeling.create_attention_mask_from_input_mask(
                response_input_ids, response_input_mask)
            # [batch_size, from_seq_length, to_seq_length]
            # mask future tokens
            diag_vals = tf.ones_like(response_attention_mask[0, :, :])
            tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()
            future_masks = tf.tile(tf.expand_dims(tril, 0),
                                   [tf.shape(response_attention_mask)[0], 1, 1])
            response_attention_mask = tf.math.multiply(response_attention_mask,
                                                       future_masks)

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            response_all_encoder_layers = modeling.transformer_model(
                input_tensor=response_embedding_output,
                attention_mask=response_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # random detection
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            random_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                random_forward_input_ids, random_forward_input_mask)
            random_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                random_backward_input_ids, random_backward_input_mask)
            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            random_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=random_foward_embedding_output,
                attention_mask=random_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            random_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=random_backward_embedding_output,
                attention_mask=random_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # swap detection
            swap_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                swap_forward_input_ids, swap_forward_input_mask)
            swap_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                swap_backward_input_ids, swap_backward_input_mask)
            swap_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=swap_foward_embedding_output,
                attention_mask=swap_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            swap_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=swap_backward_embedding_output,
                attention_mask=swap_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # generic detection (disabled): the commented-out branch mirrors the
            # random/swap attention masks and transformer calls above with
            # generic_* inputs.

            # nli detection
            nli_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                nli_forward_input_ids, nli_forward_input_mask)
            nli_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                nli_backward_input_ids, nli_backward_input_mask)
            nli_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=nli_foward_embedding_output,
                attention_mask=nli_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            nli_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=nli_backward_embedding_output,
                attention_mask=nli_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

    # Take the second-to-last encoder layer for each task.
    random_forward_embedding = random_forward_all_encoder_layers[-2]
    random_backward_embedding = random_backward_all_encoder_layers[-2]
    swap_forward_embedding = swap_forward_all_encoder_layers[-2]
    swap_backward_embedding = swap_backward_all_encoder_layers[-2]
    # generic_forward_embedding = generic_forward_all_encoder_layers[-2]
    # generic_backward_embedding = generic_backward_all_encoder_layers[-2]
    nli_forward_embedding = nli_forward_all_encoder_layers[-2]
    nli_backward_embedding = nli_backward_all_encoder_layers[-2]
    response_embedding = response_all_encoder_layers[-2]

    response_embedding_shape = modeling.get_shape_list(response_embedding,
                                                       expected_rank=3)
    with tf.variable_scope("lm_head", reuse=tf.AUTO_REUSE):
        response_logits = tf.layers.dense(response_embedding, config.hidden_size,
                                          activation=None)
        response_logits = modeling.gelu(response_logits)
        response_logits = modeling.layer_norm(response_logits)
        response_outputs = tf.layers.dense(
            response_logits, config.vocab_size, activation=None, use_bias=True,
            bias_initializer=tf.zeros_initializer())

        response_one_hot = tf.one_hot(response_labels,
                                      depth=config.vocab_size,
                                      dtype=tf.float32)
        lm_cost = tf.nn.softmax_cross_entropy_with_logits(
            labels=response_one_hot, logits=response_outputs)
        sequence_mask = tf.sequence_mask(response_text_len,
                                         maxlen=response_embedding_shape[1],
                                         dtype=tf.float32)
        masked_lm_cost = tf.math.multiply(lm_cost, sequence_mask)
        final_lm_loss = tf.reduce_mean(
            tf.math.divide(tf.reduce_sum(masked_lm_cost, axis=1),
                           tf.cast(response_text_len, dtype=tf.float32)))
        perplexity = tf.exp(
            tf.math.divide(tf.reduce_sum(masked_lm_cost, axis=1),
                           tf.cast(response_text_len, dtype=tf.float32)))

    random_forward_embedding_shape = modeling.get_shape_list(
        random_forward_embedding, expected_rank=3)
    random_backward_embedding_shape = modeling.get_shape_list(
        random_backward_embedding, expected_rank=3)
    assert random_forward_embedding_shape[2] == random_backward_embedding_shape[2]
    random_forward_embedding = tf.transpose(random_forward_embedding, [1, 0, 2])
    random_backward_embedding = tf.transpose(random_backward_embedding, [1, 0, 2])
    random_forward_input_mask = tf.cast(
        tf.transpose(random_forward_input_mask, [1, 0]), tf.float32)
    random_backward_input_mask = tf.cast(
        tf.transpose(random_backward_input_mask, [1, 0]), tf.float32)

    swap_forward_embedding_shape = modeling.get_shape_list(
        swap_forward_embedding, expected_rank=3)
    swap_backward_embedding_shape = modeling.get_shape_list(
        swap_backward_embedding, expected_rank=3)
    assert swap_forward_embedding_shape[2] == swap_backward_embedding_shape[2]
    swap_forward_embedding = tf.transpose(swap_forward_embedding, [1, 0, 2])
    swap_backward_embedding = tf.transpose(swap_backward_embedding, [1, 0, 2])
    swap_forward_input_mask = tf.cast(
        tf.transpose(swap_forward_input_mask, [1, 0]), tf.float32)
    swap_backward_input_mask = tf.cast(
        tf.transpose(swap_backward_input_mask, [1, 0]), tf.float32)

    # generic detection (disabled): shape checks and transposes for the
    # generic_* embeddings would mirror the random/swap handling above.

    nli_forward_embedding_shape = modeling.get_shape_list(
        nli_forward_embedding, expected_rank=3)
    nli_backward_embedding_shape = modeling.get_shape_list(
        nli_backward_embedding, expected_rank=3)
    assert nli_forward_embedding_shape[2] == nli_backward_embedding_shape[2]
    nli_forward_embedding = tf.transpose(nli_forward_embedding, [1, 0, 2])
    nli_backward_embedding = tf.transpose(nli_backward_embedding, [1, 0, 2])
    nli_forward_input_mask = tf.cast(
        tf.transpose(nli_forward_input_mask, [1, 0]), tf.float32)
    nli_backward_input_mask = tf.cast(
        tf.transpose(nli_backward_input_mask, [1, 0]), tf.float32)

    model = HadeModel(
        x_random_forward=random_forward_embedding,
        x_random_mask_forward=random_forward_input_mask,
        x_random_length_forward=random_forward_text_len,
        x_random_backward=random_backward_embedding,
        x_random_mask_backward=random_backward_input_mask,
        x_random_length_backward=random_backward_text_len,
        y_random=random_labels,
        x_swap_forward=swap_forward_embedding,
        x_swap_mask_forward=swap_forward_input_mask,
        x_swap_length_forward=swap_forward_text_len,
        x_swap_backward=swap_backward_embedding,
        x_swap_mask_backward=swap_backward_input_mask,
        x_swap_length_backward=swap_backward_text_len,
        y_swap=swap_labels,
        # x_generic_forward=generic_forward_embedding,
        # x_generic_mask_forward=generic_forward_input_mask,
        # x_generic_length_forward=generic_forward_text_len,
        # x_generic_backward=generic_backward_embedding,
        # x_generic_mask_backward=generic_backward_input_mask,
        # x_generic_length_backward=generic_backward_text_len,
        # y_generic=generic_labels,  # disabled with the rest of the generic
        #                            # branch; `generic_labels` is not an
        #                            # argument of this function.
        x_nli_forward=nli_forward_embedding,
        x_nli_mask_forward=nli_forward_input_mask,
        x_nli_length_forward=nli_forward_text_len,
        x_nli_backward=nli_backward_embedding,
        x_nli_mask_backward=nli_backward_input_mask,
        x_nli_length_backward=nli_backward_text_len,
        y_nli=nli_labels,
        embedding_dim=random_forward_embedding_shape[2],
        num_nli_labels=num_nli_labels,
        hidden_size=lstm_size,
        l2_reg_lambda=l2_reg_lambda,
        num_layers=num_layers,
        dropout_rate=dropout_rate,
        is_training=is_training)

    random_prob, swap_prob, nli_prob, total_cost = model.create_model()
    return random_prob, swap_prob, nli_prob, total_cost, final_lm_loss, perplexity
def aggregate_embedding(embeddings, segment_idx, aggregator, config=None,
                        aux=None, name=None):
    # segment_idx denotes different needles, rather than rows.
    if aggregator == 'segment_sqrt_n':
        denom = to_col(
            tf.sqrt(
                tf.to_float(tf.segment_sum(tf.ones_like(segment_idx),
                                           segment_idx))))
        output_layer = tf.div_no_nan(tf.segment_sum(embeddings, segment_idx),
                                     denom, name=name)
    elif aggregator in ['segment_sum', 'segment_mean']:
        output_layer = getattr(tf, aggregator)(embeddings, segment_idx, name=name)
    else:
        del embeddings
        assert aggregator.startswith('transformer')
        # Optional overrides are encoded in the aggregator string after '^' as
        # comma-separated key@value pairs (parsed below with eval).
        flags = {}
        if '^' in aggregator:
            flags = [
                kv.split('@')
                for kv in filter(None, aggregator.split('^')[1].split(','))
            ]
            flags = {k: eval(v) for k, v in flags}
        assert config is not None and aux is not None
        needle_pos = aux['needle_pos']
        embedding_output = aux['sequence_output']
        batch_idx2 = aux['batch_idx2']  # different rows.
        is_training = aux['is_training']
        attention_mask = get_dense_mask(needle_pos, batch_idx2,
                                        tf.shape(embedding_output)[:2])
        with tf.variable_scope('final_transformer'):
            all_encoder_layers = modeling.transformer_model(
                input_tensor=embedding_output,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,  # must agree with the input width.
                num_hidden_layers=flags.get('num_hidden_layers', 1),
                num_attention_heads=flags.get('num_attention_heads',
                                              config.num_attention_heads),
                intermediate_size=flags.get('intermediate_size',
                                            config.intermediate_size),
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=flags.get(
                    'hidden_dropout_prob',
                    config.hidden_dropout_prob) * int(is_training),
                attention_probs_dropout_prob=int(is_training) * flags.get(
                    'attention_probs_dropout_prob',
                    config.attention_probs_dropout_prob),
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
        first_token_tensor = all_encoder_layers[-1][:, 0, :]
        output_layer = tf.layers.dense(
            first_token_tensor,
            config.hidden_size,
            activation=tf.tanh,
            kernel_initializer=modeling.create_initializer(
                config.initializer_range))
    return output_layer
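# --- Minimal check of the segment aggregators (illustrative values, not from
# the original source) ---
# Three needle embeddings in two segments; 'segment_mean' averages rows 0-1
# and keeps row 2 as its own segment.
example_embeddings = tf.constant([[1.0, 1.0], [3.0, 3.0], [4.0, 4.0]])
example_segment_idx = tf.constant([0, 0, 1])
pooled = aggregate_embedding(example_embeddings, example_segment_idx,
                             'segment_mean')  # -> [[2.0, 2.0], [4.0, 4.0]]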