def __init__(self, config, is_training, input_ids, input_mask=None,
             token_type_ids=None, use_one_hot_embeddings=False, scope=None):
    """Constructor for BertEncoder.

    Args:
        token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length],
            possibly used for segment/positional embeddings in the encoder. TODO
    """
    # Work on a deep copy so the original configuration is not modified.
    config = copy.deepcopy(config)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    input_shape = ft.get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:
        input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)
    if token_type_ids is None:
        token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    self.build_graph(config, input_ids, input_mask, token_type_ids, use_one_hot_embeddings)
def vae(state, num_units, scope='vae'):
    """VAE reparameterization layer (hard-coded).

    The corresponding KL term of the VAE loss is:
        vae_loss = (-0.5 * tf.reduce_sum(
            1.0 + vae_vb - tf.square(vae_mean) - tf.exp(vae_vb)) / batch_size) * 0.001
    """
    shape = ft.get_shape_list(state, expected_rank=2)
    with tf.variable_scope(scope):
        # Mean and log-variance of the approximate posterior q(z | state).
        vae_mean = tf.layers.dense(state, num_units, activation=tf.nn.tanh,
                                   name='vae_mean',
                                   kernel_initializer=ft.create_initializer())
        vae_vb = tf.layers.dense(state, num_units, activation=tf.nn.tanh,
                                 name='vae_vb',
                                 kernel_initializer=ft.create_initializer())
        # Reparameterization trick: z = mean + std * eps, with eps ~ N(0, 1).
        eps = tf.random_normal([shape[0], num_units], 0.0, 1.0, dtype=tf.float32)
        z = vae_mean + tf.sqrt(tf.exp(vae_vb)) * eps
    return z, vae_mean, vae_vb
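# NOTE: a minimal sketch (not part of the original code) of the KL term quoted in the
# docstring above, assuming `vae_vb` is the log-variance of q(z | state). The 0.001
# weight and the batch-size normalization follow the hard-coded formula; the function
# name and `kl_weight` argument are illustrative only.
def vae_kl_loss(vae_mean, vae_vb, kl_weight=0.001):
    """Returns the weighted KL(q(z|x) || N(0, I)) term of the VAE objective."""
    batch_size = tf.cast(tf.shape(vae_mean)[0], tf.float32)
    kl = -0.5 * tf.reduce_sum(1.0 + vae_vb - tf.square(vae_mean) - tf.exp(vae_vb))
    return kl / batch_size * kl_weight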
def __init__(self, config, is_training, encoder_state, embedding_table,
             decoder_intput_data=None, seq_length_decoder_input_data=None, scope=None):
    # Work on a deep copy so the original configuration is not modified.
    config = copy.deepcopy(config)
    self.is_training = is_training
    self.embedding_table = embedding_table

    input_shape = ft.get_shape_list(encoder_state, expected_rank=2)
    self.batch_size = input_shape[0]

    self.tgt_vocab_size = config.tgt_vocab_size
    self.unit_type = config.unit_type
    self.num_units = config.num_units
    self.forget_bias = config.forget_bias
    if not is_training:
        self.dropout = 0.0
    else:
        self.dropout = config.dropout
    initializer_range = config.initializer_range

    self.tgt_sos_id = tf.constant(config.sos_id, dtype=tf.int32)
    self.tgt_eos_id = tf.constant(config.eos_id, dtype=tf.int32)
    self.max_len_infer = config.max_len_infer

    self.build_graph(encoder_state, initializer_range,
                     seq_length_decoder_input_data, decoder_intput_data, scope)
def embedding_postprocessor(input_tensor,
                            use_positional_embeddings=True,
                            positional_embedding_name='positional_embeddings',
                            initializer_range=0.02,
                            max_positional_embeddings=512,
                            dropout_prob=0.1):
    """Adds positional embeddings to a word embedding tensor.

    Args:
        input_tensor: float Tensor of shape [batch_size, seq_length, embedding_size].
        use_positional_embeddings: bool. Whether to add positional embeddings for
            the position of each token in the sequence.
        positional_embedding_name: string. The name of the embedding table variable
            for positional embeddings.
        initializer_range: float. Range of the weight initialization.
        max_positional_embeddings: int. Maximum sequence length that might ever be
            used with this model. This can be longer than the sequence length of
            input_tensor, but cannot be shorter.
        dropout_prob: float. Dropout probability applied to the final output tensor.

    Returns:
        float Tensor with the same shape as `input_tensor`.
    """
    input_shape = ft.get_shape_list(input_tensor, expected_rank=3)
    seq_length = input_shape[1]
    width = input_shape[2]

    # Default to the input itself so the layer norm below always has an operand.
    output = input_tensor

    if use_positional_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_positional_embeddings)
        with tf.control_dependencies([assert_op]):
            full_positional_embeddings = tf.get_variable(
                name=positional_embedding_name,
                shape=[max_positional_embeddings, width],
                initializer=ft.create_initializer(initializer_range=initializer_range))
            # [seq_length, width]
            positional_embeddings = tf.slice(
                full_positional_embeddings, [0, 0], [seq_length, -1])
            # [1, seq_length, width], broadcast over the batch dimension.
            positional_embeddings = tf.expand_dims(positional_embeddings, [0])
            output = input_tensor + positional_embeddings

    output = ft.layer_norm_and_dropout(output, dropout_prob)
    return output
def gather_indexs(sequence_output, sentiment_mask_indices):
    """Gathers the hidden vectors at `sentiment_mask_indices` from each sequence."""
    shape = ft.get_shape_list(sequence_output, expected_rank=3)
    batch_size = shape[0]
    seq_length = shape[1]
    width = shape[2]

    # [b, 1], offset of each sequence inside the flattened batch.
    flat_offsets = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    # [b * x], absolute positions inside the flattened batch.
    flat_positions = tf.reshape(flat_offsets + sentiment_mask_indices, [-1])
    # [b * s, w]
    flat_sequence_output = tf.reshape(sequence_output, [batch_size * seq_length, width])
    # [b * x, w]
    output_tensor = tf.gather(flat_sequence_output, flat_positions)
    return output_tensor
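# NOTE: a small illustrative sketch (not part of the original code) of the flat-offset
# trick used in `gather_indexs`: positions inside each sequence are shifted by
# `batch_index * seq_length` so a single `tf.gather` over the flattened [b * s, w]
# tensor picks out the requested tokens. All shapes and values here are toy examples.
def gather_indexs_example():
    # 2 sequences of length 3, hidden width 2.
    sequence_output = tf.reshape(tf.range(12, dtype=tf.float32), [2, 3, 2])
    # Pick tokens 0 and 2 from the first sequence, tokens 1 and 2 from the second.
    sentiment_mask_indices = tf.constant([[0, 2], [1, 2]], dtype=tf.int32)
    # Flat positions become [0, 2, 4, 5]: rows of the flattened [6, 2] tensor.
    return gather_indexs(sequence_output, sentiment_mask_indices)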
def calculate_mse_loss(model_output, true_label, true_sequence):
    """Calculates the masked MSE loss.

    Args:
        model_output: (batch_size * seq_length, mask_padding_size).
        true_label: (batch_size, seq_length, mask_padding_size).
        true_sequence: (batch_size * seq_length * mask_padding_size,), a 0/1 mask
            that zeroes out the padded positions.

    Returns:
        mse_loss: tf.float32 scalar.
    """
    batch_size = tf.cast(ft.get_shape_list(model_output, expected_rank=2)[0],
                         dtype=tf.float32)
    # Flatten the tensors so they align with `true_sequence`.
    model_output_flatten = tf.reshape(model_output, [-1])
    true_label_flatten = tf.reshape(true_label, [-1])
    # Actual number of unmasked positions; the masked positions are excluded from
    # the squared error below.
    length = tf.reduce_sum(true_sequence)
    mse_loss = tf.reduce_sum(
        tf.pow((model_output_flatten - true_label_flatten), 2) * true_sequence) / batch_size
    return mse_loss
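# NOTE: a toy usage sketch (not part of the original code) for `calculate_mse_loss`,
# assuming `true_sequence` is a flat 0/1 float mask where zeros mark padded slots
# that contribute nothing to the squared error. Shapes and values are made up.
def calculate_mse_loss_example():
    # batch_size = 1, seq_length = 2, mask_padding_size = 3.
    model_output = tf.constant([[0.5, 0.0, 0.0],
                                [0.2, 0.3, 0.0]], dtype=tf.float32)   # [b * s, m]
    true_label = tf.constant([[[1.0, 0.0, 0.0],
                               [0.0, 1.0, 0.0]]], dtype=tf.float32)   # [b, s, m]
    # Only the first two slots of each position are real; the rest are padding.
    true_sequence = tf.constant([1., 1., 0., 1., 1., 0.], dtype=tf.float32)
    return calculate_mse_loss(model_output, true_label, true_sequence)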
def embedding_lookup(input_ids, vocab_size, embedding_size, initializer_range,
                     word_embedding_name='word_embeddings',
                     use_one_hot_embeddings=False):
    """Looks up word embeddings for an id tensor.

    Args:
        input_ids: int32 Tensor of shape [batch_size, seq_length] containing word ids.
        vocab_size: int. Size of the embedding vocabulary.
        embedding_size: int. Width of the word embeddings.
        initializer_range: float. Embedding initialization range.
        word_embedding_name: string. Name of the embedding table.
        use_one_hot_embeddings: bool. If True, use the one-hot method for the word
            embedding lookup. If False, use `tf.nn.embedding_lookup()`.

    Returns:
        float Tensor of shape [batch_size, seq_length, embedding_size], together
        with the embedding table itself.
    """
    embedding_table = tf.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        initializer=ft.create_initializer(initializer_range=initializer_range))

    if use_one_hot_embeddings:
        input_shape = ft.get_shape_list(input_ids, expected_rank=2)
        input_ids_squeeze = tf.reshape(input_ids, [-1])
        one_hot_input_ids = tf.one_hot(input_ids_squeeze, depth=vocab_size)
        output = tf.matmul(one_hot_input_ids, embedding_table)
        output = tf.reshape(output, [input_shape[0], input_shape[1], -1])
    else:
        output = tf.nn.embedding_lookup(embedding_table, input_ids)
    return output, embedding_table
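# NOTE: a minimal usage sketch (not part of the original code) showing how
# `embedding_lookup` and `embedding_postprocessor` fit together. The vocabulary
# size, embedding width and other values below are assumptions for illustration.
def build_embeddings_example(input_ids):
    # [b, s] int32 ids -> [b, s, 128] word embeddings plus the embedding table.
    word_embeddings, embedding_table = embedding_lookup(
        input_ids=input_ids,
        vocab_size=30000,
        embedding_size=128,
        initializer_range=0.02)
    # Add learned positional embeddings, then layer norm and dropout.
    embedding_output = embedding_postprocessor(
        word_embeddings,
        use_positional_embeddings=True,
        max_positional_embeddings=512,
        dropout_prob=0.1)
    return embedding_output, embedding_table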
def transformer_model(input_tensor,
                      attention_mask,
                      hidden_size,
                      num_hidden_layers,
                      num_attention_heads,
                      intermediate_size,
                      intermediate_act_fn,
                      hidden_dropout_prob,
                      attention_dropout_prob,
                      initializer_range,
                      do_return_all_layers=False):
    """Multi-headed, multi-layer Transformer from 'Attention Is All You Need'.

    This is almost an exact implementation of the original Transformer encoder.

    Args:
        input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
        attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
            seq_length], with 1 for positions that can be attended to and 0 for
            positions that should not be.
        hidden_size: int. Hidden size of the Transformer.
        num_hidden_layers: int. Number of layers (blocks) in the Transformer.
        num_attention_heads: int. Number of attention heads in the Transformer.
        intermediate_size: int. The size of the "intermediate" (a.k.a. feed-forward)
            layer.
        intermediate_act_fn: function. The non-linear activation function to apply
            to the output of the intermediate/feed-forward layer.
        hidden_dropout_prob: float. Dropout probability for the hidden layers.
        attention_dropout_prob: float. Dropout probability of the attention
            probabilities.
        initializer_range: float. Range of the initializer (stddev of truncated normal).
        do_return_all_layers: Whether to return all layers or just the final layer.

    Returns:
        float Tensor of shape [batch_size, seq_length, hidden_size], the final hidden
        layer of the Transformer (or a list of all layers if `do_return_all_layers`).
    """
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            'The hidden size ({}) is not a multiple of the number of attention '
            'heads ({}).'.format(hidden_size, num_attention_heads))

    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = ft.get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    if input_width != hidden_size:
        raise ValueError(
            'The width of the input tensor ({}) != hidden size ({}).'.format(
                input_width, hidden_size))

    prev_output = input_tensor
    all_layers_outputs = []
    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope('layer_{}'.format(layer_idx)):
            layer_input = prev_output

            with tf.variable_scope('attention'):
                with tf.variable_scope('self'):
                    # [b, s, n * a]
                    attention_head = attention_layer(
                        input_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        query_act=None,
                        key_act=None,
                        value_act=None,
                        attention_dropout_prob=attention_dropout_prob,
                        initializer_range=initializer_range,
                        batch_size=batch_size,
                        seq_length=seq_length)

                with tf.variable_scope('output'):
                    # [b, s, h]
                    attention_output = tf.layers.dense(
                        attention_head,
                        hidden_size,
                        kernel_initializer=ft.create_initializer(
                            initializer_range=initializer_range))
                    attention_output = ft.dropout(attention_output, hidden_dropout_prob)
                    # Residual connection followed by layer norm.
                    attention_output = ft.layer_norm(attention_output + layer_input)

            with tf.variable_scope('intermediate'):
                # [b, s, i]; the feed-forward layer takes the attention output,
                # not the raw attention heads.
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=ft.create_initializer(
                        initializer_range=initializer_range))

            with tf.variable_scope('output'):
                # [b, s, h]
                layer_output = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=ft.create_initializer(
                        initializer_range=initializer_range))
                layer_output = ft.dropout(layer_output, hidden_dropout_prob)
                layer_output = ft.layer_norm(layer_output + attention_output)

            prev_output = layer_output
            # Collect every layer so `do_return_all_layers` can return them all.
            all_layers_outputs.append(prev_output)

    if do_return_all_layers:
        return all_layers_outputs
    else:
        return all_layers_outputs[-1]
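# NOTE: a hedged usage sketch (not part of the original code) showing how a
# [b, s, s] attention mask can be broadcast from a [b, s] input mask before calling
# `transformer_model`. The hidden size, head count, intermediate size and relu
# activation below are assumptions; `hidden_size` must match the width of
# `embedding_output`.
def run_transformer_example(embedding_output, input_mask):
    # [b, s] -> [b, 1, s]: every query position may attend to each non-padded key.
    to_mask = tf.expand_dims(input_mask, axis=1)
    # [b, s] -> [b, s, 1] of ones, used only to broadcast over query positions.
    from_ones = tf.ones_like(tf.expand_dims(input_mask, axis=2))
    # [b, s, s]: entry (i, j) is 1 iff key position j is a real token.
    attention_mask = from_ones * to_mask
    return transformer_model(
        input_tensor=embedding_output,
        attention_mask=attention_mask,
        hidden_size=128,
        num_hidden_layers=4,
        num_attention_heads=8,
        intermediate_size=512,
        intermediate_act_fn=tf.nn.relu,
        hidden_dropout_prob=0.1,
        attention_dropout_prob=0.1,
        initializer_range=0.02,
        do_return_all_layers=False)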
def model_fn(features, labels, mode, params):
    # Log feature names and shapes.
    _info('*** Features ***')
    for name in sorted(features.keys()):
        tf.logging.info(' name = {}, shape = {}'.format(name, features[name].shape))

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # Get the data.
    input_x = features['input_x']
    input_mask = features['input_mask']
    if is_training:
        input_y = features['input_y']
        seq_length = features['seq_length']
    else:
        input_y = None
        seq_length = None

    # Build the encoder.
    model = BertEncoder(
        config=cg.BertEncoderConfig,
        is_training=is_training,
        input_ids=input_x,
        input_mask=input_mask)
    embedding_table = model.get_embedding_table()
    encoder_output = tf.reduce_sum(model.get_sequence_output(), axis=1)

    # Build the decoder.
    decoder_model = Decoder(
        config=cg.DecoderConfig,
        is_training=is_training,
        encoder_state=encoder_output,
        embedding_table=embedding_table,
        decoder_intput_data=input_y,
        seq_length_decoder_input_data=seq_length)
    logits, sample_id, ppl_seq, ppl = decoder_model.get_decoder_output()

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {'sample_id': sample_id, 'ppls': ppl_seq}
        output_spec = tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
        if mode == tf.estimator.ModeKeys.TRAIN:
            max_time = ft.get_shape_list(labels, expected_rank=2)[1]
            target_weights = tf.sequence_mask(seq_length, max_time, dtype=logits.dtype)
            batch_size = tf.cast(ft.get_shape_list(labels, expected_rank=2)[0], tf.float32)

            loss = tf.reduce_sum(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels, logits=logits) * target_weights) / batch_size

            learning_rate = tf.train.polynomial_decay(
                cg.learning_rate,
                tf.train.get_or_create_global_step(),
                cg.train_steps / 100,
                end_learning_rate=1e-4,
                power=1.0,
                cycle=False)
            lr = tf.maximum(tf.constant(cg.lr_limit), learning_rate)

            optimizer = tf.train.AdamOptimizer(lr, name='optimizer')
            tvars = tf.trainable_variables()
            gradients = tf.gradients(
                loss, tvars, colocate_gradients_with_ops=cg.colocate_gradients_with_ops)
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
            train_op = optimizer.apply_gradients(
                zip(clipped_gradients, tvars), global_step=tf.train.get_global_step())

            # The logging hook reports every `print_info_interval` steps (one step
            # per batch), whereas the EstimatorSpec itself only reports when a
            # checkpoint is saved.
            logging_hook = tf.train.LoggingTensorHook(
                {'loss': loss, 'ppl': ppl, 'lr': lr},
                every_n_iter=cg.print_info_interval)

            output_spec = tf.estimator.EstimatorSpec(
                mode, loss=loss, train_op=train_op, training_hooks=[logging_hook])
        elif mode == tf.estimator.ModeKeys.EVAL:
            # TODO: evaluation is not implemented yet.
            raise NotImplementedError

    return output_spec
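# NOTE: a hedged sketch (not part of the original code) of how a `model_fn` like the
# one above is typically wired into a tf.estimator.Estimator. `train_input_fn`, the
# model directory, checkpoint interval and step count are placeholder assumptions.
def train_example(train_input_fn):
    run_config = tf.estimator.RunConfig(
        model_dir='models/seq2seq',        # assumed output directory
        save_checkpoints_steps=1000)
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config)
    estimator.train(input_fn=train_input_fn, max_steps=100000)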
def model_fn(features, labels, mode, params):
    # Log feature names and shapes.
    for name in sorted(features.keys()):
        tf.logging.info(' name = {}, shape = {}'.format(name, features[name].shape))

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # Get the data.
    input_data = features['input_data']
    input_mask = features['input_mask']
    if mode == tf.estimator.ModeKeys.TRAIN:
        sentiment_labels = features['sentiment_labels']
        sentiment_mask_indices = features['sentiment_mask_indices']
        true_length_from_data = features['true_length']

    # Build the model.
    model = BertEncoder(
        config=cg.BertEncoderConfig,
        is_training=is_training,
        input_ids=input_data,
        input_mask=input_mask)

    # Optionally restore the encoder weights from a pre-trained checkpoint.
    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    if init_checkpoint:
        (assignment_map, initialized_variable_names
         ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info('**** Trainable Variables ****')
    for var in tvars:
        init_string = ''
        if var.name in initialized_variable_names:
            init_string = ', *INIT_FROM_CKPT*'
        tf.logging.info(' name = %s, shape = %s%s', var.name, var.shape, init_string)

    # [CLS] output -> [b, h]
    cls_output = model.get_cls_output()
    # sequence_output -> [b, s, h]; [CLS] is dropped because the mask indices are not shifted.
    sequence_output = model.get_sequence_output()[:, 1:, :]

    # Project the hidden size to the number of classes.
    with tf.variable_scope('final_output'):
        # [b, num_classes]
        output_logits = tf.layers.dense(
            cls_output,
            cg.BertEncoderConfig.num_classes,
            name='final_output',
            kernel_initializer=ft.create_initializer(
                initializer_range=cg.BertEncoderConfig.initializer_range))

    if mode == tf.estimator.ModeKeys.PREDICT:
        output_softmax = tf.nn.softmax(output_logits, axis=-1)
        output_result = tf.argmax(output_softmax, axis=-1)
        predictions = {'predict': output_result}
        output_spec = tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
        if mode == tf.estimator.ModeKeys.TRAIN:
            # masked_output -> [b * x, h]
            masked_output = gather_indexs(sequence_output, sentiment_mask_indices)

            # Get the output for word-polarity prediction.
            with tf.variable_scope('sentiment_project'):
                # [b * x, 2]
                output_sentiment = tf.layers.dense(
                    masked_output,
                    2,
                    name='final_output',
                    kernel_initializer=ft.create_initializer(
                        initializer_range=cg.BertEncoderConfig.initializer_range))
                # output_sentiment_probs = tf.nn.softmax(output_sentiment, axis=-1)

            batch_size = tf.cast(ft.get_shape_list(labels, expected_rank=1)[0],
                                 dtype=tf.float32)

            # Sentence-level cross-entropy loss.
            cls_loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels, logits=output_logits)) / batch_size

            # Token-level loss.
            true_sequence = get_true_sequence(true_length_from_data)
            # # Regression model (masked MSE):
            # mse_loss = calculate_mse_loss(
            #     output_sentiment, sentiment_labels, true_sequence)

            # # Classification model (masked cross-entropy):
            true_label_flatten = tf.reshape(sentiment_labels, [-1])
            mse_loss = tf.reduce_sum(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=true_label_flatten, logits=output_sentiment)
                * true_sequence) / tf.reduce_sum(true_sequence)

            loss = cls_loss + mse_loss
            # loss = cls_loss

            learning_rate = tf.train.polynomial_decay(
                cg.learning_rate,
                tf.train.get_or_create_global_step(),
                cg.train_steps,
                end_learning_rate=cg.lr_limit,
                power=1.0,
                cycle=False)
            lr = tf.maximum(tf.constant(cg.lr_limit), learning_rate)

            optimizer = tf.train.AdamOptimizer(lr, name='optimizer')
            tvars = tf.trainable_variables()
            gradients = tf.gradients(
                loss, tvars, colocate_gradients_with_ops=cg.colocate_gradients_with_ops)
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
            train_op = optimizer.apply_gradients(
                zip(clipped_gradients, tvars), global_step=tf.train.get_global_step())

            current_steps = tf.train.get_or_create_global_step()
            logging_hook = tf.train.LoggingTensorHook(
                {'step': current_steps,
                 'loss': loss,
                 'cls_loss': cls_loss,
                 'mse_loss': mse_loss,
                 'lr': lr},
                every_n_iter=cg.print_info_interval)

            output_spec = tf.estimator.EstimatorSpec(
                mode, loss=loss, train_op=train_op, training_hooks=[logging_hook])
        elif mode == tf.estimator.ModeKeys.EVAL:
            # TODO: evaluation is not implemented yet.
            raise NotImplementedError

    return output_spec
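# NOTE: `get_true_sequence` is called in the model_fn above but is not defined in this
# section. The sketch below is an assumption, not the author's code: it turns the
# per-example true lengths into a flat 0/1 float mask so padded positions drop out of
# the token-level loss. The original presumably fixes `mask_padding_size` elsewhere,
# whereas this sketch takes it as an explicit argument.
def get_true_sequence_sketch(true_length, mask_padding_size):
    # [b] lengths -> [b, mask_padding_size] boolean mask -> flat [b * mask_padding_size].
    mask = tf.sequence_mask(true_length, maxlen=mask_padding_size, dtype=tf.float32)
    return tf.reshape(mask, [-1])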