def assert_rank(tensor, expected_rank, name=None):
    """Check whether the rank of `tensor` matches `expected_rank`.

    Rank is the total number of dimensions of the tensor.

    Args:
        tensor: A tf.Tensor to check.
        expected_rank: Python integer or list of integers.
        name: (optional) name for the error.
    """
    if name is None:
        name = tensor.name

    # save the expected rank(s) into a dictionary for membership lookup
    expected_rank_dict = {}
    if isinstance(expected_rank, six.integer_types):
        expected_rank_dict[expected_rank] = True
    else:
        for rank in expected_rank:
            expected_rank_dict[rank] = True

    tensor_rank = tensor.shape.ndims
    if tensor_rank not in expected_rank_dict:
        scope_name = tf.get_variable_scope().name
        _error(
            'For the tensor {} in scope {}, the tensor rank {} '
            '(shape = {}) is not equal to the expected rank {}'.format(
                name, scope_name, tensor_rank, str(tensor.shape),
                str(expected_rank)))
        raise ValueError
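# A minimal usage sketch of `assert_rank` (illustrative only; the shapes and
# ranks below are made up for demonstration):
def _demo_assert_rank():
    x = tf.zeros([2, 5, 8])   # a rank-3 tensor
    assert_rank(x, 3)         # passes: rank matches the single integer
    assert_rank(x, [2, 3])    # passes: rank is in the allowed list
    try:
        assert_rank(x, 2)     # logs an error and raises ValueError
    except ValueError:
        pass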
def _process_input(self, input_ids, max_length):
    assert len(input_ids) < max_length, _error(
        'Input length is larger than the maximum length')

    question_length = len(input_ids)
    # pad the answer part with <mask>; these are the positions to predict
    input_ids += [
        vocab_idx['<mask>'] for _ in range(max_length - question_length)
    ]

    # 1 marks the question part, 0 marks the answer and padding part
    input_mask = [1 for _ in range(question_length)
                  ] + [0 for _ in range(max_length - question_length)]
    input_mask = create_mask_for_seq(input_mask, question_length,
                                     max_length - question_length)

    # relative positions of the tokens to predict
    masked_lm_positions = [
        question_length + idx for idx in range(max_length - question_length)
    ]

    return [input_ids], [input_mask], [masked_lm_positions]
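# A worked sketch of the layout `_process_input` builds for a 4-token question
# padded to max_length 8 (plain-Python illustration; token ids are made up):
def _demo_process_input_layout():
    question_length, max_length = 4, 8
    # 1 marks real question tokens, 0 marks the <mask> placeholders
    input_mask = [1] * question_length + [0] * (max_length - question_length)
    # positions of the tokens the model must fill in: [4, 5, 6, 7]
    masked_lm_positions = list(range(question_length, max_length))
    return input_mask, masked_lm_positions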
def __setattr__(self, name, value):
    if hasattr(self, name):
        wrapped_setattr(self, name, value)
    else:
        _error('Adding new attribute {} is forbidden'.format(name))
        raise AttributeError
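# A standalone sketch of the attribute-freezing pattern above, assuming
# `wrapped_setattr` wraps the original `object.__setattr__` (hypothetical
# self-contained version for illustration):
class _FrozenConfigDemo(object):
    def __init__(self):
        object.__setattr__(self, 'learning_rate', 0.01)  # bypass the guard once

    def __setattr__(self, name, value):
        if hasattr(self, name):
            object.__setattr__(self, name, value)  # updating is allowed
        else:
            raise AttributeError('Adding new attribute {} is forbidden'.format(name))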
def train_generator(path, max_length, train_type=None):
    """This is the entrance to the input_fn."""
    if train_type == 'seq2seq':
        questions, answers, max_length = parse_data(path, train_type)
        for que, ans in zip(questions, answers):
            # 1. input_ids
            # use <mask> to represent the answer instead of the original 0
            input_ids = que + [
                vocab_idx['<mask>'] for _ in range(len(ans))
            ]  # que + ans (represented by <mask>)
            padding_part = [
                vocab_idx['<padding>']
                for _ in range(max_length - len(input_ids))
            ]
            # input_ids -> [5, 2, 1, 10, 10, 10, 0, 0, 0, 0],
            # supposing 10 is <mask> and 0 is <padding>
            input_ids += padding_part  # [max_length]

            # 2. mask for attention scores
            # the original input_mask in the paper is [1, 1, 1, 0, 0];
            # a different mask is used here, where 1 marks the question part
            # and 0 marks both the answer part and the padding part.
            input_mask = [1 for _ in range(len(que))
                          ] + [0 for _ in range(len(ans + padding_part))]
            input_mask = create_mask_for_seq(input_mask, len(que),
                                             len(ans + padding_part))

            # 3. masked_lm_positions saves the relative positions of the
            # answer part and the padding part, e.g. before padding:
            # [[2, 3, 4, 5, 6, 7, 8, 9], [5, 6, 7, 8, 9]]
            masked_lm_positions = [
                len(que) + idx for idx in range(max_length - len(que))
            ]
            # ATTENTION: the `masked_lm_positions` of each datum in a batch
            # may differ in length due to the varying question lengths, so
            # padding `masked_lm_positions` to max_length is necessary.
            # The padding items are fake; the following `mask_lm_weights`
            # will mask them out. Supposing max_length is 10, the example
            # above looks like this after the next step:
            # [[2, 3, 4, 5, 6, 7, 8, 9, 0, 0], [5, 6, 7, 8, 0, 0, 0, 0, 0, 0]]
            # `0` is used for padding because during training the
            # `masked_lm_positions` will be added to the `flat_offset`;
            # since the padding items do not exist, any other number could
            # cause an index error.
            masked_lm_positions += [
                0 for idx in range(max_length - len(masked_lm_positions))
            ]

            # 4. mask_lm_ids -> the actual labels
            mask_lm_ids = ans + padding_part
            # pad the `mask_lm_ids` to max_length
            mask_lm_ids += [
                vocab_idx['<padding>']
                for _ in range(max_length - len(mask_lm_ids))
            ]
            # 5. mask_lm_weights -> used to compute the actual loss;
            # it ignores the padding part
            mask_lm_weights = [1 for _ in range(len(ans))
                               ] + [0 for _ in range(len(padding_part))]
            # padding
            mask_lm_weights += [
                0 for _ in range(max_length - len(mask_lm_weights))
            ]

            features = {
                'input_ids': input_ids,
                'input_mask': input_mask,
                'masked_lm_positions': masked_lm_positions,
                'masked_lm_ids': mask_lm_ids,
                'masked_lm_weights': mask_lm_weights
            }
            yield features
    elif train_type == 'lm':
        sentences, max_length = parse_data(path, train_type)
        for line in sentences:
            input_ids = [vocab_idx['S']]
            padding_part = [
                vocab_idx['<padding>']
                for _ in range(max_length - len(input_ids))
            ]
            input_ids += padding_part

            input_mask = create_mask_for_lm(max_length)

            masked_lm_positions = [
                idx + 1 for idx in range(len(input_ids) - 1)
            ]
            masked_lm_positions += [
                masked_lm_positions[-1] + 1 + idx
                for idx in range(len(input_ids) - len(masked_lm_positions))
            ]

            mask_lm_ids = line + [
                vocab_idx['<padding>']
                for _ in range(len(input_ids) - len(line) - 1)
            ]
            mask_lm_ids += [
                vocab_idx['<padding>']
                for _ in range(len(input_ids) - len(mask_lm_ids))
            ]

            mask_lm_weights = [1 for _ in range(len(line))] + [
                0 for _ in range(len(input_ids) - len(line) - 1)
            ]
            mask_lm_weights += [
                0 for _ in range(len(input_ids) - len(mask_lm_weights))
            ]

            features = {
                'input_ids': input_ids,
                'input_mask': input_mask,
                'masked_lm_positions': masked_lm_positions,
                'masked_lm_ids': mask_lm_ids,
                'masked_lm_weights': mask_lm_weights
            }
            yield features
    else:
        _error('Unsupported train type: {}'.format(train_type))
        raise ValueError
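# A worked toy example of the seq2seq feature layout built above, with made-up
# ids (<mask> = 10, <padding> = 0) and max_length = 10 (illustrative only):
def _demo_seq2seq_features():
    que, ans, max_length = [5, 2, 1], [7, 8], 10
    mask_id, pad_id = 10, 0
    input_ids = que + [mask_id] * len(ans)              # [5, 2, 1, 10, 10]
    padding_part = [pad_id] * (max_length - len(input_ids))
    input_ids += padding_part                           # padded to length 10
    positions = list(range(len(que), max_length))       # [3, 4, ..., 9]
    positions += [0] * (max_length - len(positions))    # fake 0 positions
    lm_ids = ans + padding_part                         # [7, 8, 0, 0, 0, 0, 0]
    lm_ids += [pad_id] * (max_length - len(lm_ids))
    weights = [1] * len(ans) + [0] * len(padding_part)  # only the answer counts
    weights += [0] * (max_length - len(weights))
    return input_ids, positions, lm_ids, weights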
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=3,
                            token_type_embedding_name='token_type_embeddings',
                            use_positional_embeddings=True,
                            positional_embedding_type='normal',
                            pre_positional_embeddings=None,
                            positional_embedding_name='position_embeddings',
                            initializer_range=0.01,
                            max_positional_embeddings=512,
                            dropout_prob=0.01):
    """Performs some postprocessing on the word embeddings.

    Args:
        input_tensor: float Tensor of shape [batch_size, seq_length, embedding_size].
        use_token_type: bool. Whether to add segment embeddings. Although the
            original comments use 'token' in the name, token_type_ids are
            segment indicators, e.g. [[0, 0, 1], [0, 1, 0]]: 0 refers to
            segment 1, 1 refers to segment 2, and the trailing 0 in the
            second array refers to the padding.
        token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
        token_type_vocab_size: the number of token types.
        use_positional_embeddings: bool. Whether to add positional embeddings.
        positional_embedding_type: ['normal', 'trigonometrical'].
        pre_positional_embeddings: pre-computed positional embeddings.
        positional_embedding_name: string. The name of the embedding table variable.
        initializer_range: float. Range of the weight initializer.
        max_positional_embeddings: int. Maximum sequence length, which should
            be equal to or longer than the actual sequence length.
        dropout_prob: float. Dropout probability applied to the final output tensor.

    Returns:
        float Tensor with the identical shape as `input_tensor`.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=[2, 3])
    batch_size, seq_length, width = input_shape[0], input_shape[1], input_shape[2]

    # create this variable in case no pre-embeddings are applied to the input_tensor
    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            _error('`token_type_ids` must be specified if `use_token_type` is True.')
            raise ValueError
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        token_type_embeddings = tf.nn.embedding_lookup(token_type_table,
                                                       token_type_ids)
        output += token_type_embeddings

    if use_positional_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_positional_embeddings)
        with tf.control_dependencies([assert_op]):
            full_positional_embeddings = tf.get_variable(
                name=positional_embedding_name,
                shape=[max_positional_embeddings, width],
                initializer=create_initializer(initializer_range))
            # the full_positional_embeddings table is created for the maximum
            # sequence length; the actual length may be shorter, so slicing
            # is necessary.
            positional_embeddings = tf.slice(full_positional_embeddings,
                                             [0, 0], [seq_length, -1])
            output += positional_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
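# A minimal usage sketch of `embedding_postprocessor` (hypothetical shapes;
# assumes a TF1 graph context and the helpers above):
def _demo_embedding_postprocessor():
    word_embeddings = tf.zeros([2, 16, 128])            # [batch, seq_length, width]
    token_type_ids = tf.zeros([2, 16], dtype=tf.int32)  # all tokens in segment 0
    return embedding_postprocessor(word_embeddings,
                                   use_token_type=True,
                                   token_type_ids=token_type_ids,
                                   max_positional_embeddings=512)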
def model_fn(features, labels, mode, params):
    """This is prototype syntax; all four parameters are required."""
    # obtain the data
    _info('*** Features ***')
    for name in sorted(features.keys()):
        tf.logging.info(' name = %s, shape = %s' % (name, features[name].shape))

    input_ids = features['input_ids']    # [batch_size, seq_length]
    input_mask = features['input_mask']  # [batch_size, seq_length]
    masked_lm_positions = features['masked_lm_positions']  # [batch_size, seq_length], specify the answer
    masked_lm_ids = features['masked_lm_ids']              # [batch_size, answer_seq_length], the answer labels
    masked_lm_weights = features['masked_lm_weights']      # [batch_size, seq_length], e.g. [1, 1, 0], 0 marks the mask

    if bert_config.train_type == 'seq2seq':
        _info('Training seq2seq task.')
    elif bert_config.train_type == 'lm':
        _info('Training language model task.')

    # build the model
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    model = BertModel(config=bert_config,
                      is_training=is_training,
                      input_ids=input_ids,
                      input_mask=input_mask)

    # compute the loss
    loss, pre_loss, log_probs = get_masked_lm_output(bert_config,
                                                     model.get_sequence_output(),
                                                     model.embedding_table,
                                                     model.projection_table,
                                                     masked_lm_positions,
                                                     masked_lm_ids,
                                                     masked_lm_weights,
                                                     mode)

    if mode == tf.estimator.ModeKeys.PREDICT:
        masked_lm_predictions = tf.reshape(
            tf.argmax(log_probs, axis=-1, output_type=tf.int32), [-1])
        output_spec = tf.estimator.EstimatorSpec(mode,
                                                 predictions=masked_lm_predictions)
    else:
        if mode == tf.estimator.ModeKeys.TRAIN:
            # tf.estimator typically restores from the checkpoint
            # automatically; this block is for restoring some
            # pre-trained parameters
            tvars = tf.trainable_variables()
            initialized_variable_names = {}
            if init_checkpoint:
                (assignment_map, initialized_variable_names) = \
                    get_assignment_map_from_checkpoint(tvars, init_checkpoint)
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

            _info('*** Trainable Variables ***')
            for var in tvars:
                init_string = ''
                if var.name in initialized_variable_names:
                    init_string = ', *INIT_FROM_CKPT*'
                _info('name = {}, shape = {}{}'.format(var.name, var.shape,
                                                       init_string))

            train_op = optimization.create_optimizer(
                loss, bert_config.learning_rate, num_train_steps)

            output_spec = tf.estimator.EstimatorSpec(mode, loss=loss,
                                                     train_op=train_op)
        elif mode == tf.estimator.ModeKeys.EVAL:
            # TODO: define the metrics
            _error('to do ...')
            raise NotImplementedError

    return output_spec
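# `package_model` below passes a `server_input_receiver_fn` to
# `export_saved_model`. A hedged sketch of what such a receiver might look
# like, mirroring the feature names used in model_fn (the placeholder shapes
# and dtypes are assumptions, not taken from the original source):
def _demo_server_input_receiver_fn():
    features = {
        'input_ids': tf.placeholder(tf.int32, [None, None], name='input_ids'),
        'input_mask': tf.placeholder(tf.float32, [None, None, None], name='input_mask'),
        'masked_lm_positions': tf.placeholder(tf.int32, [None, None], name='masked_lm_positions'),
        'masked_lm_ids': tf.placeholder(tf.int32, [None, None], name='masked_lm_ids'),
        'masked_lm_weights': tf.placeholder(tf.float32, [None, None], name='masked_lm_weights'),
    }
    return tf.estimator.export.ServingInputReceiver(features, features)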
    input_fn = functools.partial(train_input_fn,
                                 path=bert_config.data_path,
                                 batch_size=bert_config.batch_size,
                                 repeat_num=bert_config.num_train_steps,
                                 max_length=bert_config.max_length)

    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.allow_growth = True
    run_config = tf.contrib.tpu.RunConfig(session_config=gpu_config,
                                          keep_checkpoint_max=1,
                                          save_checkpoints_steps=10,
                                          model_dir=bert_config.model_dir)

    estimator = tf.estimator.Estimator(model_fn, config=run_config)
    estimator.train(input_fn)  # input_fn should be callable


def package_model(ckpt_path, pb_path):
    model_fn = model_fn_builder(bert_config, None, bert_config.learning_rate,
                                bert_config.num_train_steps)
    estimator = tf.estimator.Estimator(model_fn, ckpt_path)
    estimator.export_saved_model(pb_path, server_input_receiver_fn)


if __name__ == '__main__':
    if sys.argv[1] == 'train':
        main()
    elif sys.argv[1] == 'package':
        package_model(str(PROJECT_PATH / 'models_lm'),
                      str(PROJECT_PATH / 'models_deploy_lm'))
    else:
        _error('Unknown parameter: {}.'.format(sys.argv[1]))
        _info('Choose from [train | package].')
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=1024,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=_mh.gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False,
                      share_parameter_across_layers=True):
    """Multi-head, multi-layer Transformer.

    Args:
        input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
        attention_mask: (optional) int32 Tensor of shape
            [batch_size, seq_length, seq_length], where 1 indicates a position
            that can be attended to and 0 one that cannot.
        hidden_size: int. Hidden size of the Transformer.
        num_hidden_layers: int. Number of layers in the Transformer.
        num_attention_heads: int. Number of attention heads in the Transformer.
        intermediate_size: int. The size of the feed-forward layer.
        intermediate_act_fn: activation function after the feed-forward layer.
        hidden_dropout_prob: float.
        attention_probs_dropout_prob: float.
        initializer_range: float.
        do_return_all_layers: bool. Return the outputs from all hidden layers
            or just the final layer.
        share_parameter_across_layers: bool. Whether to share parameters
            across the attention layers.

    Returns:
        float Tensor of shape [batch_size, seq_length, hidden_size], or a list
        containing `num_hidden_layers` such Tensors.
    """
    if hidden_size % num_attention_heads != 0:
        _error('The hidden size {} cannot be divided by the number of '
               'attention heads {}'.format(hidden_size, num_attention_heads))
        raise ValueError

    # the hidden size for each head
    attention_head_size = int(hidden_size / num_attention_heads)

    input_shape = _mh.get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    # the residual connections are performed on the outputs of every layer,
    # so the hidden size, i.e. the output width of each transformer block,
    # must equal input_width (at the beginning this is the input tensor).
    # Do not confuse hidden_size with intermediate_size: the intermediate
    # layer comes before the hidden (output) layer.
    if input_width != hidden_size:
        _error('The width of the input tensor {} is not equal to the '
               'hidden size {}'.format(input_width, hidden_size))
        raise ValueError

    prev_output = input_tensor  # [batch_size, seq_length, width]
    # create a list to save the output of each transformer layer
    all_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        if share_parameter_across_layers:
            name_variable_scope = 'layer_shared'
        else:
            name_variable_scope = 'layer_{}'.format(layer_idx)

        # share the parameters across layers when
        # share_parameter_across_layers is True and this is not the first layer
        with tf.variable_scope(
                name_variable_scope,
                reuse=True if
                (share_parameter_across_layers and layer_idx > 0) else False):
            layer_input = prev_output

            with tf.variable_scope('attention'):
                with tf.variable_scope('self'):
                    attention_head = self_attention_layer(
                        from_tensor=layer_input,
                        to_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length)
                    attention_output = attention_head

                # perform the residual connection to finish the self-attention block
                with tf.variable_scope('output'):
                    attention_output = tf.layers.dense(
                        attention_output,
                        hidden_size,
                        kernel_initializer=_mh.create_initializer(initializer_range))
                    attention_output = _mh.dropout(attention_output,
                                                   hidden_dropout_prob)
                    attention_output = _mh.layer_norm(attention_output + layer_input)

            # apply a double linear projection to enhance the context representation
            with tf.variable_scope('intermediate'):
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=_mh.create_initializer(initializer_range))

            with tf.variable_scope('output'):
                layer_output = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=_mh.create_initializer(initializer_range))
                layer_output = _mh.dropout(layer_output, hidden_dropout_prob)
                layer_output = _mh.layer_norm(layer_output + attention_output)

                prev_output = layer_output
                all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        return all_layer_outputs
    else:
        return all_layer_outputs[-1]
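# A minimal usage sketch of the transformer stack (illustrative shapes;
# assumes a TF1 graph context and the `_mh` helpers above):
def _demo_transformer_model():
    hidden = tf.zeros([2, 16, 1024])              # [batch, seq_length, hidden_size]
    mask = tf.ones([2, 16, 16], dtype=tf.int32)   # every position may attend everywhere
    return transformer_model(hidden,
                             attention_mask=mask,
                             hidden_size=1024,
                             num_hidden_layers=6,
                             num_attention_heads=8)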
def self_attention_layer(from_tensor,
                         to_tensor,
                         attention_mask=None,
                         num_attention_heads=1,
                         size_per_head=512,
                         query_act=None,
                         key_act=None,
                         value_act=None,
                         attention_probs_dropout_prob=0.0,
                         initializer_range=0.02,
                         batch_size=None,
                         from_seq_length=None,
                         to_seq_length=None):
    """Perform self-attention.

    Args:
        from_tensor: float Tensor of shape [batch_size, seq_length, width].
        to_tensor: float Tensor of shape [batch_size, seq_length, width].
        attention_mask: (optional) int32 Tensor of shape
            [batch_size, seq_length, seq_length], where 1 indicates a position
            that can be attended to and 0 one that cannot.
        num_attention_heads: int. Number of attention heads in the Transformer.
        size_per_head: int. Size of each attention head.
        query_act: (optional) Activation function for the query transform.
        key_act: (optional) Activation function for the key transform.
        value_act: (optional) Activation function for the value transform.
        attention_probs_dropout_prob: (optional) float.
        initializer_range: float.
        batch_size: (optional) int.
        from_seq_length: (optional) int.
        to_seq_length: (optional) int.

    Returns:
        float Tensor of shape [batch_size, from_seq_length, width].
    """
    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             seq_length, size_per_head):
        """Change the order of axes. width = num_attention_heads * size_per_head.

        Args:
            input_tensor: float Tensor of shape [batch_size, seq_length, width].

        Returns:
            float Tensor of shape
            [batch_size, num_attention_heads, seq_length, size_per_head].
        """
        output_tensor = tf.reshape(
            input_tensor,
            [batch_size, seq_length, num_attention_heads, size_per_head])
        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    # check the rank
    from_shape = _mh.get_shape_list(from_tensor, expected_rank=3)
    to_shape = _mh.get_shape_list(to_tensor, expected_rank=3)
    if len(from_shape) != 3 or len(to_shape) != 3:
        _error('The rank of `from_tensor` should match the rank of '
               '`to_tensor`, and both should be 3')
        raise ValueError

    # calculate the query, key, and value
    # from_tensor: [batch_size, seq_length, width] ->
    # query_layer: [batch_size, seq_length, num_attention_heads * size_per_head],
    # where num_attention_heads * size_per_head == hidden_size == width
    query_layer = tf.layers.dense(
        from_tensor,
        num_attention_heads * size_per_head,
        activation=query_act,
        name='query',
        kernel_initializer=_mh.create_initializer(initializer_range))

    key_layer = tf.layers.dense(
        to_tensor,
        num_attention_heads * size_per_head,
        activation=key_act,
        name='key',
        kernel_initializer=_mh.create_initializer(initializer_range))

    value_layer = tf.layers.dense(
        to_tensor,
        num_attention_heads * size_per_head,
        activation=value_act,
        name='value',
        kernel_initializer=_mh.create_initializer(initializer_range))

    # [batch_size, seq_length, width] ->
    # [batch_size, num_attention_heads, seq_length, size_per_head]
    query_layer = transpose_for_scores(query_layer, batch_size,
                                       num_attention_heads, from_seq_length,
                                       size_per_head)
    key_layer = transpose_for_scores(key_layer, batch_size,
                                     num_attention_heads, to_seq_length,
                                     size_per_head)

    # calculate the attention scores
    # [batch_size, num_attention_heads, from_seq_length, to_seq_length]
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # [batch_size, seq_length, seq_length] -> [batch_size, 1, seq_length, seq_length]
        attention_mask = tf.expand_dims(attention_mask, axis=1)
        # masked positions get a large negative bias so their softmax
        # probabilities become effectively zero
        adder = (1.0 - tf.cast(attention_mask,
                               dtype=tf.float32)) * -10000.0
        attention_scores += adder

    attention_probs = tf.nn.softmax(attention_scores)
    attention_probs = _mh.dropout(attention_probs, attention_probs_dropout_prob)

    # calculate the context layer
    # [batch_size, num_attention_heads, to_seq_length, size_per_head]
    value_layer = transpose_for_scores(value_layer, batch_size,
                                       num_attention_heads, to_seq_length,
                                       size_per_head)
    # weight the values with the (dropped-out) attention probabilities,
    # not the raw scores
    context_layer = tf.matmul(attention_probs, value_layer)

    # [batch_size, from_seq_length, num_attention_heads, size_per_head]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
    # [batch_size, from_seq_length, width]
    context_layer = tf.reshape(
        context_layer,
        [batch_size, from_seq_length, num_attention_heads * size_per_head])

    return context_layer
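# A minimal usage sketch of a single attention layer (illustrative only;
# width 256 = 4 heads * 64 per head):
def _demo_self_attention_layer():
    x = tf.zeros([2, 16, 256])  # [batch, seq_length, width]
    return self_attention_layer(x, x,
                                num_attention_heads=4,
                                size_per_head=64,
                                batch_size=2,
                                from_seq_length=16,
                                to_seq_length=16)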