def decode_infer(self, inputs, state):
    # state['enc']: [b * beam, l_s, e], state['dec']: [b * beam, q', e]
    # q' = previous decode output length
    # during inference, the following graph is constructed for beam search
    with self.graph.as_default():
        config = self.bert_config
        target_sequence = inputs['target']
        target_length = inputs['target_length']
        target_seg_ids = tf.zeros_like(target_sequence, dtype=tf.int32, name='target_seg_ids_infer')
        tgt_mask = tf.sequence_mask(target_length, maxlen=tf.shape(target_sequence)[1],
                                    dtype=tf.float32)  # [b, q']
        with tf.variable_scope('bert', reuse=True):
            with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                # Perform embedding lookup on the target word ids.
                (tgt_embed, _) = embedding_lookup(
                    input_ids=target_sequence,
                    vocab_size=config.vocab_size,
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name='word_embeddings',
                    use_one_hot_embeddings=False)
                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                tgt_embed = embedding_postprocessor(
                    input_tensor=tgt_embed,
                    use_token_type=True,
                    token_type_ids=target_seg_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)
        with tf.variable_scope('decoder', reuse=True):
            # [b, l_t, e]
            masked_tgt_embed = tgt_embed * tf.expand_dims(tgt_mask, -1)
            dec_attn_bias = attention_bias(tf.shape(masked_tgt_embed)[1], "causal")
            decoder_input = tf.pad(masked_tgt_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
            infer_decoder_input = decoder_input[:, -1:, :]
            infer_dec_attn_bias = dec_attn_bias[:, :, -1:, :]
            all_att_weights, decoder_output, decoder_state = transformer_decoder(
                infer_decoder_input, self.enc_output, infer_dec_attn_bias,
                self.enc_attn_bias, self.hps, state=state['decoder'], scope='t_decoder')
            decoder_output = decoder_output[:, -1, :]  # [b * beam, e]
            logits = tf.matmul(decoder_output, self.decoder_weights, False, True)  # [b * beam, v]
            log_prob = tf.nn.log_softmax(logits)
        return log_prob, {'encoder': state['encoder'], 'decoder': decoder_state}
def decode_infer_2_bs(self):
    # beam search version
    # during second-stage decoding we already have a decoded sequence, so there is no need
    # to feed state (no incremental decoding)
    # at time i, we compute the i-th attn_bias and take the i-th decoder output
    with self.graph.as_default():
        config = self.bert_config
        target_sequence = tf.reshape(self.decode_seq,
                                     [self.hps.eval_batch_size * self.hps.beam_size, -1])
        target_length = self.decode_length
        target_seg_ids = tf.zeros_like(target_sequence, dtype=tf.int32, name='target_seg_ids_infer')
        tgt_mask = tf.sequence_mask(target_length, maxlen=tf.shape(target_sequence)[1],
                                    dtype=tf.float32)  # [b, q']
        with tf.variable_scope('bert', reuse=True):
            with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                # Perform embedding lookup on the target word ids.
                (tgt_embed, _) = embedding_lookup(
                    input_ids=target_sequence,
                    vocab_size=config.vocab_size,
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name='word_embeddings',
                    use_one_hot_embeddings=False)
                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                tgt_embed = embedding_postprocessor(
                    input_tensor=tgt_embed,
                    use_token_type=True,
                    token_type_ids=target_seg_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)
        with tf.variable_scope('decoder', reuse=True):
            # [b, l_t, e]
            masked_tgt_embed = tgt_embed * tf.expand_dims(tgt_mask, -1)
            dec_attn_bias = attention_bias(tf.shape(masked_tgt_embed)[1], "cloze_bias")
            # this operation is necessary as the attention bias is shifted
            infer_decoder_input = tf.pad(masked_tgt_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
            # This operation is wrong!!!
            # infer_dec_attn_bias = dec_attn_bias[:, :, self.time_step:self.time_step + 1, :]
            all_att_weights, decoder_output = transformer_decoder(
                infer_decoder_input, self.enc_output, dec_attn_bias,
                self.enc_attn_bias, self.hps, scope='t_decoder')
            decoder_output = decoder_output[:, self.time_step, :]  # [b * beam, e]
            logits = tf.matmul(decoder_output, self.decoder_weights, False, True)  # [b * beam, v]
            log_prob = tf.nn.log_softmax(logits)
        return log_prob
def decode_infer_2(self):
    # stage 2: inference using the decoded sequence
    # l_t = decoded sequence length
    # during inference, the following graph is constructed for beam search
    hidden_size = self.bert_config.hidden_size
    with self.graph.as_default():
        config = self.bert_config
        target_sequence = self.decode_seq
        target_length = self.decode_length
        target_seg_ids = tf.zeros_like(target_sequence, dtype=tf.int32, name='target_seg_ids_infer_2')
        tgt_mask = tf.sequence_mask(target_length, maxlen=tf.shape(target_sequence)[1],
                                    dtype=tf.float32)  # [b, q']
        with tf.variable_scope('bert', reuse=True):
            with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                # Perform embedding lookup on the target word ids.
                (tgt_embed, _) = embedding_lookup(
                    input_ids=target_sequence,
                    vocab_size=config.vocab_size,
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name='word_embeddings',
                    use_one_hot_embeddings=False)
                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                tgt_embed = embedding_postprocessor(
                    input_tensor=tgt_embed,
                    use_token_type=True,
                    token_type_ids=target_seg_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)
        with tf.variable_scope('decoder', reuse=True):
            masked_tgt_embed = tgt_embed * tf.expand_dims(tgt_mask, -1)
            second_dec_attn_bias = attention_bias(tf.shape(masked_tgt_embed)[1], 'cloze_bias')
            infer_decoder_input = tf.pad(masked_tgt_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
            all_att_weights, decoder_output = transformer_decoder(
                infer_decoder_input, self.enc_output, second_dec_attn_bias,
                self.enc_attn_bias, self.hps, scope='t_decoder')
            # [b, l_t, e] => [b*l_t, v]
            decoder_output = tf.reshape(decoder_output, [-1, hidden_size])
            second_logits = tf.matmul(decoder_output, self.decoder_weights, False, True)  # (b*l_t, v)
            # (b, l_t, v)
            second_logits = tf.reshape(second_logits,
                                       [-1, tf.shape(target_sequence)[1], config.vocab_size])
            second_log_prob = tf.nn.log_softmax(second_logits)
            second_log_id = tf.argmax(second_log_prob, axis=-1)  # (b, l_t)
        return second_log_id
def encode(self):
    model = modeling.BertModel(
        config=self.bert_config,
        is_training=self.is_training,
        input_ids=self.input_ids,
        input_mask=self.input_mask,
        token_type_ids=self.segment_ids,
        use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?
    encoder_output = model.get_sequence_output()  # [b, l_s, h]
    self.encoder_output = encoder_output
    self.enc_attn_bias = attention_bias(self.input_mask, 'masking')
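# Illustrative sketch only (not this repo's attention_bias implementation): the 'masking'
# mode used in encode() is assumed to follow the usual Transformer convention, turning a
# 0/1 padding mask of shape [b, l_s] into an additive bias of shape [b, 1, 1, l_s], so that
# padded source positions get a large negative score before every attention softmax.
def _masking_bias_sketch(input_mask, neg_inf=-1e9):
    """input_mask: int/float tensor [b, l_s], 1 for real tokens, 0 for padding."""
    mask = tf.cast(input_mask, tf.float32)
    bias = (1.0 - mask) * neg_inf                        # 0 where attended, -1e9 where padded
    return tf.expand_dims(tf.expand_dims(bias, 1), 1)    # [b, 1, 1, l_s], broadcasts over heads/queries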
def decode(self):
    config = self.bert_config
    hidden_size = self.encoder_output.shape[2].value
    with tf.variable_scope('bert', reuse=True):
        with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
            # Perform embedding lookup on the target word ids.
            (self.out_embed, self.bert_embeddings) = embedding_lookup(
                input_ids=self.output_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name='word_embeddings',
                use_one_hot_embeddings=False)
            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.out_embed = embedding_postprocessor(
                input_tensor=self.out_embed,
                use_token_type=True,
                token_type_ids=self.out_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)
    with tf.variable_scope('decoder_1'):
        self.decoder_weights = self.bert_embeddings
        self.masked_out_embed = self.out_embed * tf.expand_dims(self.output_mask, -1)
        self.decoder_input = tf.pad(self.masked_out_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
        # ################################################### decoding train - 1
        self.dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'causal')
        self.all_att_weights, self.decoder_output_1 = transformer_decoder(
            self.decoder_input, self.encoder_output, self.dec_attn_bias,
            self.enc_attn_bias, self.hps, scope='decoder_1')
        # [b, l_t, e] => [b*l_t, v]
        self.decoder_output_1 = tf.reshape(self.decoder_output_1, [-1, hidden_size])
        self.vocab_logits = tf.matmul(self.decoder_output_1, self.decoder_weights, False, True)  # (b*l_t, v)
        self.vocab_probs = tf.nn.softmax(self.vocab_logits)  # [b * l_t, v]
        self.logits = self.vocab_probs
        self.pred_ids = tf.reshape(tf.argmax(self.logits, axis=-1), [self.batch_size, -1])
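# The pad-then-slice pattern used above (the repo's "Shift left" comment) prepends an all-zero
# embedding as the start-of-sequence input and drops the last target step, so position t of the
# decoder input is the embedding of token t-1 (standard teacher forcing). Minimal sketch of the
# same operation:
def _shift_target_embeddings_sketch(tgt_embed):
    """tgt_embed: [b, l_t, e] -> [b, l_t, e], zero vector at step 0, last step dropped."""
    return tf.pad(tgt_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]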
def decode_infer(self, inputs, state):
    # state['enc']: [b * beam, l_s, e], state['dec']: [b * beam, q', e]
    # q' = previous decode output length
    # during inference, the following graph is constructed for beam search
    with self.graph.as_default():
        config = self.bert_config
        target_sequence = inputs['target']  # [b * beam, q']
        vocab_size = len(self.hps.vocab_out)
        # truncate word ids: map ids greater than vocab_size back to unkId
        shape = target_sequence.shape
        unkid = self.hps.vocab_out[self.hps.unk]
        # target_sequence = tf_trunct(target_sequence, vocab_size, self.hps.unkId)
        target_sequence = tf_trunct(target_sequence, vocab_size, unkid)
        target_sequence.set_shape(shape)
        target_length = inputs['target_length']
        target_seg_ids = tf.zeros_like(target_sequence, dtype=tf.int32, name='target_seg_ids_infer')
        tgt_mask = tf.sequence_mask(target_length, maxlen=tf.shape(target_sequence)[1],
                                    dtype=tf.float32)  # [b, q']
        out_dict_size = len(self.hps.vocab_out)
        with tf.variable_scope('bert', reuse=True):
            with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                # Perform embedding lookup on the target word ids.
                (tgt_embed, _) = embedding_lookup(
                    input_ids=target_sequence,
                    vocab_size=out_dict_size,  # output vocab size
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name='word_embeddings',
                    use_one_hot_embeddings=False)
                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                tgt_embed = embedding_postprocessor(
                    input_tensor=tgt_embed,
                    use_token_type=True,
                    token_type_ids=target_seg_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)
        with tf.variable_scope('decode', reuse=True):
            # [b, q', e]
            masked_tgt_embed = tgt_embed * tf.expand_dims(tgt_mask, -1)
            dec_attn_bias = attention_bias(tf.shape(masked_tgt_embed)[1], "causal")
            decoder_input = tf.pad(masked_tgt_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
            infer_decoder_input = decoder_input[:, -1:, :]
            infer_dec_attn_bias = dec_attn_bias[:, :, -1:, :]
            ret = transformer_decoder(infer_decoder_input, self.enc_output, infer_dec_attn_bias,
                                      self.enc_attn_bias, self.hps, state=state['decoder'])
            all_att_weights, decoder_output, decoder_state = ret
            decoder_output = decoder_output[:, -1, :]  # [b * beam, e]
            vocab_logits = tf.matmul(decoder_output, self.decoder_weights, False, True)  # [b * beam, v]
            vocab_probs = tf.nn.softmax(vocab_logits)
            vocab_size = out_dict_size  # output vocab size
            # we tiled source_id_oo before feeding, so the last argument is set to 1
            with tf.variable_scope('copy'):
                logits = calculate_final_logits(decoder_output, all_att_weights, vocab_probs,
                                                self.input_ids_oo, self.max_out_oovs, self.input_mask,
                                                vocab_size, tgt_seq_len=1)
                log_prob = tf.log(logits)  # [b * beam, v + v']
        return log_prob, {'encoder': state['encoder'], 'decoder': decoder_state}
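# Hedged sketch of the truncation step above: `tf_trunct` (defined elsewhere in this repo) is
# assumed to map any extended-vocabulary id produced by the copy mechanism (id >= vocab_size,
# i.e. a copied source OOV) back to the UNK id so the token can be embedded with the fixed
# output vocabulary. This is not the repo's actual tf_trunct code, only an equivalent sketch.
def _tf_trunct_sketch(ids, vocab_size, unk_id):
    """ids: int32 tensor of token ids, possibly containing extended (copied) ids."""
    is_oov = ids >= vocab_size
    return tf.where(is_oov, tf.fill(tf.shape(ids), unk_id), ids)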
def _build_summarization_model(self):
    is_training = self.is_training
    config = self.bert_config
    gpu_pred_ids = []
    gpu_logits = []
    gpu_train_encoded = []
    gpu_loss = []
    gpu_out_embed = []
    gpu_grads = []
    self._add_placeholders()
    self._n_gpu_split_placeholders(self.hps.n_gpu)
    for i in range(self.hps.n_gpu):
        do_reuse = True if i > 0 else None
        with tf.device(assign_to_gpu(i, "/gpu:0")), \
                tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse):
            '''Creates a classification model.'''
            model = modeling.BertModel(
                config=self.bert_config,
                is_training=is_training,
                input_ids=self.input_ids_ngpu[i],
                input_mask=self.input_mask_ngpu[i],
                token_type_ids=self.segment_ids_ngpu[i],
                use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?
            encoder_output = model.get_sequence_output()  # [b, l_s, h]
            hidden_size = encoder_output.shape[2].value
            encoder_out_length = tf.shape(encoder_output)[1]
            expand_topic_id = tf.expand_dims(self.topic_ids_ngpu[i], -1)
            topic_input_sequence = tf.tile(expand_topic_id, [1, encoder_out_length])
            with tf.variable_scope('topic'):
                with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                    # Perform embedding lookup on the topic ids.
                    (self.topic_embed, self.topic_embeddings) = embedding_lookup(
                        input_ids=topic_input_sequence,
                        vocab_size=self.hps.num_topic,
                        embedding_size=self.hps.topic_embedding_size,
                        initializer_range=config.initializer_range,
                        word_embedding_name='topic_embeddings',
                        use_one_hot_embeddings=False)
            print('!!!!topic_embeddings', self.topic_embeddings, self.topic_embed)
            self.encoder_output = tf.concat([encoder_output, self.topic_embed], -1)
            self.enc_attn_bias = attention_bias(self.input_mask_ngpu[i], 'masking')
            out_dict_size = len(self.hps.vocab_out)
            with tf.variable_scope('bert', reuse=True):
                with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                    # Perform embedding lookup on the target word ids.
                    # The embedding input of the decoder has to be output_ids.
                    (self.out_embed, self.bert_embeddings) = embedding_lookup(
                        input_ids=self.output_ids_ngpu[i],
                        vocab_size=out_dict_size,  # decode dictionary modified
                        embedding_size=config.hidden_size,
                        initializer_range=config.initializer_range,
                        word_embedding_name='word_embeddings',
                        use_one_hot_embeddings=False)
                    # Add positional embeddings and token type embeddings, then layer
                    # normalize and perform dropout.
                    self.out_embed = embedding_postprocessor(
                        input_tensor=self.out_embed,
                        use_token_type=True,
                        token_type_ids=self.out_segment_ids_ngpu[i],
                        token_type_vocab_size=config.type_vocab_size,
                        token_type_embedding_name='token_type_embeddings',
                        use_position_embeddings=True,
                        position_embedding_name='position_embeddings',
                        initializer_range=config.initializer_range,
                        max_position_embeddings=config.max_position_embeddings,
                        dropout_prob=config.hidden_dropout_prob)
            with tf.variable_scope('decode'):
                self.decoder_weights = self.bert_embeddings
                self.masked_out_embed = self.out_embed * tf.expand_dims(self.output_mask_ngpu[i], -1)
                self.dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'causal')
                self.decoder_input = tf.pad(self.masked_out_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
                self.all_att_weights, self.decoder_output = transformer_decoder(
                    self.decoder_input, self.encoder_output, self.dec_attn_bias,
                    self.enc_attn_bias, self.hps)
                # [b, l_t, e] => [b*l_t, v]
                self.decoder_output = tf.reshape(self.decoder_output, [-1, hidden_size])
                self.vocab_logits = tf.matmul(self.decoder_output, self.decoder_weights, False, True)  # (b * l_t, v)
                self.vocab_probs = tf.nn.softmax(self.vocab_logits)  # [b * l_t, v]
                # vocab_size = len(self.hps.vocab)
                with tf.variable_scope('copy'):
                    self.single_logits = calculate_final_logits(
                        self.decoder_output, self.all_att_weights, self.vocab_probs,
                        self.input_ids_oo_ngpu[i], self.max_out_oovs, self.input_mask_ngpu[i],
                        out_dict_size, self.tiled_len)  # [b * l_t, v + v']
                    self.single_pred_ids = tf.reshape(tf.argmax(self.single_logits, axis=-1),
                                                      [self.batch_size, -1])
            with tf.variable_scope('loss'):
                self.single_ce = smooth_cross_entropy(
                    self.single_logits,
                    self.output_label_ngpu[i],
                    self.hps.label_smoothing)
                self.single_ce = tf.reshape(self.single_ce, tf.shape(self.output_label_ngpu[i]))  # [b, l_t]
                self.single_loss = tf.reduce_sum(self.single_ce * self.output_mask_ngpu[i]) / \
                    tf.reduce_sum(self.output_mask_ngpu[i])  # scalar
            gpu_pred_ids.append(self.single_pred_ids)
            gpu_logits.append(self.single_logits)
            gpu_train_encoded.append(self.encoder_output)
            gpu_loss.append(self.single_loss)
            gpu_out_embed.append(self.out_embed)
            params = tf.trainable_variables()
            grads = tf.gradients(self.single_loss, params)
            grads = list(zip(grads, params))
            gpu_grads.append(grads)
            # gpu_ops.append([loss, logits])
            self.pred_ids = tf.concat(gpu_pred_ids, axis=0)
            self.logits = tf.concat(gpu_logits, axis=0)
            self.loss = tf.reduce_mean(gpu_loss)
            self.encoder_output = tf.concat(gpu_train_encoded, axis=0)
            self.out_embed = tf.concat(gpu_out_embed, axis=0)
    # end for
    grads = sum_grads(gpu_grads)
    grads = [g for g, p in grads]
    self.total_gradient = grads
    tf.summary.scalar('loss', self.loss)
def _build_summarization_model(self):
    is_training = self.is_training
    config = self.bert_config
    self._add_placeholders()
    '''Creates a classification model.'''
    model = modeling.BertModel(
        config=self.bert_config,
        is_training=is_training,
        input_ids=self.input_ids,
        input_mask=self.input_mask,
        token_type_ids=self.segment_ids,
        use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?
    encoder_output = model.get_sequence_output()  # [b, l_s, h]
    self.encoder_output = encoder_output
    hidden_size = encoder_output.shape[2].value
    self.enc_attn_bias = attention_bias(self.input_mask, 'masking')
    with tf.variable_scope('bert', reuse=True):
        with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
            # Perform embedding lookup on the target word ids.
            # The embedding input of the decoder has to be output_ids.
            (self.out_embed, self.bert_embeddings) = embedding_lookup(
                input_ids=self.output_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name='word_embeddings',
                use_one_hot_embeddings=False)
            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.out_embed = embedding_postprocessor(
                input_tensor=self.out_embed,
                use_token_type=True,
                token_type_ids=self.out_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)
    with tf.variable_scope('decode'):
        self.decoder_weights = self.bert_embeddings
        self.masked_out_embed = self.out_embed * tf.expand_dims(self.output_mask, -1)
        self.dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'causal')
        self.decoder_input = tf.pad(self.masked_out_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
        self.all_att_weights, self.decoder_output = transformer_decoder(
            self.decoder_input, self.encoder_output, self.dec_attn_bias,
            self.enc_attn_bias, self.hps)
        # [b, l_t, e] => [b*l_t, v]
        self.decoder_output = tf.reshape(self.decoder_output, [-1, hidden_size])
        self.vocab_logits = tf.matmul(self.decoder_output, self.decoder_weights, False, True)  # (b * l_t, v)
        self.vocab_probs = tf.nn.softmax(self.vocab_logits)  # [b * l_t, v]
        vocab_size = len(self.hps.vocab)
        with tf.variable_scope('copy'):
            self.logits = calculate_final_logits(
                self.decoder_output, self.all_att_weights, self.vocab_probs,
                self.input_ids_oo, self.max_out_oovs, self.input_mask,
                vocab_size, self.tiled_len)  # [b * l_t, v + v']
    with tf.variable_scope('loss'):
        self.ce = smooth_cross_entropy(
            self.logits,
            self.output_label,
            self.hps.label_smoothing)
        self.ce = tf.reshape(self.ce, tf.shape(self.output_label))  # [b, l_t]
        self.loss = tf.reduce_sum(self.ce * self.output_mask) / tf.reduce_sum(self.output_mask)  # scalar
        tf.summary.scalar('loss', self.loss)
def _build_summarization_model(self):
    is_training = self.is_training
    config = self.bert_config
    self._add_placeholders()
    '''Creates a classification model.'''
    model = modeling.BertModel(
        config=self.bert_config,
        is_training=is_training,
        input_ids=self.input_ids,
        input_mask=self.input_mask,
        token_type_ids=self.segment_ids,
        use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?
    encoder_output = model.get_sequence_output()  # [b, l_s, h]
    self.encoder_output = encoder_output
    hidden_size = encoder_output.shape[2].value
    self.enc_attn_bias = attention_bias(self.input_mask, 'masking')
    with tf.variable_scope('bert', reuse=True):
        with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
            # Perform embedding lookup on the target word ids.
            (self.out_embed, self.bert_embeddings) = embedding_lookup(
                input_ids=self.output_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name='word_embeddings',
                use_one_hot_embeddings=False)
            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.out_embed = embedding_postprocessor(
                input_tensor=self.out_embed,
                use_token_type=True,
                token_type_ids=self.out_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)
    with tf.variable_scope('decoder_1'):
        self.decoder_weights = self.bert_embeddings
        self.masked_out_embed = self.out_embed * tf.expand_dims(self.output_mask, -1)
        self.decoder_input = tf.pad(self.masked_out_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
        # ################################################### decoding train - 1
        self.dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'causal')
        self.all_att_weights, self.decoder_output_1 = transformer_decoder(
            self.decoder_input, self.encoder_output, self.dec_attn_bias,
            self.enc_attn_bias, self.hps, scope='decoder_1')
        # [b, l_t, e] => [b*l_t, v]
        self.decoder_output_1 = tf.reshape(self.decoder_output_1, [-1, hidden_size])
        self.vocab_logits = tf.matmul(self.decoder_output_1, self.decoder_weights, False, True)  # (b*l_t, v)
        self.vocab_probs = tf.nn.softmax(self.vocab_logits)  # [b * l_t, v]
        vocab_size = len(self.hps.vocab)
        with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
            self.logits = calculate_final_logits(
                self.decoder_output_1, self.all_att_weights, self.vocab_probs,
                self.input_ids_oo, self.max_out_oovs, self.input_mask,
                vocab_size, self.tiled_len)  # [b * l_t, v + v']
        self.pred_ids = tf.reshape(tf.argmax(self.logits, axis=-1), [self.batch_size, -1])  # [b, l_t]

    # as the draft may contain copied words, we transform them to UNK first
    draft = self.trunct(self.pred_ids)
    draft = tf.cast(draft, tf.int32)
    changed_ids = tf.concat([self.output_ids, draft], axis=-1)  # [b, 2 * l_t]
    change_segment_ids = tf.zeros_like(changed_ids, dtype=tf.int32, name='change_segment_ids')

    def calcu_id_len(input_tensor):
        step_size = tf.constant(0.001)
        a = input_tensor
        res = tf.argmin(tf.cast(a, tf.float32) +
                        tf.cast(tf.range(0, tf.shape(a)[-1]), tf.float32) * step_size, -1) + 1
        return res

    pred_ids_len = calcu_id_len(draft)  # [b,]
    pred_ids_mask_w_draft = tf.sequence_mask(pred_ids_len, maxlen=tf.shape(draft)[1],
                                             dtype=tf.float32)  # [b, l_t]
    pred_ids_mask_wo_draft = tf.zeros_like(draft, dtype=tf.float32)
    pred_ids_mask = tf.cond(self.feed_draft,
                            lambda: pred_ids_mask_w_draft,
                            lambda: pred_ids_mask_wo_draft)
    change_ids_mask = tf.concat([self.output_mask, pred_ids_mask], axis=-1)  # [b, 2 * l_t]
    transferred_mask = create_attention_mask_from_input_mask(changed_ids, change_ids_mask)  # [b, 2 * l_t, 2 * l_t]
    self.second_dec_attn_bias_w_draft = attention_bias(tf.shape(changed_ids)[1], 'mask_draft')
    self.second_dec_attn_bias_wo_draft = attention_bias(tf.shape(changed_ids)[1], 'mask_draft_warmup')
    self.second_dec_attn_bias = tf.cond(self.feed_draft,
                                        lambda: self.second_dec_attn_bias_w_draft,
                                        lambda: self.second_dec_attn_bias_wo_draft)  # [1, 1, 2 * l_t, 2 * l_t]
    self.second_dec_attn_bias = tf.tile(self.second_dec_attn_bias,
                                        [tf.shape(self.output_ids)[0], 1, 1, 1])  # [b, 1, 2 * l_t, 2 * l_t]
    self.second_dec_attn_bias = self.second_dec_attn_bias * tf.expand_dims(transferred_mask, 1)  # [b, 1, 2 * l_t, 2 * l_t]
    dec_model = modeling.BertModel(
        config=self.bert_config,
        is_training=is_training,
        input_ids=changed_ids,
        input_mask=tf.squeeze(self.second_dec_attn_bias, 1),  # [b, 2 * l_t, 2 * l_t]
        token_type_ids=change_segment_ids,
        scope='bert',
        reuse=tf.AUTO_REUSE,
        use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?
    dec_output = dec_model.get_sequence_output()  # [b, l_t, h]
    self.out_embed = dec_output
    self.masked_out_embed = self.out_embed * tf.expand_dims(change_ids_mask, -1)
    self.decoder_input = tf.pad(self.masked_out_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
    # ################################################### decoding train - 2
    with tf.variable_scope('decoder_2'):
        self.all_att_weights, self.decoder_output_2 = transformer_decoder(
            self.decoder_input, self.encoder_output,
            (1.0 - self.second_dec_attn_bias) * -1e9,
            self.enc_attn_bias, self.hps, scope='decoder_2')
        # [b, 2 * l_t, e] => [b, l_t, e] => [b * l_t, v]
        target_len = tf.shape(self.output_ids)[1]
        # keep only the ground-truth part of the attention weights & decoder output
        self.all_att_weights[-1] = self.all_att_weights[-1][:, :target_len, :]  # [b, l_t, l_s]
        self.decoder_output_2 = self.decoder_output_2[:, :target_len, :]  # [b, l_t, v]
        self.decoder_output_2 = tf.reshape(self.decoder_output_2, [-1, hidden_size])
        self.second_logits = tf.matmul(self.decoder_output_2, self.decoder_weights, False, True)  # (b*l_t, v)
        self.vocab_probs_2 = tf.nn.softmax(self.second_logits)  # [b * l_t, v]
        with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
            self.second_logits = calculate_final_logits(
                self.decoder_output_2, self.all_att_weights, self.vocab_probs_2,
                self.input_ids_oo, self.max_out_oovs, self.input_mask,
                vocab_size, self.tiled_len)  # [b * l_t, v + v']
    with tf.variable_scope('loss'):
        self.ce = smooth_cross_entropy(self.logits, self.output_label, self.hps.label_smoothing)
        self.ce = tf.reshape(self.ce, tf.shape(self.output_label))  # [b, l_t]
        mle_1 = tf.reduce_sum(self.ce * self.output_mask, -1) / tf.reduce_sum(self.output_mask, -1)  # [b]
        self.first_loss = tf.reduce_sum(self.ce * self.output_mask, -1) / tf.reduce_sum(self.output_mask, -1)
        self.first_loss = tf.reduce_mean(self.first_loss)  # scalar
        self.second_ce = smooth_cross_entropy(self.second_logits, self.output_label, self.hps.label_smoothing)
        self.second_ce = tf.reshape(self.second_ce, tf.shape(self.output_label))  # [b, l_t]
        output_mask = self.output_mask
        mle_2 = tf.reduce_sum(self.second_ce * output_mask, -1) / tf.reduce_sum(output_mask, -1)  # [b]
        self.second_loss = tf.reduce_mean(
            tf.reduce_sum(self.second_ce * output_mask, -1) / tf.reduce_sum(output_mask, -1))  # scalar
        mle = mle_1 + mle_2
        self.rl_loss = tf.reduce_mean(mle * self.reward)  # scalar
        self.ml_loss = self.first_loss + self.second_loss
        self.loss = self.hps.rl_lambda * self.rl_loss + (1 - self.hps.rl_lambda) * self.ml_loss
        tf.summary.scalar('first_loss', self.first_loss)
        tf.summary.scalar('second_loss', self.second_loss)
        tf.summary.scalar('reward', tf.reduce_mean(self.reward))
        tf.summary.scalar('rl_loss', self.rl_loss)
        tf.summary.scalar('ml_loss', self.ml_loss)
        tf.summary.scalar('loss', self.loss)
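# Worked note on `calcu_id_len` above (same logic, separate sketch): the draft is assumed to be
# padded with id 0, so for a row such as [17, 52, 9, 0, 0, 0] the tiny increasing offset
# range(l_t) * 0.001 breaks ties among the trailing zeros, argmin lands on the FIRST zero
# (index 3), and the +1 yields 4, i.e. the index of the first pad position plus one. This is
# then used as the length for tf.sequence_mask over the draft.
def _first_zero_length_sketch(ids):
    ids_f = tf.cast(ids, tf.float32)
    offsets = tf.cast(tf.range(0, tf.shape(ids_f)[-1]), tf.float32) * 0.001
    return tf.argmin(ids_f + offsets, axis=-1) + 1  # [b,]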
def decode_infer_2(self):
    # stage 2: word-level inference using the decoded sequence
    # l_t = decoded sequence length
    # during inference, the following graph is constructed for beam search
    hidden_size = self.bert_config.hidden_size
    with self.graph.as_default():
        target_sequence = tf.squeeze(self.decode_seq, axis=1)
        # as the draft may contain copied words, we transform them to UNK first
        draft = self.trunct(target_sequence)
        target_sequence = self.trunct(target_sequence)
        target_length = self.decode_length
        tgt_mask = tf.sequence_mask(target_length, maxlen=tf.shape(target_sequence)[1],
                                    dtype=tf.float32)  # [b, q']
        draft = tf.cast(draft, tf.int32)
        changed_ids = tf.concat([target_sequence, draft], axis=-1)  # [b, 2 * l_t]
        change_segment_ids = tf.zeros_like(changed_ids, dtype=tf.int32, name='change_segment_ids')

        def calcu_id_len(input_tensor):
            step_size = tf.constant(0.001)
            a = input_tensor
            res = tf.argmin(tf.cast(a, tf.float32) +
                            tf.cast(tf.range(0, tf.shape(a)[-1]), tf.float32) * step_size, -1) + 1
            return res

        pred_ids_len = calcu_id_len(draft)  # [b,]
        pred_ids_mask_w_draft = tf.sequence_mask(pred_ids_len, maxlen=tf.shape(draft)[1],
                                                 dtype=tf.float32)  # [b, l_t]
        pred_ids_mask = pred_ids_mask_w_draft
        change_ids_mask = tf.concat([tgt_mask, pred_ids_mask], axis=-1)  # [b, 2 * l_t]
        transferred_mask = create_attention_mask_from_input_mask(changed_ids, change_ids_mask)  # [b, 2 * l_t, 2 * l_t]
        second_dec_attn_bias_w_draft = attention_bias(tf.shape(changed_ids)[1], 'mask_draft')
        second_dec_attn_bias = second_dec_attn_bias_w_draft  # [1, 1, 2 * l_t, 2 * l_t]
        second_dec_attn_bias = tf.tile(second_dec_attn_bias,
                                       [tf.shape(target_sequence)[0], 1, 1, 1])  # [b, 1, 2 * l_t, 2 * l_t]
        second_dec_attn_bias = second_dec_attn_bias * tf.expand_dims(transferred_mask, 1)  # [b, 1, 2 * l_t, 2 * l_t]
        is_training = self.is_training
        dec_model = modeling.BertModel(
            config=self.bert_config,
            is_training=is_training,
            input_ids=changed_ids,
            input_mask=tf.squeeze(second_dec_attn_bias, 1),  # [b, 2 * l_t, 2 * l_t]
            token_type_ids=change_segment_ids,
            scope='bert',
            reuse=tf.AUTO_REUSE,
            use_one_hot_embeddings=self.hps.use_tpu)
        dec_output = dec_model.get_sequence_output()  # [b, l_t, h]
        tgt_embed = dec_output
        with tf.variable_scope('decoder_2', reuse=True):
            masked_tgt_embed = tgt_embed * tf.expand_dims(change_ids_mask, -1)
            infer_decoder_input = tf.pad(masked_tgt_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
            all_att_weights, decoder_output = transformer_decoder(
                infer_decoder_input, self.enc_output,
                (1.0 - second_dec_attn_bias) * -1e9,
                self.enc_attn_bias, self.hps, scope='decoder_2')
            # [b, l_t, e] => [b*l_t, v]
            target_len = tf.shape(target_sequence)[1]
            # keep only the ground-truth part of the attention weights & decoder output
            all_att_weights[-1] = all_att_weights[-1][:, :target_len, :]  # [b, l_t, l_s]
            decoder_output = decoder_output[:, :target_len, :]  # [b, l_t, v]
            decoder_output = tf.reshape(decoder_output, [-1, hidden_size])
            second_logits = tf.matmul(decoder_output, self.decoder_weights, False, True)  # (b*l_t, v)
            vocab_probs = tf.nn.softmax(second_logits)  # [b * l_t, v]
            vocab_size = len(self.hps.vocab)
            with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
                logits = calculate_final_logits(
                    decoder_output, all_att_weights, vocab_probs,
                    self.input_ids_oo, self.max_out_oovs, self.input_mask,
                    vocab_size, self.infer_tiled_len)  # [b * l_t, v + v']
                second_log_prob = tf.log(logits)
            # (b, l_t, v)
            extend_vocab_size = tf.add(tf.constant(vocab_size), self.max_out_oovs)
            second_log_prob = tf.reshape(second_log_prob,
                                         [-1, tf.shape(target_sequence)[1], extend_vocab_size])
            second_log_id = tf.argmax(second_log_prob, axis=-1)  # (b, l_t)
        return second_log_id
def decode_infer_2_bs(self):
    # beam search version
    # during second-stage decoding we already have a decoded sequence, so there is no need
    # to feed state (no incremental decoding)
    # at time i, we compute the i-th attn_bias and take the i-th decoder output
    with self.graph.as_default():
        target_sequence = tf.reshape(self.decode_seq,
                                     [self.hps.eval_batch_size * self.hps.beam_size, -1])
        # as the draft may contain copied words, we transform them to UNK first
        draft = self.trunct(target_sequence)
        target_sequence = self.trunct(target_sequence)
        target_length = self.decode_length
        tgt_mask = tf.sequence_mask(target_length, maxlen=tf.shape(target_sequence)[1],
                                    dtype=tf.float32)  # [b, l_t]
        draft = tf.cast(draft, tf.int32)
        changed_ids = tf.concat([target_sequence, draft], axis=-1)  # [b, 2 * l_t]
        change_segment_ids = tf.zeros_like(changed_ids, dtype=tf.int32, name='change_segment_ids')

        def calcu_id_len(input_tensor):
            step_size = tf.constant(0.001)
            a = input_tensor
            res = tf.argmin(tf.cast(a, tf.float32) +
                            tf.cast(tf.range(0, tf.shape(a)[-1]), tf.float32) * step_size, -1) + 1
            return res

        pred_ids_len = calcu_id_len(draft)  # [b,]
        pred_ids_mask_w_draft = tf.sequence_mask(pred_ids_len, maxlen=tf.shape(draft)[1],
                                                 dtype=tf.float32)  # [b, l_t]
        pred_ids_mask = pred_ids_mask_w_draft
        change_ids_mask = tf.concat([tgt_mask, pred_ids_mask], axis=-1)  # [b, 2 * l_t]
        transferred_mask = create_attention_mask_from_input_mask(changed_ids, change_ids_mask)  # [b, 2 * l_t, 2 * l_t]
        second_dec_attn_bias_w_draft = attention_bias(tf.shape(changed_ids)[1], 'mask_draft')
        second_dec_attn_bias = second_dec_attn_bias_w_draft  # [1, 1, 2 * l_t, 2 * l_t]
        second_dec_attn_bias = tf.tile(second_dec_attn_bias,
                                       [tf.shape(target_sequence)[0], 1, 1, 1])  # [b, 1, 2 * l_t, 2 * l_t]
        second_dec_attn_bias = second_dec_attn_bias * tf.expand_dims(transferred_mask, 1)  # [b, 1, 2 * l_t, 2 * l_t]
        is_training = self.is_training
        dec_model = modeling.BertModel(
            config=self.bert_config,
            is_training=is_training,
            input_ids=changed_ids,
            input_mask=tf.squeeze(second_dec_attn_bias, 1),  # [b, 2 * l_t, 2 * l_t]
            token_type_ids=change_segment_ids,
            scope='bert',
            reuse=tf.AUTO_REUSE,
            use_one_hot_embeddings=self.hps.use_tpu)
        dec_output = dec_model.get_sequence_output()  # [b, l_t, h]
        tgt_embed = dec_output
        with tf.variable_scope('decoder_2', reuse=True):
            # [b, l_t, e]
            masked_tgt_embed = tgt_embed * tf.expand_dims(change_ids_mask, -1)
            infer_decoder_input = tf.pad(masked_tgt_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
            all_att_weights, decoder_output = transformer_decoder(
                infer_decoder_input, self.enc_output,
                (1.0 - second_dec_attn_bias) * -1e9,
                self.enc_attn_bias, self.hps, scope='decoder_2')
            decoder_output = decoder_output[:, self.time_step, :]  # [b * beam, e]
            all_att_weights[-1] = all_att_weights[-1][:, self.time_step, :]
            second_logits = tf.matmul(decoder_output, self.decoder_weights, False, True)  # (b*beam, v)
            vocab_probs = tf.nn.softmax(second_logits)  # [b * beam, v]
            vocab_size = len(self.hps.vocab)
            with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
                logits = calculate_final_logits(
                    decoder_output, all_att_weights, vocab_probs,
                    self.input_ids_oo, self.max_out_oovs, self.input_mask,
                    vocab_size, 1)  # [b * beam, v + v']
                second_log_prob = tf.log(logits)
        return second_log_prob
def _build_summarization_model(self):
    is_training = self.is_training
    config = self.bert_config
    self._add_placeholders()
    '''Creates a classification model.'''
    model = modeling.BertModel(
        config=self.bert_config,
        is_training=is_training,
        input_ids=self.input_ids,
        input_mask=self.input_mask,
        token_type_ids=self.segment_ids,
        scope='bert',
        use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?
    encoder_output = model.get_sequence_output()  # [b, l_s, h]
    self.encoder_output = encoder_output
    hidden_size = encoder_output.shape[2].value
    self.enc_attn_bias = attention_bias(self.input_mask, 'masking')
    with tf.variable_scope('bert', reuse=True):
        with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
            # Perform embedding lookup on the target word ids.
            (self.out_embed, self.bert_embeddings) = embedding_lookup(
                input_ids=self.output_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name='word_embeddings',
                use_one_hot_embeddings=False)
            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.out_embed = embedding_postprocessor(
                input_tensor=self.out_embed,
                use_token_type=True,
                token_type_ids=self.out_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)
    '''Creates a lm model.'''
    lm_model = modeling.BertModel(
        config=self.bert_config,
        is_training=is_training,
        input_ids=self.lm_output_ids,
        input_mask=self.lm_output_mask,
        token_type_ids=self.lm_out_segment_ids,
        use_one_hot_embeddings=self.hps.use_tpu,  # use_one_hot_embeddings=Flags.tpu ?
        scope='bert',
        reuse=True,
        on_cpu=True,
        use_lm=True,
        lm_position=self.lm_position)
    with tf.variable_scope('decoder'):
        self.decoder_weights = self.bert_embeddings
        self.masked_out_embed = self.out_embed * tf.expand_dims(self.output_mask, -1)
        self.dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'causal')
        self.decoder_input = tf.pad(self.masked_out_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
        self.all_att_weights, self.decoder_output = transformer_decoder(
            self.decoder_input, self.encoder_output, self.dec_attn_bias,
            self.enc_attn_bias, self.hps, scope='t_decoder')
        # [b, l_t, e] => [b*l_t, v]
        self.decoder_output = tf.reshape(self.decoder_output, [-1, hidden_size])
        self.logits = tf.matmul(self.decoder_output, self.decoder_weights, False, True)  # (b*l_t, v)
        self.second_dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'cloze_bias')
        self.all_att_weights, self.decoder_output = transformer_decoder(
            self.decoder_input, self.encoder_output, self.second_dec_attn_bias,
            self.enc_attn_bias, self.hps, scope='t_decoder', reuse=True)
        # [b, l_t, e] => [b*l_t, v]
        self.decoder_output = tf.reshape(self.decoder_output, [-1, hidden_size])
        self.second_logits = tf.matmul(self.decoder_output, self.decoder_weights, False, True)  # (b*l_t, v)
        self.lm_logits = lm_model.get_lm_output()  # (b*l_t, v)
        self.lm_logits = tf.stop_gradient(self.lm_logits)
        # use the pooled output to represent the original input sequence
        self.pooled_output = model.get_pooled_output()  # (b, e)
        self.article_representation = tf.reshape(
            tf.tile(tf.expand_dims(self.pooled_output, 1), [1, tf.shape(self.lm_output_ids)[1], 1]),
            [-1, self.bert_config.hidden_size])  # (b * l_t, e)
        self.masked_summary_representation = lm_model.get_pooled_output()  # (b * l_t, e)
        self.concated_representation = tf.concat(
            [self.article_representation, self.masked_summary_representation], axis=-1)  # (b * l_t, 2e)
        self.lm_prob = tf.nn.sigmoid(linear(self.concated_representation, 1))  # (b * l_t, 1)
        self.final_second_logits = self.lm_prob * self.second_logits + (1 - self.lm_prob) * self.lm_logits
    with tf.variable_scope('loss'):
        self.ce = smoothed_softmax_cross_entropy(
            self.logits, self.output_ids, self.hps.label_smoothing, True)
        self.ce = tf.reshape(self.ce, tf.shape(self.output_ids))  # [b, l_t]
        self.first_loss = tf.reduce_sum(self.ce * self.output_mask) / tf.reduce_sum(self.output_mask)  # scalar
        self.second_ce = smoothed_softmax_cross_entropy(
            self.final_second_logits, self.output_ids, self.hps.label_smoothing, True)
        self.second_ce = tf.reshape(self.second_ce, tf.shape(self.output_ids))  # [b, l_t]
        self.second_loss = tf.reduce_sum(self.second_ce * self.output_mask) / tf.reduce_sum(self.output_mask)  # scalar
        self.loss = self.first_loss + self.second_loss
        tf.summary.scalar('first_loss', self.first_loss)
        tf.summary.scalar('second_loss', self.second_loss)
        tf.summary.scalar('loss', self.loss)
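# Note on the fusion step above: the per-position gate lm_prob, a sigmoid over the concatenated
# pooled article representation and masked-summary representation, interpolates between the
# decoder's cloze logits and the gradient-stopped LM logits,
#     final_second_logits = lm_prob * second_logits + (1 - lm_prob) * lm_logits,
# so positions where the gate is high rely on the decoder's second pass while the remainder
# fall back toward the language model.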
def _build_summarization_model(self):
    is_training = self.is_training
    config = self.bert_config
    self._add_placeholders()
    '''Creates a classification model.'''
    model = modeling.BertModel(
        config=self.bert_config,
        is_training=is_training,
        input_ids=self.input_ids,
        input_mask=self.input_mask,
        token_type_ids=self.segment_ids,
        use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?
    batch_size = tf.shape(self.output_ids)[0]
    tgt_len = tf.shape(self.output_ids)[1]
    rnd_value = tf.random_uniform([batch_size, tgt_len], minval=0, maxval=1)
    replace_to_mask = rnd_value < 0.1
    replace_to_random_word = (rnd_value > 0.1) & (rnd_value < 0.15)
    keep_the_word = rnd_value < 0.2
    keep_the_word = tf.cast(keep_the_word, tf.float32)
    all_mask = tf.ones_like(self.output_ids, dtype=tf.int32)
    mask_id = self.hps.maskId
    all_mask = all_mask * mask_id
    all_random_word_id = tf.random_uniform([batch_size, tgt_len], minval=999, maxval=30521, dtype=tf.int32)
    changed_ids = self.output_ids
    changed_ids = tf.where(replace_to_mask, all_mask, changed_ids)
    changed_ids = tf.where(replace_to_random_word, all_random_word_id, changed_ids)
    dec_model = modeling.BertModel(
        config=self.bert_config,
        is_training=is_training,
        input_ids=changed_ids,
        input_mask=self.output_mask,
        token_type_ids=self.out_segment_ids,
        scope='bert',
        reuse=tf.AUTO_REUSE,
        use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?
    encoder_output = model.get_sequence_output()  # [b, l_s, h]
    dec_output = dec_model.get_sequence_output()  # [b, l_t, h]
    self.encoder_output = encoder_output
    hidden_size = encoder_output.shape[2].value
    self.enc_attn_bias = attention_bias(self.input_mask, 'masking')
    with tf.variable_scope('bert', reuse=True):
        with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
            # Perform embedding lookup on the target word ids.
            (self.out_embed, self.bert_embeddings) = embedding_lookup(
                input_ids=self.output_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name='word_embeddings',
                use_one_hot_embeddings=False)
            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.out_embed = embedding_postprocessor(
                input_tensor=self.out_embed,
                use_token_type=True,
                token_type_ids=self.out_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)
    with tf.variable_scope('decoder'):
        self.decoder_weights = self.bert_embeddings
        self.masked_out_embed = self.out_embed * tf.expand_dims(self.output_mask, -1)
        self.decoder_input = tf.pad(self.masked_out_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
        # ################################################### decoding train - 1
        self.dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'causal')
        self.all_att_weights, self.decoder_output_1 = transformer_decoder(
            self.decoder_input, self.encoder_output, self.dec_attn_bias,
            self.enc_attn_bias, self.hps, scope='t_decoder')
        # [b, l_t, e] => [b*l_t, v]
        self.decoder_output_1 = tf.reshape(self.decoder_output_1, [-1, hidden_size])
        self.vocab_logits = tf.matmul(self.decoder_output_1, self.decoder_weights, False, True)  # (b*l_t, v)
        self.vocab_probs = tf.nn.softmax(self.vocab_logits)  # [b * l_t, v]
        vocab_size = len(self.hps.vocab)
        with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
            self.logits = calculate_final_logits(
                self.decoder_output_1, self.all_att_weights, self.vocab_probs,
                self.input_ids_oo, self.max_out_oovs, self.input_mask,
                vocab_size, self.tiled_len)  # [b * l_t, v + v']
        self.pred_ids = tf.reshape(tf.argmax(self.logits, axis=-1), [self.batch_size, -1])  # [b, l_t]
        self.out_embed = dec_output
        self.masked_out_embed = self.out_embed * tf.expand_dims(self.output_mask, -1)
        self.decoder_input = tf.pad(self.masked_out_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
        # ################################################### decoding train - 2
        self.second_dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'cloze_bias')
        self.all_att_weights, self.decoder_output_2 = transformer_decoder(
            self.decoder_input, self.encoder_output, self.second_dec_attn_bias,
            self.enc_attn_bias, self.hps, scope='t_decoder', reuse=True)
        # [b, l_t, e] => [b*l_t, v]
        self.decoder_output_2 = tf.reshape(self.decoder_output_2, [-1, hidden_size])
        self.second_logits = tf.matmul(self.decoder_output_2, self.decoder_weights, False, True)  # (b*l_t, v)
        self.vocab_probs_2 = tf.nn.softmax(self.second_logits)  # [b * l_t, v]
        with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
            self.second_logits = calculate_final_logits(
                self.decoder_output_2, self.all_att_weights, self.vocab_probs_2,
                self.input_ids_oo, self.max_out_oovs, self.input_mask,
                vocab_size, self.tiled_len)  # [b * l_t, v + v']
        # ################################################### decoding train - 3
        # self.all_att_weights, self.decoder_output_3 = transformer_decoder(self.decoder_input, self.encoder_output,
        #                                                                   self.sent_level_attn_bias,
        #                                                                   self.enc_attn_bias,
        #                                                                   self.hps, scope='t_decoder', reuse=True)
        # # [b, l_t, e] => [b*l_t, v]
        # self.decoder_output_3 = tf.reshape(self.decoder_output_3, [-1, hidden_size])
        # self.third_logits = tf.matmul(self.decoder_output_3, self.decoder_weights, False, True)  # (b*l_t, v)
        # self.vocab_probs_3 = tf.nn.softmax(self.third_logits)  # [b * l_t, v]
        # with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
        #     self.third_logits = calculate_final_logits(self.decoder_output_3, self.all_att_weights,
        #                                                self.vocab_probs_3,
        #                                                self.input_ids_oo, self.max_out_oovs, self.input_mask,
        #                                                vocab_size,
        #                                                self.tiled_len)  # [b * l_t, v + v']
    with tf.variable_scope('loss'):
        self.ce = smooth_cross_entropy(
            self.logits,
            self.output_label,
            self.hps.label_smoothing)
        self.ce = tf.reshape(self.ce, tf.shape(self.output_label))  # [b, l_t]
        mle_1 = tf.reduce_sum(self.ce * self.output_mask, -1) / tf.reduce_sum(self.output_mask, -1)  # [b]
        self.first_loss = tf.reduce_sum(self.ce * self.output_mask, -1) / tf.reduce_sum(self.output_mask, -1)
        self.first_loss = tf.reduce_mean(self.first_loss)  # scalar
        self.second_ce = smooth_cross_entropy(
            self.second_logits,
            self.output_label,
            self.hps.label_smoothing)
        self.second_ce = tf.reshape(self.second_ce, tf.shape(self.output_label))  # [b, l_t]
        output_mask = self.output_mask * keep_the_word
        mle_2 = tf.reduce_sum(self.second_ce * output_mask, -1) / (tf.reduce_sum(output_mask, -1) + 0.001)  # [b]
        self.second_loss = tf.reduce_mean(
            tf.reduce_sum(self.second_ce * output_mask, -1) / (tf.reduce_sum(output_mask, -1) + 0.001))  # scalar
        # self.ce = smooth_cross_entropy(
        #     self.third_logits,
        #     self.output_ids,
        #     self.hps.label_smoothing)
        #
        # self.ce = tf.reshape(self.ce, tf.shape(self.output_label))  # [b, l_t]
        #
        # mle_3 = tf.reduce_sum(self.ce * self.output_mask, -1) / tf.reduce_sum(self.output_mask, -1)  # [b]
        #
        # self.third_loss = tf.reduce_mean(tf.reduce_sum(self.ce * self.output_mask, -1) / tf.reduce_sum(
        #     self.output_mask, -1))  # scalar
        mle = mle_1 + mle_2
        self.rl_loss = tf.reduce_mean(mle * self.reward)  # scalar
        self.ml_loss = self.first_loss + self.second_loss
        self.loss = self.hps.rl_lambda * self.rl_loss + (1 - self.hps.rl_lambda) * self.ml_loss
        tf.summary.scalar('first_loss', self.first_loss)
        tf.summary.scalar('second_loss', self.second_loss)
        # tf.summary.scalar('third_loss', self.third_loss)
        tf.summary.scalar('reward', tf.reduce_mean(self.reward))
        tf.summary.scalar('rl_loss', self.rl_loss)
        tf.summary.scalar('ml_loss', self.ml_loss)
        tf.summary.scalar('loss', self.loss)
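# Hedged sketch of the loss primitive used above: `smooth_cross_entropy` is assumed to be
# standard label-smoothed cross entropy over the (extended) output distribution. The repo's
# actual implementation may differ in details (e.g. whether it consumes probabilities or
# logits); this sketch is for orientation only and operates on probabilities.
def _smooth_cross_entropy_sketch(probs, labels, vocab_size, smoothing):
    """probs: [N, V] predicted probabilities; labels: [N] int ids; vocab_size: python int."""
    on_value = 1.0 - smoothing
    off_value = smoothing / float(vocab_size - 1)
    soft_targets = tf.one_hot(labels, depth=vocab_size, on_value=on_value, off_value=off_value)
    return -tf.reduce_sum(soft_targets * tf.log(probs + 1e-10), axis=-1)  # [N]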
def decode_infer_2(self):
    # stage 2: word-level inference using the decoded sequence
    # l_t = decoded sequence length
    # during inference, the following graph is constructed for beam search
    hidden_size = self.bert_config.hidden_size
    with self.graph.as_default():
        target_sequence = tf.squeeze(self.decode_seq, axis=1)
        target_sequence = self.trunct(target_sequence)
        target_length = self.decode_length
        target_seg_ids = tf.zeros_like(target_sequence, dtype=tf.int32, name='target_seg_ids_infer_2')
        tgt_mask = tf.sequence_mask(target_length, maxlen=tf.shape(target_sequence)[1],
                                    dtype=tf.float32)  # [b, q']
        is_training = self.is_training
        dec_model = modeling.BertModel(
            config=self.bert_config,
            is_training=is_training,
            input_ids=target_sequence,
            input_mask=tgt_mask,
            token_type_ids=target_seg_ids,
            scope='bert',
            reuse=tf.AUTO_REUSE,
            use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?
        dec_output = dec_model.get_sequence_output()  # [b, l_t, h]
        tgt_embed = dec_output
        # with tf.variable_scope('bert', reuse=True):
        #     with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
        #         # Perform embedding lookup on the target word ids.
        #         (tgt_embed, _) = embedding_lookup(
        #             input_ids=target_sequence,
        #             vocab_size=config.vocab_size,
        #             embedding_size=config.hidden_size,
        #             initializer_range=config.initializer_range,
        #             word_embedding_name='word_embeddings',
        #             use_one_hot_embeddings=False)
        #
        #         # Add positional embeddings and token type embeddings, then layer
        #         # normalize and perform dropout.
        #         tgt_embed = embedding_postprocessor(
        #             input_tensor=tgt_embed,
        #             use_token_type=True,
        #             token_type_ids=target_seg_ids,
        #             token_type_vocab_size=config.type_vocab_size,
        #             token_type_embedding_name='token_type_embeddings',
        #             use_position_embeddings=True,
        #             position_embedding_name='position_embeddings',
        #             initializer_range=config.initializer_range,
        #             max_position_embeddings=config.max_position_embeddings,
        #             dropout_prob=config.hidden_dropout_prob)
        with tf.variable_scope('decoder', reuse=True):
            masked_tgt_embed = tgt_embed * tf.expand_dims(tgt_mask, -1)
            second_dec_attn_bias = attention_bias(tf.shape(masked_tgt_embed)[1], 'cloze_bias')
            infer_decoder_input = tf.pad(masked_tgt_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
            all_att_weights, decoder_output = transformer_decoder(
                infer_decoder_input, self.enc_output, second_dec_attn_bias,
                self.enc_attn_bias, self.hps, scope='t_decoder')
            # [b, l_t, e] => [b*l_t, v]
            decoder_output = tf.reshape(decoder_output, [-1, hidden_size])
            second_logits = tf.matmul(decoder_output, self.decoder_weights, False, True)  # (b*l_t, v)
            vocab_probs = tf.nn.softmax(second_logits)  # [b * l_t, v]
            vocab_size = len(self.hps.vocab)
            with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
                logits = calculate_final_logits(
                    decoder_output, all_att_weights, vocab_probs,
                    self.input_ids_oo, self.max_out_oovs, self.input_mask,
                    vocab_size, self.infer_tiled_len)  # [b * l_t, v + v']
                second_log_prob = tf.log(logits)
            # (b, l_t, v)
            extend_vocab_size = tf.add(tf.constant(vocab_size), self.max_out_oovs)
            second_log_prob = tf.reshape(second_log_prob,
                                         [-1, tf.shape(target_sequence)[1], extend_vocab_size])
            second_log_id = tf.argmax(second_log_prob, axis=-1)  # (b, l_t)
        return second_log_id
def _build_summarization_model(self):
    is_training = self.is_training
    config = self.bert_config
    self._add_placeholders()
    '''Creates a classification model.'''
    model = modeling.BertModel(
        config=self.bert_config,
        is_training=is_training,
        input_ids=self.input_ids,
        input_mask=self.input_mask,
        token_type_ids=self.segment_ids,
        use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?
    encoder_output = model.get_sequence_output()  # [b, l_s, h]
    self.sentence_rep = tf.expand_dims(model.get_pooled_output(), axis=1)  # [b, 1, h]
    self.encoder_output = encoder_output
    hidden_size = encoder_output.shape[2].value
    self.enc_attn_bias = attention_bias(self.input_mask, 'masking')
    with tf.variable_scope('bert', reuse=True):
        with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
            # Perform embedding lookup on the target word ids.
            (self.out_embed, self.bert_embeddings) = embedding_lookup(
                input_ids=self.output_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name='word_embeddings',
                use_one_hot_embeddings=False)
            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.out_embed = embedding_postprocessor(
                input_tensor=self.out_embed,
                use_token_type=True,
                token_type_ids=self.out_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)
    with tf.variable_scope('decoder'):
        self.decoder_weights = self.bert_embeddings
        self.masked_out_embed = self.out_embed * tf.expand_dims(self.output_mask, -1)
        self.decoder_input = tf.pad(self.masked_out_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
        # ################################################### decoding train - 1
        self.dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'causal')
        self.all_att_weights, self.decoder_output_1 = transformer_decoder(
            self.decoder_input, self.encoder_output, self.dec_attn_bias,
            self.enc_attn_bias, self.hps, scope='t_decoder')
        sentence_rep = tf.tile(self.sentence_rep, [1, tf.shape(self.decoder_output_1)[1], 1])  # [b, l_t, e]
        # [b, l_t, e] => [b*l_t, v]
        copy_rep_1 = tf.concat([sentence_rep, self.decoder_output_1], axis=-1)  # [b, l_t, 2 * e]
        self.decoder_output_1 = tf.reshape(self.decoder_output_1, [-1, hidden_size])
        self.vocab_logits = tf.matmul(self.decoder_output_1, self.decoder_weights, False, True)  # (b*l_t, v)
        self.vocab_probs = tf.nn.softmax(self.vocab_logits)  # [b * l_t, v]
        vocab_size = len(self.hps.vocab)
        with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
            copy_rep_1 = tf.reshape(copy_rep_1, [-1, hidden_size * 2])
            self.logits = calculate_final_logits(
                copy_rep_1, self.all_att_weights, self.vocab_probs,
                self.input_ids_oo, self.max_out_oovs, self.input_mask,
                vocab_size, self.tiled_len)  # [b * l_t, v + v']
        self.pred_ids = tf.reshape(tf.argmax(self.logits, axis=-1), [self.batch_size, -1])  # [b, l_t]
        # ################################################### decoding train - 2
        self.second_dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'cloze_bias')
        self.all_att_weights, self.decoder_output_2 = transformer_decoder(
            self.decoder_input, self.encoder_output, self.second_dec_attn_bias,
            self.enc_attn_bias, self.hps, scope='t_decoder', reuse=True)
        # [b, l_t, e] => [b*l_t, v]
        copy_rep_2 = tf.concat([sentence_rep, self.decoder_output_2], axis=-1)
        self.decoder_output_2 = tf.reshape(self.decoder_output_2, [-1, hidden_size])
        self.second_logits = tf.matmul(self.decoder_output_2, self.decoder_weights, False, True)  # (b*l_t, v)
        self.vocab_probs_2 = tf.nn.softmax(self.second_logits)  # [b * l_t, v]
        with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
            copy_rep_2 = tf.reshape(copy_rep_2, [-1, hidden_size * 2])
            self.second_logits = calculate_final_logits(
                copy_rep_2, self.all_att_weights, self.vocab_probs_2,
                self.input_ids_oo, self.max_out_oovs, self.input_mask,
                vocab_size, self.tiled_len)  # [b * l_t, v + v']
        # ################################################### decoding train - 3
        self.all_att_weights, self.decoder_output_3 = transformer_decoder(
            self.decoder_input, self.encoder_output, self.sent_level_attn_bias,
            self.enc_attn_bias, self.hps, scope='t_decoder', reuse=True)
        # [b, l_t, e] => [b*l_t, v]
        copy_rep_3 = tf.concat([sentence_rep, self.decoder_output_3], axis=-1)
        self.decoder_output_3 = tf.reshape(self.decoder_output_3, [-1, hidden_size])
        self.third_logits = tf.matmul(self.decoder_output_3, self.decoder_weights, False, True)  # (b*l_t, v)
        self.vocab_probs_3 = tf.nn.softmax(self.third_logits)  # [b * l_t, v]
        with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
            copy_rep_3 = tf.reshape(copy_rep_3, [-1, hidden_size * 2])
            self.third_logits = calculate_final_logits(
                copy_rep_3, self.all_att_weights, self.vocab_probs_3,
                self.input_ids_oo, self.max_out_oovs, self.input_mask,
                vocab_size, self.tiled_len)  # [b * l_t, v + v']
    with tf.variable_scope('loss'):
        self.ce = smooth_cross_entropy(
            self.logits,
            self.output_label,
            self.hps.label_smoothing)
        self.ce = tf.reshape(self.ce, tf.shape(self.output_label))  # [b, l_t]
        mle_1 = tf.reduce_sum(self.ce * self.output_mask, -1) / tf.reduce_sum(self.output_mask, -1)  # [b]
        self.first_loss = tf.reduce_sum(self.ce * self.output_mask, -1) / tf.reduce_sum(self.output_mask, -1)
        self.first_loss = tf.reduce_mean(self.first_loss)  # scalar
        self.second_ce = smooth_cross_entropy(
            self.second_logits,
            self.output_label,
            self.hps.label_smoothing)
        self.second_ce = tf.reshape(self.second_ce, tf.shape(self.output_label))  # [b, l_t]
        mle_2 = tf.reduce_sum(self.second_ce * self.output_mask, -1) / tf.reduce_sum(self.output_mask, -1)  # [b]
        self.second_loss = tf.reduce_mean(
            tf.reduce_sum(self.second_ce * self.output_mask, -1) / tf.reduce_sum(self.output_mask, -1))  # scalar
        self.ce = smooth_cross_entropy(
            self.third_logits,
            self.output_ids,
            self.hps.label_smoothing)
        self.ce = tf.reshape(self.ce, tf.shape(self.output_label))  # [b, l_t]
        mle_3 = tf.reduce_sum(self.ce * self.output_mask, -1) / tf.reduce_sum(self.output_mask, -1)  # [b]
        self.third_loss = tf.reduce_mean(
            tf.reduce_sum(self.ce * self.output_mask, -1) / tf.reduce_sum(self.output_mask, -1))  # scalar
        mle = mle_1 + mle_2 + mle_3
        self.rl_loss = tf.reduce_mean(mle * self.reward)  # scalar
        self.ml_loss = self.first_loss + self.second_loss + self.third_loss
        self.loss = self.hps.rl_lambda * self.rl_loss + (1 - self.hps.rl_lambda) * self.ml_loss
        tf.summary.scalar('first_loss', self.first_loss)
        tf.summary.scalar('second_loss', self.second_loss)
        tf.summary.scalar('third_loss', self.third_loss)
        tf.summary.scalar('reward', tf.reduce_mean(self.reward))
        tf.summary.scalar('rl_loss', self.rl_loss)
        tf.summary.scalar('ml_loss', self.ml_loss)
        tf.summary.scalar('loss', self.loss)
def decode_2(self):
    config = self.bert_config
    hidden_size = self.encoder_output.shape[2].value
    # as the draft may contain copied words, we transform them to UNK first
    draft = self.trunct(self.pred_ids)
    draft = tf.cast(draft, tf.int32)
    changed_ids = tf.concat([self.output_ids, draft], axis=-1)  # [b, 2 * l_t]
    change_segment_ids = tf.zeros_like(changed_ids, dtype=tf.int32, name='change_segment_ids')

    def calcu_id_len(input_tensor):
        step_size = tf.constant(0.001)
        a = input_tensor
        res = tf.argmin(tf.cast(a, tf.float32) +
                        tf.cast(tf.range(0, tf.shape(a)[-1]), tf.float32) * step_size, -1) + 1
        return res

    pred_ids_len = calcu_id_len(draft)  # [b,]
    pred_ids_mask_w_draft = tf.sequence_mask(pred_ids_len, maxlen=tf.shape(draft)[1],
                                             dtype=tf.float32)  # [b, l_t]
    pred_ids_mask_wo_draft = tf.zeros_like(draft, dtype=tf.float32)
    pred_ids_mask = tf.cond(self.feed_draft,
                            lambda: pred_ids_mask_w_draft,
                            lambda: pred_ids_mask_wo_draft)
    change_ids_mask = tf.concat([self.output_mask, pred_ids_mask], axis=-1)  # [b, 2 * l_t]
    transferred_mask = create_attention_mask_from_input_mask(changed_ids, change_ids_mask)  # [b, 2 * l_t, 2 * l_t]
    self.second_dec_attn_bias_w_draft = attention_bias(tf.shape(changed_ids)[1], 'mask_draft')
    self.second_dec_attn_bias_wo_draft = attention_bias(tf.shape(changed_ids)[1], 'mask_draft_warmup')
    self.second_dec_attn_bias = tf.cond(self.feed_draft,
                                        lambda: self.second_dec_attn_bias_w_draft,
                                        lambda: self.second_dec_attn_bias_wo_draft)  # [1, 1, 2 * l_t, 2 * l_t]
    self.second_dec_attn_bias = tf.tile(self.second_dec_attn_bias,
                                        [tf.shape(self.output_ids)[0], 1, 1, 1])  # [b, 1, 2 * l_t, 2 * l_t]
    self.second_dec_attn_bias = self.second_dec_attn_bias * tf.expand_dims(transferred_mask, 1)  # [b, 1, 2 * l_t, 2 * l_t]
    with tf.variable_scope('bert', reuse=True):
        with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
            # Perform embedding lookup on the target word ids.
            (out_embed, bert_embeddings) = embedding_lookup(
                input_ids=changed_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name='word_embeddings',
                use_one_hot_embeddings=False)
            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            out_embed = embedding_postprocessor(
                input_tensor=out_embed,
                use_token_type=True,
                token_type_ids=change_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)
    masked_out_embed = out_embed * tf.expand_dims(change_ids_mask, -1)
    self.decoder_input = tf.pad(masked_out_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
    # ################################################### decoding train - 2
    with tf.variable_scope('decoder_2'):
        self.all_att_weights, self.decoder_output_2 = transformer_decoder(
            self.decoder_input, self.encoder_output,
            (1.0 - self.second_dec_attn_bias) * -1e9,
            self.enc_attn_bias, self.hps, scope='decoder_2')
        # [b, 2 * l_t, e] => [b, l_t, e] => [b * l_t, v]
        target_len = tf.shape(self.output_ids)[1]
        # keep only the ground-truth part of the attention weights & decoder output
        self.all_att_weights[-1] = self.all_att_weights[-1][:, :target_len, :]  # [b, l_t, l_s]
        self.decoder_output_2 = self.decoder_output_2[:, :target_len, :]  # [b, l_t, v]
        self.decoder_output_2 = tf.reshape(self.decoder_output_2, [-1, hidden_size])
        self.second_logits = tf.matmul(self.decoder_output_2, self.decoder_weights, False, True)  # (b*l_t, v)
        self.vocab_probs_2 = tf.nn.softmax(self.second_logits)  # [b * l_t, v]
        self.second_logits = self.vocab_probs_2
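# Note on the `(1.0 - self.second_dec_attn_bias) * -1e9` term passed to transformer_decoder
# above: by this point second_dec_attn_bias has been reduced to a 0/1 visibility mask of shape
# [b, 1, 2*l_t, 2*l_t] (1 = the key position may be attended), so the expression converts it
# into the additive bias form the decoder expects: 0 for visible positions, -1e9 for blocked
# ones. Tiny numeric sketch, assuming that convention:
#     mask = [[1, 1, 0]]  ->  bias = [[0.0, 0.0, -1e9]]
# and softmax(scores + bias) then assigns near-zero probability to the blocked position.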