def _build_summarization_model(self):
    """Build the multi-GPU training graph for topic-aware abstractive summarization.

    Pipeline per GPU tower:
      1. BERT encodes the source tokens -> encoder_output [b, l_s, h].
      2. A "topic word memory" is built by embedding topic-word ids with the
         (reused) BERT embedding table.
      3. The encoder output attends over the topic memory
         (encoder_topic_attention) to produce the topic-enriched encoder state.
      4. A transformer decoder with THREE attention sources (self, encoder,
         topic memory) produces decoder states.
      5. A two-source copy mechanism (`calculate_two_copy_logits`) mixes vocab
         probabilities with copy distributions over source tokens and topic words.
      6. Per-tower loss is label-smoothed cross entropy, masked and normalized
         by the number of real target tokens.

    Tower outputs are gathered and exposed as:
      self.pred_ids, self.logits, self.loss (mean over towers),
      self.encoder_output, self.out_embed, self.total_gradient (summed grads).

    NOTE(review): indentation was reconstructed from a flattened source; the
    exact nesting of the 'copy'/'loss' scopes relative to 'decode', and whether
    the tower-concat statements sat inside or after the GPU loop, should be
    confirmed against the original file (final tensor values are the same
    either way, only graph tidiness / variable-name prefixes differ).
    """
    is_training = self.is_training
    config = self.bert_config
    # Per-tower collections, gathered after the GPU loop.
    gpu_pred_ids = []
    gpu_logits = []
    gpu_train_encoded = []
    gpu_loss = []
    gpu_out_embed = []
    gpu_grads = []
    self._add_placeholders()
    # Split the batch placeholders into n_gpu shards (one per tower).
    self._n_gpu_split_placeholders(self.hps.n_gpu)
    for i in range(self.hps.n_gpu):
        # First tower creates the variables; later towers reuse them.
        do_reuse = True if i > 0 else None
        with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(
                tf.get_variable_scope(), reuse=do_reuse):
            '''Creates a classification model.'''
            model = modeling.BertModel(
                config=self.bert_config,
                is_training=is_training,
                input_ids=self.input_ids_ngpu[i],
                input_mask=self.input_mask_ngpu[i],
                token_type_ids=self.segment_ids_ngpu[i],
                use_one_hot_embeddings=self.hps.use_tpu
            )  # use_one_hot_embeddings=Flags.tpu ?
            encoder_output = model.get_sequence_output()  # [b, l_s, h]
            # Attention bias that masks padded source positions.
            self.enc_attn_bias = attention_bias(self.input_mask_ngpu[i], 'masking')
            hidden_size = encoder_output.shape[2].value
            encoder_out_length = tf.shape(encoder_output)[1]  # NOTE(review): unused below
            """Get topic word memory"""
            out_dict_size = len(self.hps.vocab_out)  ## for topic word memory
            # Embed the topic-word ids with the (reused) BERT embedding table.
            with tf.variable_scope('bert', reuse=True):
                with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                    # Perform embedding lookup on the topic word ids.
                    (topic_word_memory, _) = embedding_lookup(
                        input_ids=self.topic_words_ids_ngpu[i],  # here the embedding input of decoder have to be output_ids
                        vocab_size=out_dict_size,  # decode dictionary modified
                        embedding_size=config.hidden_size,
                        initializer_range=config.initializer_range,
                        word_embedding_name='word_embeddings',
                        use_one_hot_embeddings=False)
                    # Add token type embeddings only (no positions: the memory
                    # is an unordered bag of topic words), then LN + dropout.
                    self.topic_word_memory = embedding_postprocessor(
                        input_tensor=topic_word_memory,
                        use_token_type=True,
                        token_type_ids=self.mem_segment_ids_ngpu[i],
                        token_type_vocab_size=config.type_vocab_size,
                        token_type_embedding_name='token_type_embeddings',
                        use_position_embeddings=False,
                        position_embedding_name='position_embeddings',
                        initializer_range=config.initializer_range,
                        max_position_embeddings=config.max_position_embeddings,
                        dropout_prob=config.hidden_dropout_prob)
            # Bias masking out padded slots of the topic memory.
            self.topic_attn_bias = attention_bias(
                self.topic_words_mask_ngpu[i], 'masking')
            """encoder_topic_attention"""
            # Let every encoder position attend over the topic-word memory.
            with tf.variable_scope("encoder_topic_attention"):
                params = self.hps
                y = multihead_attention(
                    layer_process(encoder_output, params.layer_preprocess),
                    self.topic_word_memory,
                    self.topic_attn_bias,
                    params.num_heads,
                    params.attention_key_channels or params.hidden_size,
                    params.attention_value_channels or params.hidden_size,
                    params.hidden_size,
                    params.attention_dropout)
                self.encoder_output = y["outputs"]
            """decoder"""
            with tf.variable_scope('bert', reuse=True):
                with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                    # Perform embedding lookup on the target word ids.
                    (self.out_embed, self.bert_embeddings) = embedding_lookup(
                        input_ids=self.output_ids_ngpu[i],  # here the embedding input of decoder have to be output_ids
                        vocab_size=out_dict_size,  # decode dictionary modified
                        embedding_size=config.hidden_size,
                        initializer_range=config.initializer_range,
                        word_embedding_name='word_embeddings',
                        use_one_hot_embeddings=False)
                    # Add positional embeddings and token type embeddings, then
                    # layer normalize and perform dropout.
                    self.out_embed = embedding_postprocessor(
                        input_tensor=self.out_embed,
                        use_token_type=True,
                        token_type_ids=self.out_segment_ids_ngpu[i],
                        token_type_vocab_size=config.type_vocab_size,
                        token_type_embedding_name='token_type_embeddings',
                        use_position_embeddings=True,
                        position_embedding_name='position_embeddings',
                        initializer_range=config.initializer_range,
                        max_position_embeddings=config.max_position_embeddings,
                        dropout_prob=config.hidden_dropout_prob)
            with tf.variable_scope('decode'):
                # Tie output projection to the BERT input embedding table.
                self.decoder_weights = self.bert_embeddings
                # Zero out embeddings of padded target positions.
                self.masked_out_embed = self.out_embed * tf.expand_dims(
                    self.output_mask_ngpu[i], -1)
                # Causal (lower-triangular) bias for autoregressive decoding.
                self.dec_attn_bias = attention_bias(
                    tf.shape(self.masked_out_embed)[1], 'causal')
                # Prepend a zero vector and drop the last step (teacher forcing).
                self.decoder_input = tf.pad(
                    self.masked_out_embed,
                    [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
                # Decoder attends to: its own prefix, the (topic-enriched)
                # encoder output, and the topic-word memory.
                self.all_att_weights1, self.all_att_weights2, self.decoder_output = transformer_decoder_three(
                    self.decoder_input, self.encoder_output,
                    self.topic_word_memory, self.dec_attn_bias,
                    self.enc_attn_bias, self.topic_attn_bias, self.hps)
                # [b, l_t, e] => [b*l_t, v]
                self.decoder_output = tf.reshape(self.decoder_output,
                                                 [-1, hidden_size])
                self.vocab_logits = tf.matmul(self.decoder_output,
                                              self.decoder_weights,
                                              False, True)  # (b * l_t, v)
                self.vocab_probs = tf.nn.softmax(self.vocab_logits)  # [b * l_t, v]
                # vocab_size = len(self.hps.vocab)
            with tf.variable_scope('copy'):
                # Mix vocab probs with copy distributions over BOTH the source
                # tokens (weights1) and the topic words (weights2); extends the
                # output space by up to max_out_oovs OOV slots.
                self.single_logits = calculate_two_copy_logits(
                    self.decoder_output, self.all_att_weights1,
                    self.vocab_probs, self.input_ids_oo_ngpu[i],
                    self.max_out_oovs, self.input_mask_ngpu[i],
                    out_dict_size, self.tiled_len, self.all_att_weights2,
                    self.topic_words_ids_ngpu[i],
                    self.topic_words_mask_ngpu[i])  # [b * l_t, v + v']
                self.single_pred_ids = tf.reshape(
                    tf.argmax(self.single_logits, axis=-1),
                    [self.batch_size, -1])
            with tf.variable_scope('loss'):
                self.single_ce = smooth_cross_entropy(
                    self.single_logits,
                    self.output_label_ngpu[i],
                    self.hps.label_smoothing)
                self.single_ce = tf.reshape(
                    self.single_ce,
                    tf.shape(self.output_label_ngpu[i]))  # [b, l_t]
                # Mask padding, then normalize by the real-token count.
                self.single_loss = tf.reduce_sum(
                    self.single_ce * self.output_mask_ngpu[i]) / tf.reduce_sum(
                        self.output_mask_ngpu[i])  # scalar
            # Collect this tower's outputs.
            gpu_pred_ids.append(self.single_pred_ids)
            gpu_logits.append(self.single_logits)
            gpu_train_encoded.append(self.encoder_output)
            gpu_loss.append(self.single_loss)
            gpu_out_embed.append(self.out_embed)
            # Per-tower gradients of the tower loss w.r.t. all trainables.
            params = tf.trainable_variables()
            grads = tf.gradients(self.single_loss, params)
            grads = list(zip(grads, params))
            gpu_grads.append(grads)
            # gpu_ops.append([loss, logits])
    # end for
    # Gather tower outputs along the batch axis; average the tower losses.
    self.pred_ids = tf.concat(gpu_pred_ids, axis=0)
    self.logits = tf.concat(gpu_logits, axis=0)
    self.loss = tf.reduce_mean(gpu_loss)
    self.encoder_output = tf.concat(gpu_train_encoded, axis=0)
    self.out_embed = tf.concat(gpu_out_embed, axis=0)
    # Sum (grad, var) pairs across towers, then keep gradients only.
    grads = sum_grads(gpu_grads)
    grads = [g for g, p in grads]
    self.total_gradient = grads
    tf.summary.scalar('loss', self.loss)
def _build_summarization_model(self):
    """Build the single-device summarization graph: BERT encoder + transformer
    decoder + copy mechanism.

    Flow:
      1. BERT encodes the source -> encoder_output [b, l_s, h].
      2. Target ids are embedded with the reused BERT embedding table
         (positions + token types), masked and shifted right for teacher forcing.
      3. `transformer_decoder` attends over the prefix and the encoder output.
      4. `calculate_final_logits` mixes softmax vocab probabilities with a copy
         distribution over source tokens (extended by max_out_oovs OOV slots).
      5. Loss is label-smoothed cross entropy, masked and normalized by the
         number of real target tokens; logged as the 'loss' summary scalar.

    Exposes: self.encoder_output, self.enc_attn_bias, self.out_embed,
    self.bert_embeddings, self.logits, self.ce, self.loss.

    NOTE(review): indentation was reconstructed from a flattened source; the
    nesting of the 'copy'/'loss' scopes relative to 'decode' should be
    confirmed against the original file (it only affects variable-name
    prefixes, and the helpers here appear to create no variables).
    """
    is_training = self.is_training
    config = self.bert_config
    self._add_placeholders()
    '''Creates a classification model.'''
    model = modeling.BertModel(
        config=self.bert_config,
        is_training=is_training,
        input_ids=self.input_ids,
        input_mask=self.input_mask,
        token_type_ids=self.segment_ids,
        use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?
    encoder_output = model.get_sequence_output()  # [b, l_s, h]
    self.encoder_output = encoder_output
    hidden_size = encoder_output.shape[2].value
    # Attention bias that masks padded source positions.
    self.enc_attn_bias = attention_bias(self.input_mask, 'masking')
    with tf.variable_scope('bert', reuse=True):
        with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
            # Perform embedding lookup on the target word ids.
            (self.out_embed, self.bert_embeddings) = embedding_lookup(
                input_ids=self.output_ids,  # here the embedding input of decoder have to be output_ids
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name='word_embeddings',
                use_one_hot_embeddings=False)
            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.out_embed = embedding_postprocessor(
                input_tensor=self.out_embed,
                use_token_type=True,
                token_type_ids=self.out_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)
    with tf.variable_scope('decode'):
        # Tie output projection to the BERT input embedding table.
        self.decoder_weights = self.bert_embeddings
        # Zero out embeddings of padded target positions.
        self.masked_out_embed = self.out_embed * tf.expand_dims(self.output_mask, -1)
        # Causal (lower-triangular) bias for autoregressive decoding.
        self.dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'causal')
        # Prepend a zero vector and drop the last step (teacher forcing).
        self.decoder_input = tf.pad(self.masked_out_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
        self.all_att_weights, self.decoder_output = transformer_decoder(
            self.decoder_input, self.encoder_output,
            self.dec_attn_bias, self.enc_attn_bias, self.hps)
        # [b, l_t, e] => [b*l_t, v]
        self.decoder_output = tf.reshape(self.decoder_output, [-1, hidden_size])
        self.vocab_logits = tf.matmul(self.decoder_output, self.decoder_weights, False, True)  # (b * l_t, v)
        self.vocab_probs = tf.nn.softmax(self.vocab_logits)  # [b * l_t, v]
        vocab_size = len(self.hps.vocab)
    with tf.variable_scope('copy'):
        # Mix vocab probs with a copy distribution over source tokens,
        # extending the output space by up to max_out_oovs OOV slots.
        self.logits = calculate_final_logits(
            self.decoder_output, self.all_att_weights, self.vocab_probs,
            self.input_ids_oo, self.max_out_oovs, self.input_mask,
            vocab_size, self.tiled_len)  # [b * l_t, v + v']
    with tf.variable_scope('loss'):
        self.ce = smooth_cross_entropy(
            self.logits,
            self.output_label,
            self.hps.label_smoothing)
        self.ce = tf.reshape(self.ce, tf.shape(self.output_label))  # [b, l_t]
        # Mask padding, then normalize by the real-token count.
        self.loss = tf.reduce_sum(self.ce * self.output_mask) / tf.reduce_sum(self.output_mask)  # scalar
        tf.summary.scalar('loss', self.loss)