Example #1
    def _build_summarization_model(self):
        is_training = self.is_training
        config = self.bert_config

        gpu_pred_ids = []
        gpu_logits = []
        gpu_train_encoded = []
        gpu_loss = []
        gpu_out_embed = []
        gpu_grads = []
        self._add_placeholders()
        self._n_gpu_split_placeholders(self.hps.n_gpu)

        for i in range(self.hps.n_gpu):
            do_reuse = True if i > 0 else None
            with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(
                    tf.get_variable_scope(), reuse=do_reuse):
                # Build the BERT encoder over the source sequence.
                model = modeling.BertModel(
                    config=self.bert_config,
                    is_training=is_training,
                    input_ids=self.input_ids_ngpu[i],
                    input_mask=self.input_mask_ngpu[i],
                    token_type_ids=self.segment_ids_ngpu[i],
                    use_one_hot_embeddings=self.hps.use_tpu
                )  # use_one_hot_embeddings=Flags.tpu ?
                encoder_output = model.get_sequence_output()  # [b, l_s, h]
                self.enc_attn_bias = attention_bias(self.input_mask_ngpu[i],
                                                    'masking')

                hidden_size = encoder_output.shape[2].value
                encoder_out_length = tf.shape(encoder_output)[1]
                """Get topic word memory"""
                out_dict_size = len(self.hps.vocab_out)
                ## for topic word memory
                with tf.variable_scope('bert', reuse=True):
                    with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                        # Look up embeddings for the topic word ids from the
                        # shared BERT word-embedding table.
                        (topic_word_memory, _) = embedding_lookup(
                            input_ids=self.topic_words_ids_ngpu[i],
                            vocab_size=out_dict_size,  # decoder-side vocabulary
                            embedding_size=config.hidden_size,
                            initializer_range=config.initializer_range,
                            word_embedding_name='word_embeddings',
                            use_one_hot_embeddings=False)
                        # Add positional embeddings and token type embeddings, then layer
                        # normalize and perform dropout.
                        self.topic_word_memory = embedding_postprocessor(
                            input_tensor=topic_word_memory,
                            use_token_type=True,
                            token_type_ids=self.mem_segment_ids_ngpu[i],
                            token_type_vocab_size=config.type_vocab_size,
                            token_type_embedding_name='token_type_embeddings',
                            use_position_embeddings=False,
                            position_embedding_name='position_embeddings',
                            initializer_range=config.initializer_range,
                            max_position_embeddings=config.max_position_embeddings,
                            dropout_prob=config.hidden_dropout_prob)
                self.topic_attn_bias = attention_bias(
                    self.topic_words_mask_ngpu[i], 'masking')

                """encoder_topic_attention"""
                with tf.variable_scope("encoder_topic_attention"):
                    params = self.hps
                    y = multihead_attention(
                        layer_process(encoder_output, params.layer_preprocess),
                        self.topic_word_memory,
                        self.topic_attn_bias,
                        params.num_heads,
                        params.attention_key_channels or params.hidden_size,
                        params.attention_value_channels or params.hidden_size,
                        params.hidden_size,
                        params.attention_dropout)
                self.encoder_output = y["outputs"]
                """decoder"""
                with tf.variable_scope('bert', reuse=True):
                    with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                        # Perform embedding lookup on the target word ids; the
                        # decoder's embedding input must be the output_ids.
                        (self.out_embed, self.bert_embeddings) = embedding_lookup(
                            input_ids=self.output_ids_ngpu[i],
                            vocab_size=out_dict_size,  # decoder-side vocabulary
                            embedding_size=config.hidden_size,
                            initializer_range=config.initializer_range,
                            word_embedding_name='word_embeddings',
                            use_one_hot_embeddings=False)

                        # Add positional embeddings and token type embeddings, then layer
                        # normalize and perform dropout.
                        self.out_embed = embedding_postprocessor(
                            input_tensor=self.out_embed,
                            use_token_type=True,
                            token_type_ids=self.out_segment_ids_ngpu[i],
                            token_type_vocab_size=config.type_vocab_size,
                            token_type_embedding_name='token_type_embeddings',
                            use_position_embeddings=True,
                            position_embedding_name='position_embeddings',
                            initializer_range=config.initializer_range,
                            max_position_embeddings=config.max_position_embeddings,
                            dropout_prob=config.hidden_dropout_prob)

                with tf.variable_scope('decode'):
                    self.decoder_weights = self.bert_embeddings
                    self.masked_out_embed = self.out_embed * tf.expand_dims(
                        self.output_mask_ngpu[i], -1)
                    self.dec_attn_bias = attention_bias(
                        tf.shape(self.masked_out_embed)[1], 'causal')
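                    # The 'causal' bias masks attention to future positions,
                    # and the pad-and-slice below shifts the gold targets right
                    # by one step (teacher forcing), so step t only conditions
                    # on tokens before t.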
                    self.decoder_input = tf.pad(
                        self.masked_out_embed,
                        [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
                    self.all_att_weights1, self.all_att_weights2, self.decoder_output = transformer_decoder_three(
                        self.decoder_input, self.encoder_output,
                        self.topic_word_memory, self.dec_attn_bias,
                        self.enc_attn_bias, self.topic_attn_bias, self.hps)
                    # [b, l_t, e] => [b*l_t, v]
                    self.decoder_output = tf.reshape(self.decoder_output,
                                                     [-1, hidden_size])
                    self.vocab_logits = tf.matmul(
                        self.decoder_output, self.decoder_weights,
                        transpose_b=True)  # (b * l_t, v)
                    self.vocab_probs = tf.nn.softmax(
                        self.vocab_logits)  # [b * l_t, v]
                    # vocab_size = len(self.hps.vocab)
                    with tf.variable_scope('copy'):
                        self.single_logits = calculate_two_copy_logits(
                            self.decoder_output, self.all_att_weights1,
                            self.vocab_probs, self.input_ids_oo_ngpu[i],
                            self.max_out_oovs, self.input_mask_ngpu[i],
                            out_dict_size, self.tiled_len,
                            self.all_att_weights2,
                            self.topic_words_ids_ngpu[i],
                            self.topic_words_mask_ngpu[i])  # [b * l_t, v + v']
                        self.single_pred_ids = tf.reshape(
                            tf.argmax(self.single_logits, axis=-1),
                            [self.batch_size, -1])

                with tf.variable_scope('loss'):
                    self.single_ce = smooth_cross_entropy(
                        self.single_logits, self.output_label_ngpu[i],
                        self.hps.label_smoothing)

                    self.single_ce = tf.reshape(
                        self.single_ce,
                        tf.shape(self.output_label_ngpu[i]))  # [b, l_t]

                    self.single_loss = tf.reduce_sum(
                        self.single_ce *
                        self.output_mask_ngpu[i]) / tf.reduce_sum(
                            self.output_mask_ngpu[i])  # scalar

                gpu_pred_ids.append(self.single_pred_ids)
                gpu_logits.append(self.single_logits)
                gpu_train_encoded.append(self.encoder_output)
                gpu_loss.append(self.single_loss)
                gpu_out_embed.append(self.out_embed)
                params = tf.trainable_variables()
                grads = tf.gradients(self.single_loss, params)
                grads = list(zip(grads, params))
                gpu_grads.append(grads)
                #gpu_ops.append([loss, logits])

        self.pred_ids = tf.concat(gpu_pred_ids, axis=0)
        self.logits = tf.concat(gpu_logits, axis=0)
        self.loss = tf.reduce_mean(gpu_loss)
        self.encoder_output = tf.concat(gpu_train_encoded, axis=0)
        self.out_embed = tf.concat(gpu_out_embed, axis=0)
        grads = sum_grads(gpu_grads)
        grads = [g for g, p in grads]
        self.total_gradient = grads

        tf.summary.scalar('loss', self.loss)
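
Note: Example #1 stops after collecting the averaged gradients in self.total_gradient; the optimizer step is built elsewhere in the project. Below is a minimal sketch of how those gradients could be applied, assuming TF 1.x graph mode; the helper name and the learning rate are illustrative assumptions, not code from this repository.

import tensorflow as tf  # TF 1.x, graph mode as in the examples above


def build_train_op_sketch(model, learning_rate=1e-4):
    """Hypothetical helper: re-pair model.total_gradient with the trainable
    variables it was computed against and apply them with Adam."""
    grads_and_vars = list(zip(model.total_gradient, tf.trainable_variables()))
    optimizer = tf.train.AdamOptimizer(learning_rate)
    return optimizer.apply_gradients(
        grads_and_vars, global_step=tf.train.get_or_create_global_step())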
Example #2
    def _build_summarization_model(self):
        is_training = self.is_training
        config = self.bert_config

        self._add_placeholders()

        # Build the BERT encoder over the source sequence.
        model = modeling.BertModel(
            config=self.bert_config,
            is_training=is_training,
            input_ids=self.input_ids,
            input_mask=self.input_mask,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?

        encoder_output = model.get_sequence_output()  # [b, l_s, h]

        self.encoder_output = encoder_output

        hidden_size = encoder_output.shape[2].value

        self.enc_attn_bias = attention_bias(self.input_mask, 'masking')

        with tf.variable_scope('bert', reuse=True):
            with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                # Perform embedding lookup on the target word ids.
                (self.out_embed, self.bert_embeddings) = embedding_lookup(
            input_ids=self.output_ids,  # the decoder's embedding input must be the output_ids
                    vocab_size=config.vocab_size,
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name='word_embeddings',
                    use_one_hot_embeddings=False)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.out_embed = embedding_postprocessor(
                    input_tensor=self.out_embed,
                    use_token_type=True,
                    token_type_ids=self.out_segment_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

        with tf.variable_scope('decode'):
            self.decoder_weights = self.bert_embeddings
            self.masked_out_embed = self.out_embed * tf.expand_dims(self.output_mask, -1)
            self.dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'causal')
            self.decoder_input = tf.pad(self.masked_out_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
            self.all_att_weights, self.decoder_output = transformer_decoder(self.decoder_input,
                                                                            self.encoder_output,
                                                                            self.dec_attn_bias,
                                                                            self.enc_attn_bias,
                                                                            self.hps)
            # [b, l_t, e] => [b*l_t, v]
            self.decoder_output = tf.reshape(self.decoder_output, [-1, hidden_size])
            self.vocab_logits = tf.matmul(self.decoder_output, self.decoder_weights,
                                          transpose_b=True)  # (b * l_t, v)
            self.vocab_probs = tf.nn.softmax(self.vocab_logits)  # [b * l_t, v]
            vocab_size = len(self.hps.vocab)
            with tf.variable_scope('copy'):
                self.logits = calculate_final_logits(self.decoder_output, self.all_att_weights, self.vocab_probs,
                                                     self.input_ids_oo, self.max_out_oovs, self.input_mask, vocab_size,
                                                     self.tiled_len)  # [b * l_t, v + v']

        with tf.variable_scope('loss'):
            self.ce = smooth_cross_entropy(
                self.logits,
                self.output_label,
                self.hps.label_smoothing)

            self.ce = tf.reshape(self.ce, tf.shape(self.output_label))  # [b, l_t]

            self.loss = tf.reduce_sum(self.ce * self.output_mask) / tf.reduce_sum(self.output_mask)  # scalar
            tf.summary.scalar('loss', self.loss)
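
Note: calculate_final_logits (and calculate_two_copy_logits in Example #1) are project helpers not shown on this page. As a rough illustration of the copy mechanism they implement, here is a minimal pointer-generator-style mixture; the function name, the fixed p_gen weight, and the argument layout are assumptions made for illustration, not the project's implementation.

import tensorflow as tf  # TF 1.x


def copy_distribution_sketch(vocab_probs, attn_weights, input_ids_oo,
                             extended_vsize, p_gen=0.5):
    """vocab_probs: [b, v] generation probabilities; attn_weights: [b, l_s]
    attention over source positions; input_ids_oo: [b, l_s] int32 source ids
    in the OOV-extended vocabulary; extended_vsize = v + number of OOVs."""
    batch_size = tf.shape(vocab_probs)[0]
    src_len = tf.shape(input_ids_oo)[1]
    # Give the generation distribution zero mass on the source-only OOV slots.
    extra_zeros = tf.zeros(
        [batch_size, extended_vsize - tf.shape(vocab_probs)[1]])
    gen_dist = tf.concat([vocab_probs, extra_zeros], axis=1)   # [b, v + v']
    # Scatter the attention mass onto the ids it points at; scatter_nd sums
    # duplicate indices, so repeated source tokens accumulate probability.
    batch_idx = tf.tile(tf.expand_dims(tf.range(batch_size), 1), [1, src_len])
    scatter_idx = tf.stack([batch_idx, input_ids_oo], axis=2)  # [b, l_s, 2]
    copy_dist = tf.scatter_nd(scatter_idx, attn_weights,
                              tf.stack([batch_size, extended_vsize]))
    # Fixed mixture weight here; the real model learns it from decoder state.
    return p_gen * gen_dist + (1.0 - p_gen) * copy_dist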