def encode(self):
     model = modeling.BertModel(
         config=self.bert_config,
         is_training=self.is_training,
         input_ids=self.input_ids,
         input_mask=self.input_mask,
         token_type_ids=self.segment_ids,
         use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?
     encoder_output = model.get_sequence_output()  # [b, l_s, h]
     self.encoder_output = encoder_output
     self.enc_attn_bias = attention_bias(self.input_mask, 'masking')
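attention_bias comes from the surrounding (THUMT-style) project and is not shown above. A minimal sketch, assuming its 'masking' mode turns the 0/1 padding mask into an additive bias that suppresses attention to padded source positions:

import tensorflow as tf  # TF 1.x


def attention_bias_masking_sketch(mask, neg_inf=-1e9):
    # mask: [batch, len_src], 1.0 on real tokens, 0.0 on padding.
    # Padded positions get a large negative value so that, once this bias is
    # added to the attention logits, their softmax weight is ~0.
    bias = (1.0 - tf.cast(mask, tf.float32)) * neg_inf
    return tf.expand_dims(tf.expand_dims(bias, 1), 1)  # [batch, 1, 1, len_src]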
Example #2
    def _build_classify_model(self):
        is_training = self.is_training
        num_labels = self.batcher.label_num

        input_ids, input_mask, segment_ids, label_ids = self._add_placeholders()
        self.input_ids, self.input_mask, self.segment_ids, self.label_ids = input_ids, input_mask, segment_ids, label_ids
        """Creates a classification model."""
        model = modeling.BertModel(config=self.bert_config,
                                   is_training=is_training,
                                   input_ids=input_ids,
                                   input_mask=input_mask,
                                   token_type_ids=segment_ids,
                                   use_one_hot_embeddings=self.hps.use_tpu
                                   )  #use_one_hot_embeddings=Flags.tpu ?

        output_layer = model.get_pooled_output()

        hidden_size = output_layer.shape[-1].value
        output_weights = tf.get_variable(
            "output_weights", [num_labels, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))

        output_bias = tf.get_variable("output_bias", [num_labels],
                                      initializer=tf.zeros_initializer())

        with tf.variable_scope("loss"):
            if is_training:
                # I.e., 0.1 dropout
                output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            log_probs = tf.nn.log_softmax(logits, axis=-1)

            one_hot_labels = tf.one_hot(label_ids,
                                        depth=num_labels,
                                        dtype=tf.float32)

            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                              axis=-1)
            loss = tf.reduce_mean(per_example_loss)

        self.loss, self.per_example_loss, self.logits \
            = loss, per_example_loss, logits
        self.predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
Example #3
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """Creates a classification model."""
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)

    # In the demo, we are doing a simple classification task on the entire
    # segment.
    #
    # If you want to use the token-level output, use model.get_sequence_output()
    # instead.
    output_layer = model.get_pooled_output()

    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable("output_bias", [num_labels],
                                  initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

        return (loss, per_example_loss, logits)
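A minimal sketch of how create_model above is typically driven in a TF 1.x graph; the config path, sequence length, label count, and learning rate below are illustrative, not taken from the original code:

import tensorflow as tf  # TF 1.x
import modeling  # the BERT modeling.py used above

bert_config = modeling.BertConfig.from_json_file("bert_config.json")  # illustrative path
max_seq_len, num_labels = 128, 2

input_ids = tf.placeholder(tf.int32, [None, max_seq_len], name="input_ids")
input_mask = tf.placeholder(tf.int32, [None, max_seq_len], name="input_mask")
segment_ids = tf.placeholder(tf.int32, [None, max_seq_len], name="segment_ids")
labels = tf.placeholder(tf.int32, [None], name="labels")

loss, per_example_loss, logits = create_model(
    bert_config, is_training=True, input_ids=input_ids, input_mask=input_mask,
    segment_ids=segment_ids, labels=labels, num_labels=num_labels,
    use_one_hot_embeddings=False)
train_op = tf.train.AdamOptimizer(learning_rate=2e-5).minimize(loss)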
Example #4
    def _build_seq2seq_model(self):
        is_training = self.is_training
        num_labels = self.batcher.label_num

        input_ids, input_mask, segment_ids, tgt_ids, decode_input, decode_mask = \
            self._add_placeholders()
        # `label_ids` and `tag_position` below presumably refer to the `tgt_ids`
        # and `decode_mask` placeholders; map them explicitly here.
        label_ids, tag_position = tgt_ids, decode_mask
        """Creates a token-level classification (sequence labeling) model."""
        model = modeling.BertModel(config=self.bert_config,
                                   is_training=is_training,
                                   input_ids=input_ids,
                                   input_mask=input_mask,
                                   token_type_ids=segment_ids,
                                   use_one_hot_embeddings=self.hps.use_tpu
                                   )  #use_one_hot_embeddings=Flags.tpu ?

        last_layer = model.get_sequence_output()

        self.last_layer = last_layer

        hidden_size = last_layer.shape[2].value
        seq_len = last_layer.shape[1].value
        batch_len = last_layer.shape[0].value

        output_weights = tf.get_variable(
            "output_weights", [num_labels, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        output_bias = tf.get_variable('output_bias', [num_labels],
                                      initializer=tf.zeros_initializer())
        with tf.variable_scope('loss'):
            if is_training:
                last_layer = tf.nn.dropout(last_layer, keep_prob=0.9)
            # last_layer = tf.reshape(last_layer, [batch_len*seq_len, hidden_size])
            last_layer = tf.reshape(last_layer, [-1, hidden_size])

            logits = tf.matmul(last_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

            log_probs = tf.nn.log_softmax(logits, axis=-1)
            log_probs = tf.reshape(log_probs, [-1, seq_len, num_labels])

            one_hot_tags = tf.one_hot(label_ids,
                                      depth=num_labels,
                                      dtype=tf.float32)

            per_seq_per_token_confidence = tf.reduce_sum(one_hot_tags *
                                                         log_probs,
                                                         axis=-1)

            per_seq_per_first_position_confidence_sum = tf.reduce_sum(
                per_seq_per_token_confidence * tag_position, axis=-1)

            average_loss_per_seq = -per_seq_per_first_position_confidence_sum / tf.reduce_sum(
                tag_position, axis=-1)
            loss = tf.reduce_mean(average_loss_per_seq)
            # loss = tf.Print(loss, [loss])

        self.loss, self.per_example_loss, self.logits \
            = loss, average_loss_per_seq, logits
        self.predictions = tf.reshape(tf.argmax(logits,
                                                axis=-1,
                                                output_type=tf.int32),
                                      shape=[-1, seq_len])
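The loss above averages the negative log-probability of the gold tag over the positions selected by tag_position (e.g. the first sub-token of each word), then averages over the batch. The same computation in NumPy, as a plain reference (assumes every sequence has at least one selected position):

import numpy as np


def masked_token_nll(log_probs, labels, position_mask):
    # log_probs: [batch, seq_len, num_labels], labels: [batch, seq_len] (int),
    # position_mask: [batch, seq_len] with 1.0 at positions that count.
    gold_ll = np.take_along_axis(log_probs, labels[..., None], axis=-1)[..., 0]
    per_seq = -(gold_ll * position_mask).sum(-1) / position_mask.sum(-1)
    return per_seq.mean()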
Example #5
    def _build_summarization_model(self):
        is_training = self.is_training
        config = self.bert_config

        gpu_pred_ids = []
        gpu_logits = []
        gpu_train_encoded = []
        gpu_loss = []
        gpu_out_embed = []
        gpu_grads = []
        self._add_placeholders()
        self._n_gpu_split_placeholders(self.hps.n_gpu)

        for i in range(self.hps.n_gpu):
            do_reuse = True if i > 0 else None
            with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(
                    tf.get_variable_scope(), reuse=do_reuse):
                '''Builds the summarization model (BERT encoder + topic memory + decoder) for this GPU tower.'''
                model = modeling.BertModel(
                    config=self.bert_config,
                    is_training=is_training,
                    input_ids=self.input_ids_ngpu[i],
                    input_mask=self.input_mask_ngpu[i],
                    token_type_ids=self.segment_ids_ngpu[i],
                    use_one_hot_embeddings=self.hps.use_tpu
                )  # use_one_hot_embeddings=Flags.tpu ?
                encoder_output = model.get_sequence_output()  # [b, l_s, h]
                self.enc_attn_bias = attention_bias(self.input_mask_ngpu[i],
                                                    'masking')

                hidden_size = encoder_output.shape[2].value
                encoder_out_length = tf.shape(encoder_output)[1]
                """Get topic word memory"""
                out_dict_size = len(self.hps.vocab_out)
                ## for topic word memory
                with tf.variable_scope('bert', reuse=True):
                    with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                        # Perform embedding lookup on the topic word ids.
                        (topic_word_memory, _) = embedding_lookup(
                            input_ids=self.topic_words_ids_ngpu[i],
                            vocab_size=out_dict_size,  # decode dictionary modified
                            embedding_size=config.hidden_size,
                            initializer_range=config.initializer_range,
                            word_embedding_name='word_embeddings',
                            use_one_hot_embeddings=False)
                        # Add token type embeddings (no position embeddings here),
                        # then layer normalize and perform dropout.
                        self.topic_word_memory = embedding_postprocessor(
                            input_tensor=topic_word_memory,
                            use_token_type=True,
                            token_type_ids=self.mem_segment_ids_ngpu[i],
                            token_type_vocab_size=config.type_vocab_size,
                            token_type_embedding_name='token_type_embeddings',
                            use_position_embeddings=False,
                            position_embedding_name='position_embeddings',
                            initializer_range=config.initializer_range,
                            max_position_embeddings=config.max_position_embeddings,
                            dropout_prob=config.hidden_dropout_prob)
                self.topic_attn_bias = attention_bias(
                    self.topic_words_mask_ngpu[i], 'masking')

                #print('topic_word_memory!!!!', self.topic_word_memory)
                #print('encoder_output_topic_emb!!!!', encoder_output_topic_emb)
                #print('self.topic_attn_bias!!!!', self.topic_attn_bias)
                #print('self.enc_attn_bias!!!!', self.enc_attn_bias)
                """encoder_topic_attention"""
                with tf.variable_scope("encoder_topic_attention"):
                    params = self.hps
                    y = multihead_attention(
                        layer_process(encoder_output, params.layer_preprocess),
                        self.topic_word_memory,
                        self.topic_attn_bias,
                        params.num_heads,
                        params.attention_key_channels or params.hidden_size,
                        params.attention_value_channels or params.hidden_size,
                        params.hidden_size,
                        params.attention_dropout)
                self.encoder_output = y["outputs"]
                """decoder"""
                with tf.variable_scope('bert', reuse=True):
                    with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                        # Perform embedding lookup on the target word ids.
                        (self.out_embed, self.bert_embeddings) = embedding_lookup(
                            input_ids=self.output_ids_ngpu[i],  # the decoder embedding input has to be output_ids
                            vocab_size=out_dict_size,  # decode dictionary modified
                            embedding_size=config.hidden_size,
                            initializer_range=config.initializer_range,
                            word_embedding_name='word_embeddings',
                            use_one_hot_embeddings=False)

                        # Add positional embeddings and token type embeddings, then layer
                        # normalize and perform dropout.
                        self.out_embed = embedding_postprocessor(
                            input_tensor=self.out_embed,
                            use_token_type=True,
                            token_type_ids=self.out_segment_ids_ngpu[i],
                            token_type_vocab_size=config.type_vocab_size,
                            token_type_embedding_name='token_type_embeddings',
                            use_position_embeddings=True,
                            position_embedding_name='position_embeddings',
                            initializer_range=config.initializer_range,
                            max_position_embeddings=config.max_position_embeddings,
                            dropout_prob=config.hidden_dropout_prob)

                with tf.variable_scope('decode'):
                    self.decoder_weights = self.bert_embeddings
                    self.masked_out_embed = self.out_embed * tf.expand_dims(
                        self.output_mask_ngpu[i], -1)
                    self.dec_attn_bias = attention_bias(
                        tf.shape(self.masked_out_embed)[1], 'causal')
                    self.decoder_input = tf.pad(
                        self.masked_out_embed,
                        [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
                    self.all_att_weights1, self.all_att_weights2, self.decoder_output = transformer_decoder_three(
                        self.decoder_input, self.encoder_output,
                        self.topic_word_memory, self.dec_attn_bias,
                        self.enc_attn_bias, self.topic_attn_bias, self.hps)
                    # [b, l_t, e] => [b*l_t, v]
                    self.decoder_output = tf.reshape(self.decoder_output,
                                                     [-1, hidden_size])
                    self.vocab_logits = tf.matmul(self.decoder_output,
                                                  self.decoder_weights, False,
                                                  True)  # (b * l_t, v)
                    self.vocab_probs = tf.nn.softmax(
                        self.vocab_logits)  # [b * l_t, v]
                    # vocab_size = len(self.hps.vocab)
                    with tf.variable_scope('copy'):
                        self.single_logits = calculate_two_copy_logits(
                            self.decoder_output, self.all_att_weights1,
                            self.vocab_probs, self.input_ids_oo_ngpu[i],
                            self.max_out_oovs, self.input_mask_ngpu[i],
                            out_dict_size, self.tiled_len,
                            self.all_att_weights2,
                            self.topic_words_ids_ngpu[i],
                            self.topic_words_mask_ngpu[i])  # [b * l_t, v + v']
                        self.single_pred_ids = tf.reshape(
                            tf.argmax(self.single_logits, axis=-1),
                            [self.batch_size, -1])

                with tf.variable_scope('loss'):
                    self.single_ce = smooth_cross_entropy(
                        self.single_logits, self.output_label_ngpu[i],
                        self.hps.label_smoothing)

                    self.single_ce = tf.reshape(
                        self.single_ce,
                        tf.shape(self.output_label_ngpu[i]))  # [b, l_t]

                    self.single_loss = tf.reduce_sum(
                        self.single_ce *
                        self.output_mask_ngpu[i]) / tf.reduce_sum(
                            self.output_mask_ngpu[i])  # scalar

                gpu_pred_ids.append(self.single_pred_ids)
                gpu_logits.append(self.single_logits)
                gpu_train_encoded.append(self.encoder_output)
                gpu_loss.append(self.single_loss)
                gpu_out_embed.append(self.out_embed)
                params = tf.trainable_variables()
                grads = tf.gradients(self.single_loss, params)
                grads = list(zip(grads, params))
                gpu_grads.append(grads)
                #gpu_ops.append([loss, logits])

        self.pred_ids = tf.concat(gpu_pred_ids, axis=0)
        self.logits = tf.concat(gpu_logits, axis=0)
        self.loss = tf.reduce_mean(gpu_loss)
        self.encoder_output = tf.concat(gpu_train_encoded, axis=0)
        self.out_embed = tf.concat(gpu_out_embed, axis=0)
        # end for
        grads = sum_grads(gpu_grads)
        grads = [g for g, p in grads]
        self.total_gradient = grads

        tf.summary.scalar('loss', self.loss)
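assign_to_gpu and sum_grads are utilities from the surrounding project and are not shown here. A minimal sketch of the gradient-merge step, assuming sum_grads combines the per-tower (grad, var) lists by averaging each variable's gradient:

import tensorflow as tf  # TF 1.x


def sum_grads_sketch(tower_grads):
    # tower_grads: one list of (grad, var) pairs per GPU, all towers sharing
    # the same variable order. Returns a single list of (avg_grad, var) pairs.
    merged = []
    for pairs in zip(*tower_grads):
        grads = [g for g, _ in pairs if g is not None]
        var = pairs[0][1]
        avg_grad = tf.add_n(grads) / float(len(grads)) if grads else None
        merged.append((avg_grad, var))
    return merged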
Example #6
    def _build_summarization_model(self):
        is_training = self.is_training
        config = self.bert_config

        self._add_placeholders()

        '''Builds the summarization model: BERT encoder + Transformer decoder with copy mechanism.'''
        model = modeling.BertModel(
            config=self.bert_config,
            is_training=is_training,
            input_ids=self.input_ids,
            input_mask=self.input_mask,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?

        encoder_output = model.get_sequence_output()  # [b, l_s, h]

        self.encoder_output = encoder_output

        hidden_size = encoder_output.shape[2].value

        self.enc_attn_bias = attention_bias(self.input_mask, 'masking')

        with tf.variable_scope('bert', reuse=True):
            with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                # Perform embedding lookup on the target word ids.
                (self.out_embed, self.bert_embeddings) = embedding_lookup(
                    input_ids=self.output_ids,  # the decoder embedding input has to be output_ids
                    vocab_size=config.vocab_size,
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name='word_embeddings',
                    use_one_hot_embeddings=False)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.out_embed = embedding_postprocessor(
                    input_tensor=self.out_embed,
                    use_token_type=True,
                    token_type_ids=self.out_segment_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

        with tf.variable_scope('decode'):
            self.decoder_weights = self.bert_embeddings
            self.masked_out_embed = self.out_embed * tf.expand_dims(self.output_mask, -1)
            self.dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'causal')
            self.decoder_input = tf.pad(self.masked_out_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
            self.all_att_weights, self.decoder_output = transformer_decoder(self.decoder_input,
                                                                            self.encoder_output,
                                                                            self.dec_attn_bias,
                                                                            self.enc_attn_bias,
                                                                            self.hps)
            # [b, l_t, e] => [b*l_t, v]
            self.decoder_output = tf.reshape(self.decoder_output, [-1, hidden_size])
            self.vocab_logits = tf.matmul(self.decoder_output, self.decoder_weights, False, True)  # (b * l_t, v)
            self.vocab_probs = tf.nn.softmax(self.vocab_logits)  # [b * l_t, v]
            vocab_size = len(self.hps.vocab)
            with tf.variable_scope('copy'):
                self.logits = calculate_final_logits(self.decoder_output, self.all_att_weights, self.vocab_probs,
                                                     self.input_ids_oo, self.max_out_oovs, self.input_mask, vocab_size,
                                                     self.tiled_len)  # [b * l_t, v + v']

        with tf.variable_scope('loss'):
            self.ce = smooth_cross_entropy(
                self.logits,
                self.output_label,
                self.hps.label_smoothing)

            self.ce = tf.reshape(self.ce, tf.shape(self.output_label))  # [b, l_t]

            self.loss = tf.reduce_sum(self.ce * self.output_mask) / tf.reduce_sum(self.output_mask)  # scalar
            tf.summary.scalar('loss', self.loss)
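The tf.pad(..., [[0, 0], [1, 0], [0, 0]])[:, :-1, :] idiom used for self.decoder_input prepends a zero embedding and drops the last time step, so the decoder input at position t is the embedding of target token t-1 (the usual teacher-forcing shift). A toy check of the idiom on its own:

import numpy as np
import tensorflow as tf  # TF 1.x

x = tf.constant(np.arange(1, 7, dtype=np.float32).reshape(1, 3, 2))  # [b=1, l_t=3, e=2]
shifted = tf.pad(x, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
with tf.Session() as sess:
    print(sess.run(shifted))  # [[[0. 0.] [1. 2.] [3. 4.]]] -- zeros in front, last step dropped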
Example #7
    def _build_summarization_model(self):
        is_training = self.is_training
        config = self.bert_config

        self._add_placeholders()
        '''Builds the two-stage (draft, then refine) summarization model.'''
        model = modeling.BertModel(config=self.bert_config,
                                   is_training=is_training,
                                   input_ids=self.input_ids,
                                   input_mask=self.input_mask,
                                   token_type_ids=self.segment_ids,
                                   use_one_hot_embeddings=self.hps.use_tpu
                                   )  # use_one_hot_embeddings=Flags.tpu ?

        encoder_output = model.get_sequence_output()  # [b, l_s, h]

        self.encoder_output = encoder_output

        hidden_size = encoder_output.shape[2].value

        self.enc_attn_bias = attention_bias(self.input_mask, 'masking')

        with tf.variable_scope('bert', reuse=True):
            with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                # Perform embedding lookup on the target word ids.
                (self.out_embed, self.bert_embeddings) = embedding_lookup(
                    input_ids=self.output_ids,
                    vocab_size=config.vocab_size,
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name='word_embeddings',
                    use_one_hot_embeddings=False)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.out_embed = embedding_postprocessor(
                    input_tensor=self.out_embed,
                    use_token_type=True,
                    token_type_ids=self.out_segment_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

        with tf.variable_scope('decoder_1'):
            self.decoder_weights = self.bert_embeddings
            self.masked_out_embed = self.out_embed * tf.expand_dims(
                self.output_mask, -1)
            self.decoder_input = tf.pad(
                self.masked_out_embed,
                [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
            # ################################################### decoding train - 1
            self.dec_attn_bias = attention_bias(
                tf.shape(self.masked_out_embed)[1], 'causal')
            self.all_att_weights, self.decoder_output_1 = transformer_decoder(
                self.decoder_input,
                self.encoder_output,
                self.dec_attn_bias,
                self.enc_attn_bias,
                self.hps,
                scope='decoder_1')
            # [b, l_t, e] => [b*l_t, v]
            self.decoder_output_1 = tf.reshape(self.decoder_output_1,
                                               [-1, hidden_size])
            self.vocab_logits = tf.matmul(self.decoder_output_1,
                                          self.decoder_weights, False,
                                          True)  # (b*l_t, v)
            self.vocab_probs = tf.nn.softmax(self.vocab_logits)  # [b * l_t, v]
            vocab_size = len(self.hps.vocab)
            with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
                self.logits = calculate_final_logits(
                    self.decoder_output_1, self.all_att_weights,
                    self.vocab_probs, self.input_ids_oo, self.max_out_oovs,
                    self.input_mask, vocab_size,
                    self.tiled_len)  # [b * l_t, v + v']
                self.pred_ids = tf.reshape(tf.argmax(self.logits, axis=-1),
                                           [self.batch_size, -1])  # [b, l_t]

        draft = self.trunct(self.pred_ids)  # as the draft may have copy words, we transform them to UNK first
        draft = tf.cast(draft, tf.int32)
        changed_ids = tf.concat([self.output_ids, draft],
                                axis=-1)  # [b, 2 * l_t]
        change_segment_ids = tf.zeros_like(changed_ids,
                                           dtype=tf.int32,
                                           name='change_segment_ids')

        def calcu_id_len(input_tensor):
            step_size = tf.constant(0.001)
            a = input_tensor
            res = tf.argmin(
                tf.cast(a, tf.float32) +
                tf.cast(tf.range(0,
                                 tf.shape(a)[-1]), tf.float32) * step_size,
                -1) + 1
            return res
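        # calcu_id_len locates the first padding position in the draft: assuming the
        # usual BERT convention that [PAD] has id 0 and real tokens have ids >= 1,
        # the tiny ramp i * 0.001 makes the earliest 0 win the argmin (valid for
        # sequences shorter than 1000 tokens). E.g. for a row [5, 7, 9, 0, 0] the
        # argmin falls on index 3 and the returned value is 4.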

        pred_ids_len = calcu_id_len(draft)  # [b,]
        pred_ids_mask_w_draft = tf.sequence_mask(pred_ids_len,
                                                 maxlen=tf.shape(draft)[1],
                                                 dtype=tf.float32)  # [b, l_t]
        pred_ids_mask_wo_draft = tf.zeros_like(draft, dtype=tf.float32)
        pred_ids_mask = tf.cond(self.feed_draft, lambda: pred_ids_mask_w_draft,
                                lambda: pred_ids_mask_wo_draft)
        change_ids_mask = tf.concat([self.output_mask, pred_ids_mask],
                                    axis=-1)  # [b, 2 * l_t]

        transferred_mask = create_attention_mask_from_input_mask(
            changed_ids, change_ids_mask)  # [b, 2 * l_t, 2 * l_t]

        self.second_dec_attn_bias_w_draft = attention_bias(
            tf.shape(changed_ids)[1], 'mask_draft')
        self.second_dec_attn_bias_wo_draft = attention_bias(
            tf.shape(changed_ids)[1], 'mask_draft_warmup')
        self.second_dec_attn_bias = tf.cond(
            self.feed_draft,
            lambda: self.second_dec_attn_bias_w_draft,
            lambda: self.second_dec_attn_bias_wo_draft)  # [1, 1, 2 * l_t, 2 * l_t]
        self.second_dec_attn_bias = tf.tile(
            self.second_dec_attn_bias,
            [tf.shape(self.output_ids)[0], 1, 1, 1])  # [b, 1, 2 * l_t, 2 * l_t]

        self.second_dec_attn_bias = self.second_dec_attn_bias * tf.expand_dims(
            transferred_mask, 1)  # [b, 1, 2 * l_t, 2 * l_t]

        dec_model = modeling.BertModel(
            config=self.bert_config,
            is_training=is_training,
            input_ids=changed_ids,
            input_mask=tf.squeeze(self.second_dec_attn_bias,
                                  1),  # [b, 2 * l_t, 2 * l_t]
            token_type_ids=change_segment_ids,
            scope='bert',
            reuse=tf.AUTO_REUSE,
            use_one_hot_embeddings=self.hps.use_tpu
        )  # use_one_hot_embeddings=Flags.tpu ?
        dec_output = dec_model.get_sequence_output()  # [b, l_t, h]
        self.out_embed = dec_output
        self.masked_out_embed = self.out_embed * tf.expand_dims(
            change_ids_mask, -1)
        self.decoder_input = tf.pad(
            self.masked_out_embed,
            [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
        # ################################################### decoding train - 2
        with tf.variable_scope('decoder_2'):
            self.all_att_weights, self.decoder_output_2 = transformer_decoder(
                self.decoder_input,
                self.encoder_output, (1.0 - self.second_dec_attn_bias) * -1e9,
                self.enc_attn_bias,
                self.hps,
                scope='decoder_2')
            # [b, 2 * l_t, e] => [b, l_t, e] => [b * l_t, v]
            target_len = tf.shape(self.output_ids)[1]
            # keep only ground-truth part for attention weight & decoder output
            self.all_att_weights[-1] = self.all_att_weights[-1][:, :target_len, :]  # [b, l_t, l_s]
            self.decoder_output_2 = self.decoder_output_2[:, :target_len, :]  # [b, l_t, e]
            self.decoder_output_2 = tf.reshape(self.decoder_output_2,
                                               [-1, hidden_size])
            self.second_logits = tf.matmul(self.decoder_output_2,
                                           self.decoder_weights, False,
                                           True)  # (b*l_t, v)
            self.vocab_probs_2 = tf.nn.softmax(
                self.second_logits)  # [b * l_t, v]
            with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
                self.second_logits = calculate_final_logits(
                    self.decoder_output_2, self.all_att_weights,
                    self.vocab_probs_2, self.input_ids_oo, self.max_out_oovs,
                    self.input_mask, vocab_size,
                    self.tiled_len)  # [b * l_t, v + v']

        with tf.variable_scope('loss'):
            self.ce = smooth_cross_entropy(self.logits, self.output_label,
                                           self.hps.label_smoothing)

            self.ce = tf.reshape(self.ce,
                                 tf.shape(self.output_label))  # [b, l_t]

            mle_1 = tf.reduce_sum(
                self.ce * self.output_mask, -1) / tf.reduce_sum(
                    self.output_mask, -1)  # [b]

            self.first_loss = tf.reduce_sum(self.ce * self.output_mask,
                                            -1) / tf.reduce_sum(
                                                self.output_mask, -1)
            self.first_loss = tf.reduce_mean(self.first_loss)  # scalar

            self.second_ce = smooth_cross_entropy(self.second_logits,
                                                  self.output_label,
                                                  self.hps.label_smoothing)

            self.second_ce = tf.reshape(self.second_ce,
                                        tf.shape(
                                            self.output_label))  # [b, l_t]

            output_mask = self.output_mask

            mle_2 = tf.reduce_sum(self.second_ce * output_mask, -1) / (
                tf.reduce_sum(output_mask, -1))  # [b]

            self.second_loss = tf.reduce_mean(
                tf.reduce_sum(self.second_ce * output_mask, -1) /
                (tf.reduce_sum(output_mask, -1)))  # scalar

            mle = mle_1 + mle_2
            self.rl_loss = tf.reduce_mean(mle * self.reward)  # scalar
            self.ml_loss = self.first_loss + self.second_loss
            self.loss = self.hps.rl_lambda * self.rl_loss + (
                1 - self.hps.rl_lambda) * self.ml_loss
            tf.summary.scalar('first_loss', self.first_loss)
            tf.summary.scalar('second_loss', self.second_loss)
            tf.summary.scalar('reward', tf.reduce_mean(self.reward))
            tf.summary.scalar('rl_loss', self.rl_loss)
            tf.summary.scalar('ml_loss', self.ml_loss)
            tf.summary.scalar('loss', self.loss)
Example #8
    def decode_infer_2(self):
        # stage 2, word level inference using decoded sequence
        # l_t = decode sequence length
        # during inference, the following graph is constructed using beam search
        hidden_size = self.bert_config.hidden_size
        with self.graph.as_default():
            target_sequence = tf.squeeze(self.decode_seq, axis=1)
            draft = self.trunct(target_sequence)  # as the draft may have copy words, we transform them to UNK first
            target_sequence = self.trunct(target_sequence)
            target_length = self.decode_length
            tgt_mask = tf.sequence_mask(target_length,
                                        maxlen=tf.shape(target_sequence)[1],
                                        dtype=tf.float32)  # [b, q']

            draft = tf.cast(draft, tf.int32)
            changed_ids = tf.concat([target_sequence, draft],
                                    axis=-1)  # [b, 2 * l_t]
            change_segment_ids = tf.zeros_like(changed_ids,
                                               dtype=tf.int32,
                                               name='change_segment_ids')

            def calcu_id_len(input_tensor):
                step_size = tf.constant(0.001)
                a = input_tensor
                res = tf.argmin(
                    tf.cast(a, tf.float32) +
                    tf.cast(tf.range(0,
                                     tf.shape(a)[-1]), tf.float32) * step_size,
                    -1) + 1
                return res

            pred_ids_len = calcu_id_len(draft)  # [b,]
            pred_ids_mask_w_draft = tf.sequence_mask(
                pred_ids_len, maxlen=tf.shape(draft)[1],
                dtype=tf.float32)  # [b, l_t]
            pred_ids_mask = pred_ids_mask_w_draft
            change_ids_mask = tf.concat([tgt_mask, pred_ids_mask],
                                        axis=-1)  # [b, 2 * l_t]

            transferred_mask = create_attention_mask_from_input_mask(
                changed_ids, change_ids_mask)  # [b, 2 * l_t, 2 * l_t]

            second_dec_attn_bias_w_draft = attention_bias(
                tf.shape(changed_ids)[1], 'mask_draft')
            second_dec_attn_bias = second_dec_attn_bias_w_draft  # [1, 1, 2 * l_t, 2 *l_t]
            second_dec_attn_bias = tf.tile(
                second_dec_attn_bias, [tf.shape(target_sequence)[0], 1, 1, 1
                                       ])  # [b, 1, 2 * l_t, 2 * l_t]

            second_dec_attn_bias = second_dec_attn_bias * tf.expand_dims(
                transferred_mask, 1)  # [b, 1, 2 * l_t, 2 * l_t]
            is_training = self.is_training
            dec_model = modeling.BertModel(
                config=self.bert_config,
                is_training=is_training,
                input_ids=changed_ids,
                input_mask=tf.squeeze(second_dec_attn_bias,
                                      1),  # [b, 2 * l_t, 2 * l_t]
                token_type_ids=change_segment_ids,
                scope='bert',
                reuse=tf.AUTO_REUSE,
                use_one_hot_embeddings=self.hps.use_tpu)

            dec_output = dec_model.get_sequence_output()  # [b, l_t, h]
            tgt_embed = dec_output

            with tf.variable_scope('decoder_2', reuse=True):
                masked_tgt_embed = tgt_embed * tf.expand_dims(
                    change_ids_mask, -1)
                infer_decoder_input = tf.pad(
                    masked_tgt_embed,
                    [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
                all_att_weights, decoder_output = transformer_decoder(
                    infer_decoder_input,
                    self.enc_output, (1.0 - second_dec_attn_bias) * -1e9,
                    self.enc_attn_bias,
                    self.hps,
                    scope='decoder_2')
                # [b, l_t, e] => [b*l_t, v]
                target_len = tf.shape(target_sequence)[1]
                # keep only ground-truth part for attention weight & decoder output
                all_att_weights[-1] = all_att_weights[-1][:, :target_len, :]  # [b, l_t, l_s]
                decoder_output = decoder_output[:, :target_len, :]  # [b, l_t, e]

                decoder_output = tf.reshape(decoder_output, [-1, hidden_size])
                second_logits = tf.matmul(decoder_output, self.decoder_weights,
                                          False, True)  # (b*l_t, v)
                vocab_probs = tf.nn.softmax(second_logits)  # [b * l_t, v]
                vocab_size = len(self.hps.vocab)
                with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
                    logits = calculate_final_logits(
                        decoder_output, all_att_weights, vocab_probs,
                        self.input_ids_oo, self.max_out_oovs, self.input_mask,
                        vocab_size, self.infer_tiled_len)  # [b * l_t, v + v']
                second_log_prob = tf.log(logits)
                # (b, l_t, v)
                extend_vocab_size = tf.add(tf.constant(vocab_size),
                                           self.max_out_oovs)
                second_log_prob = tf.reshape(
                    second_log_prob,
                    [-1, tf.shape(target_sequence)[1], extend_vocab_size])
                second_log_id = tf.argmax(second_log_prob, axis=-1)  # (b, l_t)
        return second_log_id
Example #9
    def decode_infer_2_bs(self):
        # beam search version
        # during second-stage decoding we already have a decoded sequence, so there is no need to feed state (no incremental decoding)
        # at time step i, we calculate the i-th attn_bias and take the i-th decoder output
        with self.graph.as_default():
            target_sequence = tf.reshape(
                self.decode_seq,
                [self.hps.eval_batch_size * self.hps.beam_size, -1])
            draft = self.trunct(target_sequence)  # as the draft may have copy words, we transform them to UNK first
            target_sequence = self.trunct(target_sequence)
            target_length = self.decode_length
            tgt_mask = tf.sequence_mask(target_length,
                                        maxlen=tf.shape(target_sequence)[1],
                                        dtype=tf.float32)  # [b, l_t]
            draft = tf.cast(draft, tf.int32)
            changed_ids = tf.concat([target_sequence, draft],
                                    axis=-1)  # [b, 2 * l_t]
            change_segment_ids = tf.zeros_like(changed_ids,
                                               dtype=tf.int32,
                                               name='change_segment_ids')

            def calcu_id_len(input_tensor):
                step_size = tf.constant(0.001)
                a = input_tensor
                res = tf.argmin(
                    tf.cast(a, tf.float32) +
                    tf.cast(tf.range(0,
                                     tf.shape(a)[-1]), tf.float32) * step_size,
                    -1) + 1
                return res

            pred_ids_len = calcu_id_len(draft)  # [b,]
            pred_ids_mask_w_draft = tf.sequence_mask(
                pred_ids_len, maxlen=tf.shape(draft)[1],
                dtype=tf.float32)  # [b, l_t]
            pred_ids_mask = pred_ids_mask_w_draft
            change_ids_mask = tf.concat([tgt_mask, pred_ids_mask],
                                        axis=-1)  # [b, 2 * l_t]

            transferred_mask = create_attention_mask_from_input_mask(
                changed_ids, change_ids_mask)  # [b, 2 * l_t, 2 * l_t]

            second_dec_attn_bias_w_draft = attention_bias(
                tf.shape(changed_ids)[1], 'mask_draft')
            second_dec_attn_bias = second_dec_attn_bias_w_draft  # [1, 1, 2 * l_t, 2 *l_t]
            second_dec_attn_bias = tf.tile(
                second_dec_attn_bias, [tf.shape(target_sequence)[0], 1, 1, 1
                                       ])  # [b, 1, 2 * l_t, 2 * l_t]

            second_dec_attn_bias = second_dec_attn_bias * tf.expand_dims(
                transferred_mask, 1)  # [b, 1, 2 * l_t, 2 * l_t]
            is_training = self.is_training
            dec_model = modeling.BertModel(
                config=self.bert_config,
                is_training=is_training,
                input_ids=changed_ids,
                input_mask=tf.squeeze(second_dec_attn_bias,
                                      1),  # [b, 2 * l_t, 2 * l_t]
                token_type_ids=change_segment_ids,
                scope='bert',
                reuse=tf.AUTO_REUSE,
                use_one_hot_embeddings=self.hps.use_tpu)

            dec_output = dec_model.get_sequence_output()  # [b, l_t, h]
            tgt_embed = dec_output

            with tf.variable_scope('decoder_2', reuse=True):
                # [b, l_t, e]
                masked_tgt_embed = tgt_embed * tf.expand_dims(
                    change_ids_mask, -1)
                infer_decoder_input = tf.pad(
                    masked_tgt_embed,
                    [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left

                all_att_weights, decoder_output = transformer_decoder(
                    infer_decoder_input,
                    self.enc_output, (1.0 - second_dec_attn_bias) * -1e9,
                    self.enc_attn_bias,
                    self.hps,
                    scope='decoder_2')

                decoder_output = decoder_output[:, self.time_step, :]  # [b * beam, e]
                all_att_weights[-1] = all_att_weights[-1][:, self.time_step, :]

                second_logits = tf.matmul(decoder_output, self.decoder_weights,
                                          False, True)  # (b*beam, v)
                vocab_probs = tf.nn.softmax(second_logits)  # [b * beam, v]
                vocab_size = len(self.hps.vocab)
                with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
                    logits = calculate_final_logits(
                        decoder_output, all_att_weights, vocab_probs,
                        self.input_ids_oo, self.max_out_oovs, self.input_mask,
                        vocab_size, 1)  # [b * beam, v + v']
                second_log_prob = tf.log(logits)
        return second_log_prob
    def _build_summarization_model(self):
        is_training = self.is_training
        config = self.bert_config

        self._add_placeholders()

        '''Builds the summarization model with an auxiliary masked-LM branch.'''
        model = modeling.BertModel(
            config=self.bert_config,
            is_training=is_training,
            input_ids=self.input_ids,
            input_mask=self.input_mask,
            token_type_ids=self.segment_ids,
            scope='bert',
            use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?

        encoder_output = model.get_sequence_output()  # [b, l_s, h]

        self.encoder_output = encoder_output

        hidden_size = encoder_output.shape[2].value

        self.enc_attn_bias = attention_bias(self.input_mask, 'masking')

        with tf.variable_scope('bert', reuse=True):
            with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                # Perform embedding lookup on the target word ids.
                (self.out_embed, self.bert_embeddings) = embedding_lookup(
                    input_ids=self.output_ids,
                    vocab_size=config.vocab_size,
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name='word_embeddings',
                    use_one_hot_embeddings=False)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.out_embed = embedding_postprocessor(
                    input_tensor=self.out_embed,
                    use_token_type=True,
                    token_type_ids=self.out_segment_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

        '''Creates an LM model.'''
        lm_model = modeling.BertModel(
            config=self.bert_config,
            is_training=is_training,
            input_ids=self.lm_output_ids,
            input_mask=self.lm_output_mask,
            token_type_ids=self.lm_out_segment_ids,
            use_one_hot_embeddings=self.hps.use_tpu,  # use_one_hot_embeddings=Flags.tpu ?
            scope='bert', reuse=True, on_cpu=True,
            use_lm=True, lm_position=self.lm_position)

        with tf.variable_scope('decoder'):
            self.decoder_weights = self.bert_embeddings
            self.masked_out_embed = self.out_embed * tf.expand_dims(self.output_mask, -1)
            self.dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'causal')
            self.decoder_input = tf.pad(self.masked_out_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
            self.all_att_weights, self.decoder_output = transformer_decoder(self.decoder_input, self.encoder_output,
                                                                            self.dec_attn_bias, self.enc_attn_bias,
                                                                            self.hps, scope='t_decoder')
            # [b, l_t, e] => [b*l_t, v]
            self.decoder_output = tf.reshape(self.decoder_output, [-1, hidden_size])
            self.logits = tf.matmul(self.decoder_output, self.decoder_weights, False, True)  # (b*l_t, v)

            self.second_dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'cloze_bias')
            self.all_att_weights, self.decoder_output = transformer_decoder(self.decoder_input, self.encoder_output,
                                                                            self.second_dec_attn_bias,
                                                                            self.enc_attn_bias,
                                                                            self.hps, scope='t_decoder', reuse=True)
            # [b, l_t, e] => [b*l_t, v]
            self.decoder_output = tf.reshape(self.decoder_output, [-1, hidden_size])
            self.second_logits = tf.matmul(self.decoder_output, self.decoder_weights, False, True)  # (b*l_t, v)
            self.lm_logits = lm_model.get_lm_output()  # (b*l_t, v)
            self.lm_logits = tf.stop_gradient(self.lm_logits)
            # use pooled output to represent the original input sequence
            self.pooled_output = model.get_pooled_output()  # (b, e)
            self.article_representation = tf.reshape(tf.tile(tf.expand_dims(self.pooled_output, 1),
                                                             [1, tf.shape(self.lm_output_ids)[1], 1]),
                                                     [-1, self.bert_config.hidden_size])  # (b * l_t, e)
            self.masked_summary_representation = lm_model.get_pooled_output()  # (b * l_t, e)
            self.concated_representation = tf.concat([self.article_representation, self.masked_summary_representation],
                                                     axis=-1)  # (b * l_t, 2e)
            self.lm_prob = tf.nn.sigmoid(linear(self.concated_representation, 1))  # (b * l_t, 1)
            self.final_second_logits = self.lm_prob * self.second_logits + (1 - self.lm_prob) * self.lm_logits

        with tf.variable_scope('loss'):
            self.ce = smoothed_softmax_cross_entropy(
                self.logits,
                self.output_ids,
                self.hps.label_smoothing,
                True
            )

            self.ce = tf.reshape(self.ce, tf.shape(self.output_ids))  # [b, l_t]

            self.first_loss = tf.reduce_sum(self.ce * self.output_mask) / tf.reduce_sum(self.output_mask)  # scalar

            self.second_ce = smoothed_softmax_cross_entropy(
                self.final_second_logits,
                self.output_ids,
                self.hps.label_smoothing,
                True
            )

            self.second_ce = tf.reshape(self.second_ce, tf.shape(self.output_ids))  # [b, l_t]

            self.second_loss = tf.reduce_sum(self.second_ce * self.output_mask) / tf.reduce_sum(
                self.output_mask)  # scalar

            self.loss = self.first_loss + self.second_loss
            tf.summary.scalar('first_loss', self.first_loss)
            tf.summary.scalar('second_loss', self.second_loss)
            tf.summary.scalar('loss', self.loss)
    def _build_summarization_model(self):
        is_training = self.is_training
        config = self.bert_config

        self._add_placeholders()

        '''Builds the summarization model with BERT-style corruption of the decoder input.'''
        model = modeling.BertModel(
            config=self.bert_config,
            is_training=is_training,
            input_ids=self.input_ids,
            input_mask=self.input_mask,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?

        batch_size = tf.shape(self.output_ids)[0]
        tgt_len = tf.shape(self.output_ids)[1]

        rnd_value = tf.random_uniform([batch_size, tgt_len], minval=0, maxval=1)
        replace_to_mask = rnd_value < 0.1
        replace_to_random_word = (rnd_value > 0.1) & (rnd_value < 0.15)
        keep_the_word = rnd_value < 0.2
        keep_the_word = tf.cast(keep_the_word, tf.float32)

        all_mask = tf.ones_like(self.output_ids, dtype=tf.int32)
        mask_id = self.hps.maskId
        all_mask = all_mask * mask_id

        all_random_word_id = tf.random_uniform([batch_size, tgt_len], minval=999, maxval=30521, dtype=tf.int32)

        changed_ids = self.output_ids
        changed_ids = tf.where(replace_to_mask, all_mask, changed_ids)
        changed_ids = tf.where(replace_to_random_word, all_random_word_id, changed_ids)
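        # BERT-style corruption of the decoder input: with one uniform draw per
        # position, ~10% of target tokens become the [MASK] id (hps.maskId),
        # another ~5% become a random vocabulary id in [999, 30521), and the rest
        # keep the original token. keep_the_word (rnd_value < 0.2) later restricts
        # the second-pass loss to this ~20% subset of positions.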

        dec_model = modeling.BertModel(
            config=self.bert_config,
            is_training=is_training,
            input_ids=changed_ids,
            input_mask=self.output_mask,
            token_type_ids=self.out_segment_ids,
            scope='bert',
            reuse=tf.AUTO_REUSE,
            use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?

        encoder_output = model.get_sequence_output()  # [b, l_s, h]
        dec_output = dec_model.get_sequence_output()  # [b, l_t, h]

        self.encoder_output = encoder_output

        hidden_size = encoder_output.shape[2].value

        self.enc_attn_bias = attention_bias(self.input_mask, 'masking')

        with tf.variable_scope('bert', reuse=True):
            with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                # Perform embedding lookup on the target word ids.
                (self.out_embed, self.bert_embeddings) = embedding_lookup(
                    input_ids=self.output_ids,
                    vocab_size=config.vocab_size,
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name='word_embeddings',
                    use_one_hot_embeddings=False)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.out_embed = embedding_postprocessor(
                    input_tensor=self.out_embed,
                    use_token_type=True,
                    token_type_ids=self.out_segment_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

        with tf.variable_scope('decoder'):
            self.decoder_weights = self.bert_embeddings
            self.masked_out_embed = self.out_embed * tf.expand_dims(self.output_mask, -1)
            self.decoder_input = tf.pad(self.masked_out_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
            # ################################################### decoding train - 1
            self.dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'causal')
            self.all_att_weights, self.decoder_output_1 = transformer_decoder(self.decoder_input, self.encoder_output,
                                                                              self.dec_attn_bias, self.enc_attn_bias,
                                                                              self.hps, scope='t_decoder')
            # [b, l_t, e] => [b*l_t, v]
            self.decoder_output_1 = tf.reshape(self.decoder_output_1, [-1, hidden_size])
            self.vocab_logits = tf.matmul(self.decoder_output_1, self.decoder_weights, False, True)  # (b*l_t, v)
            self.vocab_probs = tf.nn.softmax(self.vocab_logits)  # [b * l_t, v]
            vocab_size = len(self.hps.vocab)
            with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
                self.logits = calculate_final_logits(self.decoder_output_1, self.all_att_weights, self.vocab_probs,
                                                     self.input_ids_oo, self.max_out_oovs, self.input_mask, vocab_size,
                                                     self.tiled_len)  # [b * l_t, v + v']
                self.pred_ids = tf.reshape(tf.argmax(self.logits, axis=-1), [self.batch_size, -1])  # [b, l_t]

            self.out_embed = dec_output
            self.masked_out_embed = self.out_embed * tf.expand_dims(self.output_mask, -1)
            self.decoder_input = tf.pad(self.masked_out_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift left
            # ################################################### decoding train - 2
            self.second_dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'cloze_bias')
            self.all_att_weights, self.decoder_output_2 = transformer_decoder(self.decoder_input, self.encoder_output,
                                                                              self.second_dec_attn_bias,
                                                                              self.enc_attn_bias,
                                                                              self.hps, scope='t_decoder', reuse=True)
            # [b, l_t, e] => [b*l_t, v]
            self.decoder_output_2 = tf.reshape(self.decoder_output_2, [-1, hidden_size])
            self.second_logits = tf.matmul(self.decoder_output_2, self.decoder_weights, False, True)  # (b*l_t, v)
            self.vocab_probs_2 = tf.nn.softmax(self.second_logits)  # [b * l_t, v]
            with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
                self.second_logits = calculate_final_logits(self.decoder_output_2, self.all_att_weights,
                                                            self.vocab_probs_2,
                                                            self.input_ids_oo, self.max_out_oovs, self.input_mask,
                                                            vocab_size,
                                                            self.tiled_len)  # [b * l_t, v + v']

            # ################################################### decoding train - 3
            # self.all_att_weights, self.decoder_output_3 = transformer_decoder(self.decoder_input, self.encoder_output,
            #                                                                   self.sent_level_attn_bias,
            #                                                                   self.enc_attn_bias,
            #                                                                   self.hps, scope='t_decoder', reuse=True)
            # # [b, l_t, e] => [b*l_t, v]
            # self.decoder_output_3 = tf.reshape(self.decoder_output_3, [-1, hidden_size])
            # self.third_logits = tf.matmul(self.decoder_output_3, self.decoder_weights, False, True)  # (b*l_t, v)
            # self.vocab_probs_3 = tf.nn.softmax(self.third_logits)  # [b * l_t, v]
            # with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
            #     self.third_logits = calculate_final_logits(self.decoder_output_3, self.all_att_weights,
            #                                                self.vocab_probs_3,
            #                                                self.input_ids_oo, self.max_out_oovs, self.input_mask,
            #                                                vocab_size,
            #                                                self.tiled_len)  # [b * l_t, v + v']

        with tf.variable_scope('loss'):
            self.ce = smooth_cross_entropy(
                self.logits,
                self.output_label,
                self.hps.label_smoothing)

            self.ce = tf.reshape(self.ce, tf.shape(self.output_label))  # [b, l_t]

            mle_1 = tf.reduce_sum(self.ce * self.output_mask, -1) / tf.reduce_sum(self.output_mask, -1)  # [b]

            self.first_loss = tf.reduce_sum(self.ce * self.output_mask, -1) / tf.reduce_sum(self.output_mask, -1)
            self.first_loss = tf.reduce_mean(self.first_loss)  # scalar

            self.second_ce = smooth_cross_entropy(
                self.second_logits,
                self.output_label,
                self.hps.label_smoothing)

            self.second_ce = tf.reshape(self.second_ce, tf.shape(self.output_label))  # [b, l_t]

            output_mask = self.output_mask * keep_the_word

            mle_2 = tf.reduce_sum(self.second_ce * output_mask, -1) / (tf.reduce_sum(output_mask, -1) + 0.001)  # [b]

            self.second_loss = tf.reduce_mean(tf.reduce_sum(self.second_ce * output_mask, -1) / (tf.reduce_sum(
                output_mask, -1) + 0.001))  # scalar

            # self.ce = smooth_cross_entropy(
            #     self.third_logits,
            #     self.output_ids,
            #     self.hps.label_smoothing)
            #
            # self.ce = tf.reshape(self.ce, tf.shape(self.output_label))  # [b, l_t]
            #
            # mle_3 = tf.reduce_sum(self.ce * self.output_mask, -1) / tf.reduce_sum(self.output_mask, -1)  # [b]
            #
            # self.third_loss = tf.reduce_mean(tf.reduce_sum(self.ce * self.output_mask, -1) / tf.reduce_sum(
            #     self.output_mask, -1))  # scalar

            mle = mle_1 + mle_2
            self.rl_loss = tf.reduce_mean(mle * self.reward)  # scalar
            self.ml_loss = self.first_loss + self.second_loss
            self.loss = self.hps.rl_lambda * self.rl_loss + (1 - self.hps.rl_lambda) * self.ml_loss
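            # Mixed objective as computed above, with lambda = hps.rl_lambda:
            #   loss = lambda * mean((mle_1 + mle_2) * reward)
            #        + (1 - lambda) * (first_loss + second_loss)
            # i.e. a reward-weighted (policy-gradient style) term interpolated with
            # the plain label-smoothed maximum-likelihood term.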
            tf.summary.scalar('first_loss', self.first_loss)
            tf.summary.scalar('second_loss', self.second_loss)
            # tf.summary.scalar('third_loss', self.third_loss)
            tf.summary.scalar('reward', tf.reduce_mean(self.reward))
            tf.summary.scalar('rl_loss', self.rl_loss)
            tf.summary.scalar('ml_loss', self.ml_loss)
            tf.summary.scalar('loss', self.loss)

    def decode_infer_2(self):
        # Stage 2: word-level inference over the stage-1 decoded sequence.
        # l_t = length of the decoded sequence.
        # During inference this graph is driven by beam search.
        hidden_size = self.bert_config.hidden_size
        with self.graph.as_default():
            target_sequence = tf.squeeze(self.decode_seq, axis=1)
            target_sequence = self.trunct(target_sequence)
            target_length = self.decode_length
            target_seg_ids = tf.zeros_like(target_sequence, dtype=tf.int32, name='target_seg_ids_infer_2')
            tgt_mask = tf.sequence_mask(target_length,
                                        maxlen=tf.shape(target_sequence)[1],
                                        dtype=tf.float32)  # [b, q']

            is_training = self.is_training
            dec_model = modeling.BertModel(
                config=self.bert_config,
                is_training=is_training,
                input_ids=target_sequence,
                input_mask=tgt_mask,
                token_type_ids=target_seg_ids,
                scope='bert',
                reuse=tf.AUTO_REUSE,
                use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?

            dec_output = dec_model.get_sequence_output()  # [b, l_t, h]
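            # The stage-1 decoded sequence is re-encoded with the shared BERT encoder
            # (scope='bert', reuse=AUTO_REUSE); its sequence output serves as the
            # target-side embeddings for the cloze-style refinement pass below.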
            tgt_embed = dec_output

            # with tf.variable_scope('bert', reuse=True):
            #     with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
            #         # Perform embedding lookup on the target word ids.
            #         (tgt_embed, _) = embedding_lookup(
            #             input_ids=target_sequence,
            #             vocab_size=config.vocab_size,
            #             embedding_size=config.hidden_size,
            #             initializer_range=config.initializer_range,
            #             word_embedding_name='word_embeddings',
            #             use_one_hot_embeddings=False)
            #
            #         # Add positional embeddings and token type embeddings, then layer
            #         # normalize and perform dropout.
            #         tgt_embed = embedding_postprocessor(
            #             input_tensor=tgt_embed,
            #             use_token_type=True,
            #             token_type_ids=target_seg_ids,
            #             token_type_vocab_size=config.type_vocab_size,
            #             token_type_embedding_name='token_type_embeddings',
            #             use_position_embeddings=True,
            #             position_embedding_name='position_embeddings',
            #             initializer_range=config.initializer_range,
            #             max_position_embeddings=config.max_position_embeddings,
            #             dropout_prob=config.hidden_dropout_prob)

            with tf.variable_scope('decoder', reuse=True):
                masked_tgt_embed = tgt_embed * tf.expand_dims(tgt_mask, -1)
                second_dec_attn_bias = attention_bias(tf.shape(masked_tgt_embed)[1], 'cloze_bias')
                infer_decoder_input = tf.pad(masked_tgt_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift right: prepend a zero step, drop the last
                all_att_weights, decoder_output = transformer_decoder(infer_decoder_input,
                                                                      self.enc_output,
                                                                      second_dec_attn_bias,
                                                                      self.enc_attn_bias,
                                                                      self.hps,
                                                                      scope='t_decoder')
                # [b, l_t, e] => [b*l_t, v]
                decoder_output = tf.reshape(decoder_output, [-1, hidden_size])
                second_logits = tf.matmul(decoder_output, self.decoder_weights, False, True)  # (b*l_t, v)
                vocab_probs = tf.nn.softmax(second_logits)  # [b * l_t, v]
                vocab_size = len(self.hps.vocab)
                with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
                    logits = calculate_final_logits(decoder_output, all_att_weights, vocab_probs,
                                                    self.input_ids_oo, self.max_out_oovs, self.input_mask,
                                                    vocab_size, self.infer_tiled_len)  # [b * l_t, v + v']
                second_log_prob = tf.log(logits)
                # (b, l_t, v)
                extend_vocab_size = tf.add(tf.constant(vocab_size), self.max_out_oovs)
                second_log_prob = tf.reshape(second_log_prob, [-1, tf.shape(target_sequence)[1], extend_vocab_size])
                second_log_id = tf.argmax(second_log_prob, axis=-1)  # (b, l_t)
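                # Greedy per-position re-prediction over the extended vocabulary
                # (v in-vocab ids plus up to max_out_oovs copied source ids); the result
                # has the same length as the stage-1 sequence, one refined id per position.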
        return second_log_id

    def _build_summarization_model(self):
        is_training = self.is_training
        config = self.bert_config

        self._add_placeholders()

        # Build the BERT encoder over the source sequence.
        model = modeling.BertModel(
            config=self.bert_config,
            is_training=is_training,
            input_ids=self.input_ids,
            input_mask=self.input_mask,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=self.hps.use_tpu)  # use_one_hot_embeddings=Flags.tpu ?

        encoder_output = model.get_sequence_output()  # [b, l_s, h]

        self.sentence_rep = tf.expand_dims(model.get_pooled_output(), axis=1)  # [b, 1, h]
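        # Keep the pooled [CLS] vector as a global summary of the source; it is later
        # tiled over target positions and concatenated with the decoder states before
        # the copy module (see copy_rep_1/2/3 below).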

        self.encoder_output = encoder_output

        hidden_size = encoder_output.shape[2].value

        self.enc_attn_bias = attention_bias(self.input_mask, 'masking')
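        # 'masking' presumably turns the padding mask into an additive attention bias
        # (roughly 0 for real tokens, a large negative value for padded positions) that
        # is applied inside encoder-decoder attention.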

        with tf.variable_scope('bert', reuse=True):
            with tf.variable_scope('embeddings'), tf.device('/cpu:0'):
                # Perform embedding lookup on the target word ids.
                (self.out_embed, self.bert_embeddings) = embedding_lookup(
                    input_ids=self.output_ids,
                    vocab_size=config.vocab_size,
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name='word_embeddings',
                    use_one_hot_embeddings=False)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.out_embed = embedding_postprocessor(
                    input_tensor=self.out_embed,
                    use_token_type=True,
                    token_type_ids=self.out_segment_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

        with tf.variable_scope('decoder'):
            self.decoder_weights = self.bert_embeddings
            self.masked_out_embed = self.out_embed * tf.expand_dims(self.output_mask, -1)
            self.decoder_input = tf.pad(self.masked_out_embed, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # Shift right: prepend a zero step, drop the last (teacher forcing)
            # ################################################### decoding train - 1
            self.dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'causal')
            self.all_att_weights, self.decoder_output_1 = transformer_decoder(self.decoder_input, self.encoder_output,
                                                                              self.dec_attn_bias, self.enc_attn_bias,
                                                                              self.hps, scope='t_decoder')
            sentence_rep = tf.tile(self.sentence_rep, [1, tf.shape(self.decoder_output_1)[1], 1])  # [b, l_t, e]
            # [b, l_t, e] => [b*l_t, v]
            copy_rep_1 = tf.concat([sentence_rep, self.decoder_output_1], axis=-1)  # [b, l_t, 2 * e]
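            # Each decoder state is paired with the tiled sentence representation,
            # giving a [b, l_t, 2*h] tensor that feeds the copy module; the plain
            # decoder states alone still produce the generation logits below.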
            self.decoder_output_1 = tf.reshape(self.decoder_output_1, [-1, hidden_size])
            self.vocab_logits = tf.matmul(self.decoder_output_1, self.decoder_weights, False, True)  # (b*l_t, v)
            self.vocab_probs = tf.nn.softmax(self.vocab_logits)  # [b * l_t, v]
            vocab_size = len(self.hps.vocab)
            with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
                copy_rep_1 = tf.reshape(copy_rep_1, [-1, hidden_size * 2])
                self.logits = calculate_final_logits(copy_rep_1, self.all_att_weights, self.vocab_probs,
                                                     self.input_ids_oo, self.max_out_oovs, self.input_mask, vocab_size,
                                                     self.tiled_len)  # [b * l_t, v + v']
                self.pred_ids = tf.reshape(tf.argmax(self.logits, axis=-1), [self.batch_size, -1])  # [b, l_t]

            # ################################################### decoding train - 2
            self.second_dec_attn_bias = attention_bias(tf.shape(self.masked_out_embed)[1], 'cloze_bias')
            self.all_att_weights, self.decoder_output_2 = transformer_decoder(self.decoder_input, self.encoder_output,
                                                                              self.second_dec_attn_bias,
                                                                              self.enc_attn_bias,
                                                                              self.hps, scope='t_decoder', reuse=True)
            # [b, l_t, e] => [b*l_t, v]
            copy_rep_2 = tf.concat([sentence_rep, self.decoder_output_2], axis=-1)
            self.decoder_output_2 = tf.reshape(self.decoder_output_2, [-1, hidden_size])
            self.second_logits = tf.matmul(self.decoder_output_2, self.decoder_weights, False, True)  # (b*l_t, v)
            self.vocab_probs_2 = tf.nn.softmax(self.second_logits)  # [b * l_t, v]
            with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
                copy_rep_2 = tf.reshape(copy_rep_2, [-1, hidden_size * 2])
                self.second_logits = calculate_final_logits(copy_rep_2, self.all_att_weights,
                                                            self.vocab_probs_2,
                                                            self.input_ids_oo, self.max_out_oovs, self.input_mask,
                                                            vocab_size,
                                                            self.tiled_len)  # [b * l_t, v + v']

            # ################################################### decoding train - 3
            self.all_att_weights, self.decoder_output_3 = transformer_decoder(self.decoder_input, self.encoder_output,
                                                                              self.sent_level_attn_bias,
                                                                              self.enc_attn_bias,
                                                                              self.hps, scope='t_decoder', reuse=True)
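            # Third pass: same decoder weights, but with self.sent_level_attn_bias,
            # which is presumably a precomputed sentence-level attention pattern
            # supplied to the model rather than derived from the target length here.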
            # [b, l_t, e] => [b*l_t, v]
            copy_rep_3 = tf.concat([sentence_rep, self.decoder_output_3], axis=-1)
            self.decoder_output_3 = tf.reshape(self.decoder_output_3, [-1, hidden_size])
            self.third_logits = tf.matmul(self.decoder_output_3, self.decoder_weights, False, True)  # (b*l_t, v)
            self.vocab_probs_3 = tf.nn.softmax(self.third_logits)  # [b * l_t, v]
            with tf.variable_scope('copy', reuse=tf.AUTO_REUSE):
                copy_rep_3 = tf.reshape(copy_rep_3, [-1, hidden_size * 2])
                self.third_logits = calculate_final_logits(copy_rep_3, self.all_att_weights,
                                                           self.vocab_probs_3,
                                                           self.input_ids_oo, self.max_out_oovs, self.input_mask,
                                                           vocab_size,
                                                           self.tiled_len)  # [b * l_t, v + v']

        with tf.variable_scope('loss'):
            self.ce = smooth_cross_entropy(
                self.logits,
                self.output_label,
                self.hps.label_smoothing)

            self.ce = tf.reshape(self.ce, tf.shape(self.output_label))  # [b, l_t]

            mle_1 = tf.reduce_sum(self.ce * self.output_mask, -1) / tf.reduce_sum(self.output_mask, -1)  # [b]

            self.first_loss = tf.reduce_sum(self.ce * self.output_mask, -1) / tf.reduce_sum(self.output_mask, -1)
            self.first_loss = tf.reduce_mean(self.first_loss)  # scalar

            self.second_ce = smooth_cross_entropy(
                self.second_logits,
                self.output_label,
                self.hps.label_smoothing)

            self.second_ce = tf.reshape(self.second_ce, tf.shape(self.output_label))  # [b, l_t]

            mle_2 = tf.reduce_sum(self.second_ce * self.output_mask, -1) / tf.reduce_sum(self.output_mask, -1)  # [b]

            self.second_loss = tf.reduce_mean(tf.reduce_sum(self.second_ce * self.output_mask, -1) / tf.reduce_sum(
                self.output_mask, -1))  # scalar

            self.ce = smooth_cross_entropy(
                self.third_logits,
                self.output_ids,
                self.hps.label_smoothing)

            self.ce = tf.reshape(self.ce, tf.shape(self.output_label))  # [b, l_t]

            mle_3 = tf.reduce_sum(self.ce * self.output_mask, -1) / tf.reduce_sum(self.output_mask, -1)  # [b]

            self.third_loss = tf.reduce_mean(tf.reduce_sum(self.ce * self.output_mask, -1) / tf.reduce_sum(
                self.output_mask, -1))  # scalar

            mle = mle_1 + mle_2 + mle_3
            self.rl_loss = tf.reduce_mean(mle * self.reward)  # scalar
            self.ml_loss = self.first_loss + self.second_loss + self.third_loss
            self.loss = self.hps.rl_lambda * self.rl_loss + (1 - self.hps.rl_lambda) * self.ml_loss
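            # As in the two-pass variant, but over three decoding passes:
            #   loss = rl_lambda * mean((mle_1 + mle_2 + mle_3) * reward)
            #        + (1 - rl_lambda) * (first_loss + second_loss + third_loss)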
            tf.summary.scalar('first_loss', self.first_loss)
            tf.summary.scalar('second_loss', self.second_loss)
            tf.summary.scalar('third_loss', self.third_loss)
            tf.summary.scalar('reward', tf.reduce_mean(self.reward))
            tf.summary.scalar('rl_loss', self.rl_loss)
            tf.summary.scalar('ml_loss', self.ml_loss)
            tf.summary.scalar('loss', self.loss)
Example #14
    def _build_classify_model(self):
        is_training = self.is_training
        num_labels = self.batcher.label_num

        input_ids, input_mask, segment_ids, label_ids, gather_index1, gather_index2, input_mask_sen1, input_mask_sen2 \
            = self._add_placeholders()
        """Creates a classification model."""
        model = modeling.BertModel(config=self.bert_config,
                                   is_training=is_training,
                                   input_ids=input_ids,
                                   input_mask=input_mask,
                                   token_type_ids=segment_ids,
                                   use_one_hot_embeddings=self.hps.use_tpu
                                   )  #use_one_hot_embeddings=Flags.tpu ?

        output_layer = model.get_sequence_output()
        hidden_size = output_layer.shape[-1].value

        sentence1 = tf.gather_nd(output_layer, gather_index1)
        sentence2 = tf.gather_nd(output_layer, gather_index2)
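        # gather_index1/gather_index2 presumably select, from the joint BERT sequence
        # output, the token positions belonging to each of the two input sentences,
        # yielding per-sentence token representations for the matching layers below.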
        self.cell = tf.contrib.rnn.LSTMCell

        with tf.variable_scope('fw_matching'):
            attention_mechanism = SeqMatchSeqAttention(hidden_size, sentence1,
                                                       self.input_mask_sen1)
            m_lstm = self.cell(hidden_size)

            #if is_training:
            #    m_lstm = tf.contrib.rnn.DropoutWrapper(cell=m_lstm, input_keep_prob=0.5)

            m_lstm = SeqMatchSeqWrapper(m_lstm, attention_mechanism)
            self.ff_state_mem, self.ff_state = tf.nn.dynamic_rnn(
                m_lstm, sentence2, self.sen_length2, dtype=tf.float32)
            self.ff_hidden_state = get_hidden_state(
                self.ff_state.cell_state)  # (b*n, e_2)
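            # Match-LSTM style reading (assuming the SeqMatchSeq* helpers follow the
            # usual pattern): the LSTM consumes sentence2 while attending over
            # sentence1 at each step, and the final cell state is taken as a
            # fixed-size matching vector for the sentence pair.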

        bert_hidden_output = model.get_pooled_output()
        output_layer = tf.concat([self.ff_hidden_state, bert_hidden_output],
                                 axis=-1)
        hidden_size = output_layer.shape[-1].value

        output_weights = tf.get_variable(
            "output_weights", [num_labels, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))

        output_bias = tf.get_variable("output_bias", [num_labels],
                                      initializer=tf.zeros_initializer())

        with tf.variable_scope("loss"):
            if is_training:
                # I.e., 0.1 dropout
                output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            log_probs = tf.nn.log_softmax(logits, axis=-1)

            one_hot_labels = tf.one_hot(label_ids,
                                        depth=num_labels,
                                        dtype=tf.float32)

            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                              axis=-1)
            loss = tf.reduce_mean(per_example_loss)

        self.loss, self.per_example_loss, self.logits \
            = loss, per_example_loss, logits
        self.predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)