Example #1
    def call(self, inputs, mask=None, a_mask=None, position_bias=None):
        """
        多头注意力
        :param inputs: [q, k, v, a_mask, position_bias]
        :param mask: [q_mask, v_mask],
            q_mask 对query序列进行mask,针对padding;v_mask对value序列进行mask,防止看到某些位置value,如padding
        :param a_mask: Boolean,是否对attention进行mask
        :param position_bias: type of position bias, 使用指定类型的位置编码对attention里的位置进行偏移
        :return:
        """
        q, k, v = inputs[:3]
        q_mask, v_mask, idx = None, None, 3
        if mask is not None:
            if mask[0] is not None:
                q_mask = K.cast(mask[0], K.floatx())
            if mask[2] is not None:
                v_mask = K.cast(mask[2], K.floatx())
        if a_mask is not None:
            a_mask = inputs[idx]
            idx += 1

        # Linear projections
        qw = self.q_dense(q)
        kw = self.k_dense(k)
        vw = self.v_dense(v)

        # Reshape to (batch, seq_len, heads, size_per_head)
        qw = K.reshape(qw, [-1, K.shape(q)[1], self.head_nums, self.key_size])
        kw = K.reshape(kw, [-1, K.shape(k)[1], self.head_nums, self.key_size])
        vw = K.reshape(vw, [-1, K.shape(v)[1], self.head_nums, self.head_size])
        # Attention scores
        att = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
        # Add relative position bias to the scores
        if position_bias == 'relative':
            position_embeddings = inputs[idx]
            att = att + tf.einsum('bjhd,jkd->bhjk', qw, position_embeddings)

        if self.attention_scale:
            att = att / self.key_size**0.5

        # value mask
        att = sequence_masking(att, v_mask, 'add', -1)
        # attention mask
        if a_mask is not None:
            att = att - (1 - a_mask) * 1e12

        att = K.softmax(att)
        output = tf.einsum('bhjk,bkhd->bjhd', att, vw)
        # Add relative position bias to the output
        if position_bias == 'relative':
            output = output + tf.einsum('bhjk,jkd->bjhd', att,
                                        position_embeddings)
        output = K.reshape(output, (-1, K.shape(output)[1], self.output_dim))
        output = self.combine_dense(output)
        # query mask
        output = sequence_masking(output, q_mask, 'mul')
        return output
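A quick NumPy sanity check of the two einsum contractions above (hypothetical sizes, not part of the original layer):

import numpy as np

# batch=2, query_len=3, key_len=4, heads=2, key_size=head_size=5 (assumed)
qw = np.random.rand(2, 3, 2, 5)
kw = np.random.rand(2, 4, 2, 5)
vw = np.random.rand(2, 4, 2, 5)
att = np.einsum('bjhd,bkhd->bhjk', qw, kw)
print(att.shape)  # (2, 2, 3, 4): a (query_len, key_len) score matrix per head
out = np.einsum('bhjk,bkhd->bjhd', att, vw)
print(out.shape)  # (2, 3, 2, 5): per-head context vectors, ready to reshape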
Example #2
 def sparse_accuracy(self, y_true, y_pred):
     """训练过程中显示逐帧准确率的函数,排除了mask的影响
     此处y_true需要是整数形式(非one hot)
     """
     # Derive the mask from the scores and cast its dtype
     mask = K.all(K.greater(y_pred, -1e6), axis=2)
     mask = K.cast(mask, K.floatx())
     # Re-assert y_true's shape and dtype
     y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
     y_true = K.cast(y_true, 'int32')
     # Per-step argmax as a rough measure of training quality
     y_pred = K.cast(K.argmax(y_pred, 2), 'int32')
     isequal = K.cast(K.equal(y_true, y_pred), K.floatx())
     return K.sum(isequal * mask) / K.sum(mask)
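A toy NumPy check of the masked accuracy (hypothetical values; the padded step is assumed to hold very negative scores, which is what the mask test above detects):

import numpy as np

y_pred = np.array([[[2., 1., 0.], [0., 2., 1.], [1., 0., 2.],
                    [-1e7, -1e7, -1e7]]])        # last step is padding
y_true = np.array([[0, 1, 0, 0]])
mask = np.all(y_pred > -1e6, axis=2).astype('float32')    # [[1. 1. 1. 0.]]
isequal = (y_pred.argmax(2) == y_true).astype('float32')  # [[1. 1. 0. 1.]]
print((isequal * mask).sum() / mask.sum())  # 0.666..., padding excluded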
Example #3
 def get_labels_of_similarity(self, inputs):
     # Adjacent rows in the batch are treated as a similar pair: the index
     # trick maps [0, 1, 2, 3, ...] to [1, 0, 3, 2, ...], so labels[i, j]
     # is 1 exactly when j is i's partner.
     idx = K.arange(0, K.shape(inputs)[0])
     idx_1 = idx[None, :]
     idx_2 = (idx + 1 - idx % 2 * 2)[:, None]
     labels = K.equal(idx_1, idx_2)
     labels = K.cast(labels, K.floatx())
     return labels
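A plain NumPy illustration of the pairing trick (hypothetical batch size 6):

import numpy as np

idx = np.arange(6)
print(idx + 1 - idx % 2 * 2)  # [1 0 3 2 5 4]: each index mapped to its partner
labels = (idx[None, :] == (idx + 1 - idx % 2 * 2)[:, None]).astype(float)
print(labels)  # 1 exactly where column j is row i's adjacent partner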
Example #4
def sparse_accuracy(y_true, y_pred):
    # Re-assert y_true's shape and dtype
    y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
    y_true = K.cast(y_true, 'int32')
    # Compute accuracy
    y_pred = K.cast(K.argmax(y_pred, axis=2), 'int32')
    return K.mean(K.cast(K.equal(y_true, y_pred), K.floatx()))
Example #5
        def get_updates(self, loss, params):
            # Whether this step applies the accumulated update (every grad_accum_steps iterations)
            cond = K.equal(self.iterations % self.grad_accum_steps, 0)
            cond = K.cast(cond, K.floatx())
            # Gradients for the current batch
            grads = self.get_gradients(loss, params)
            self.accum_grads = [
                K.zeros(shape=K.int_shape(p),
                        dtype=K.dtype(p),
                        name='accum_grad_{}'.format(i))
                for i, p in enumerate(params)
            ]

            # Temporarily monkey-patch K.update so the parent optimizer's
            # parameter updates only take effect when cond == 1; otherwise
            # each variable keeps its old value.
            old_update = K.update

            def new_update(x, new_x):
                new_x = cond * new_x + (1 - cond) * x
                return old_update(x, new_x)

            K.update = new_update
            updates = super(NewOptimizer, self).get_updates(loss, params)
            K.update = old_update

            # Accumulate gradients: ag <- ag + g normally; on update steps (cond == 1) the accumulator resets to the fresh gradient
            with K.control_dependencies(updates):
                acc_updates = [
                    K.update(ag, g + (1 - cond) * ag)
                    for ag, g in zip(self.accum_grads, grads)
                ]

            return acc_updates
Example #6
 def get_labels_of_similarity(self, y_pred):
     idxs = K.arange(0, K.shape(y_pred)[0])
     idxs_1 = idxs[None, :]
     idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
     labels = K.equal(idxs_1, idxs_2)
     labels = K.cast(labels, K.floatx())
     return labels
Example #7
        def parse_func(serialized_record):
            feature_description = {
                'token_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
                'mask_ids': tf.io.FixedLenFeature([seq_length], tf.int64)
            }
            features = tf.io.parse_single_example(serialized_record,
                                                  feature_description)
            token_ids = features['token_ids']
            mask_ids = features['mask_ids']
            segment_ids = K.zeros_like(token_ids, dtype='int64')
            is_masked = K.not_equal(mask_ids, 0)
            masked_token_ids = K.switch(mask_ids, mask_ids - 1,
                                        token_ids)  # ids were shifted up by 1 to make room for unmask_id; subtract 1 to restore

            x = {
                'Input-Token': masked_token_ids,
                'Input-Segment': segment_ids,
                'token_ids': token_ids,
                'is_masked': K.cast(is_masked, K.floatx())
            }
            y = {
                'mlm_loss': K.zeros_like([1], tf.float32),
                'mlm_acc': K.zeros_like([1], tf.float32)
            }
            return x, y
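A hedged sketch of how parse_func would typically be wired into a tf.data pipeline (file_names and batch_size as defined in the training script; the wiring itself is assumed, not from the original snippet):

dataset = tf.data.TFRecordDataset(file_names)
dataset = dataset.map(parse_func, num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(batch_size, drop_remainder=True)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)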
Example #8
 def compute_position_ids(self, inputs):
     """T5的相对位置分桶(直接翻译自官方T5源码)
     i-i:   0 1 2 3 4 5 6 7 8 9 10 11 12 13 14...
     f(i-j):0 1 2 3 4 5 6 7 8 8 8  8  9   9  9 ...
     """
     q, v = inputs
     # Position differences
     q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
     q_idxs = K.expand_dims(q_idxs, 1)
     v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
     v_idxs = K.expand_dims(v_idxs, 0)
     pos_ids = v_idxs - q_idxs
     # Post-processing: map each difference to a bucket
     num_buckets, max_distance = self.input_dim, self.max_distance
     ret = 0
     n = -pos_ids
     if self.bidirectional:
         num_buckets //= 2
         ret += K.cast(K.less(n, 0), 'int32') * num_buckets
         n = K.abs(n)
     else:
         n = K.maximum(n, 0)
     # now n is in the range [0, inf)
     max_exact = num_buckets // 2
     is_small = K.less(n, max_exact)
     val_if_large = max_exact + K.cast(
         K.log(K.cast(n, K.floatx()) / max_exact) /
         np.log(max_distance / max_exact) * (num_buckets - max_exact),
         'int32',
     )
     val_if_large = K.minimum(val_if_large, num_buckets - 1)
     ret += K.switch(is_small, n, val_if_large)
     return ret
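A worked NumPy check of the bucketing, using T5's defaults after the bidirectional halving (16 buckets per direction, max_distance=128; these concrete values are assumptions):

import numpy as np

num_buckets, max_distance = 16, 128
n = np.arange(15)
max_exact = num_buckets // 2  # distances 0..7 get their own bucket
val_if_large = max_exact + (
    np.log(np.maximum(n, 1) / max_exact) /
    np.log(max_distance / max_exact) * (num_buckets - max_exact)
).astype('int32')
val_if_large = np.minimum(val_if_large, num_buckets - 1)
print(np.where(n < max_exact, n, val_if_large))
# [0 1 2 3 4 5 6 7 8 8 8 8 9 9 9] -- reproduces the docstring table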
Example #9
 def get_label_mask(self, y_true):
     """获取batch内相同label样本"""
     label = K.cast(y_true, 'int32')
     label_2 = K.reshape(label, (1, -1))
     mask = K.equal(label_2, label)
     mask = K.cast(mask, K.floatx())
     mask = mask * (1 - K.eye(K.shape(y_true)[0]))  # zero the diagonal, i.e. i == j
     return mask
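A small NumPy illustration of the resulting mask (hypothetical labels):

import numpy as np

y_true = np.array([0, 1, 0, 2])
mask = (y_true.reshape(1, -1) == y_true.reshape(-1, 1)).astype(float)
mask *= 1 - np.eye(4)  # drop i == j
print(mask)  # only positions (0, 2) and (2, 0) are 1: the shared-label pair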
Example #10
 def mlm_acc(inputs):
     """计算准确率的函数,需要封装为一个层
     """
     y_true, y_pred, mask = inputs
     #         _, y_pred = y_pred
     y_true = K.cast(y_true, K.floatx())
     acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
     acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
     return acc
Example #11
 def compute_loss(self, inputs, mask=None):
     y_true, y_pred = inputs
     y_mask = K.cast(K.not_equal(y_true, 0), K.floatx())
     accuracy = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
     accuracy = K.sum(accuracy * y_mask) / K.sum(y_mask)
     self.add_metric(accuracy, name='accuracy')
     loss = K.sparse_categorical_crossentropy(y_true, y_pred)
     loss = K.sum(loss * y_mask) / K.sum(y_mask)
     return loss
Example #12
 def call(self, x, mask=None):
     x0 = x
     x = self.k_dense(x0)
     x = self.o_dense(x)  # per-step attention scores
     if mask is not None:
         mask = K.cast(mask, K.floatx())
         mask = K.expand_dims(mask, 2)
         x = x - (1 - mask) * 1e12  # push padded steps to ~-inf before softmax
     x = K.softmax(x, 1)   # attention weights over the time axis
     x = K.sum(x0 * x, 1)  # weighted sum pools the sequence to one vector
     return x
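A NumPy sketch of this attention pooling, assuming o_dense projects each step to a single score (an assumption about the layer's configuration):

import numpy as np

x0 = np.random.rand(2, 5, 8)      # (batch, time, dim)
scores = np.random.rand(2, 5, 1)  # stand-in for o_dense(k_dense(x0))
e = np.exp(scores - scores.max(axis=1, keepdims=True))
w = e / e.sum(axis=1, keepdims=True)   # softmax over the time axis
print((x0 * w).sum(axis=1).shape)      # (2, 8): sequence pooled to one vector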
Example #13
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.learning_rate
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [
            K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_' + str(i))
            for (i, p) in enumerate(params)
        ]
        vs = [
            K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_' + str(i))
            for (i, p) in enumerate(params)
        ]

        if self.amsgrad:
            vhats = [
                K.zeros(K.int_shape(p),
                        dtype=K.dtype(p),
                        name='vhat_' + str(i)) for (i, p) in enumerate(params)
            ]
        else:
            vhats = [
                K.zeros(1, name='vhat_' + str(i)) for i in range(len(params))
            ]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            # Note: the second moment tracks the variance of g around its
            # EMA m_t (AdaBelief-style), not the raw g**2 of vanilla Adam.
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g - m_t)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #14
    def call(self, x, mask=None):
        x0 = x
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            mask = K.expand_dims(mask, 2)
        #         x = x0 * mask if mask is not None else x0
        x0 = Lambda(lambda x_: x_, output_shape=lambda s: s)(x0)  # identity Lambda drops the Keras mask so it is not propagated into conv1d
        x = self.conv1d(x0)
        x, g = x[:, :, :self.o_dim], x[:, :, self.o_dim:]
        if self.dropout_rate is not None:
            g = K.in_train_phase(K.dropout(g, self.dropout_rate), g)
        g = K.sigmoid(g)
        # fall back to an all-ones mask when none was given
        mask = mask if mask is not None else K.ones_like(x)

        if self.skip_connection:
            if K.int_shape(x0)[-1] != self.o_dim:
                x0 = self.conv1d_1x1(x0)
            return (x0 * (1 - g) + x * g) * mask
        return x * g * mask
Example #15
 def dense_loss(self, y_true, y_pred):
     """y_true需要是one hot形式
     """
     # Derive the mask from the scores and cast its dtype
     mask = K.all(K.greater(y_pred, -1e6), axis=2, keepdims=True)
     mask = K.cast(mask, K.floatx())
     # Score of the target path
     y_true, y_pred = y_true * mask, y_pred * mask
     target_score = self.path_score(y_pred, y_true)
     # Recursively compute log Z
     init_states = [y_pred[:, 0]]
     y_pred = K.concatenate([y_pred, mask], axis=2)
     input_length = K.int_shape(y_pred[:, 1:])[1]
     log_norm, _, _ = K.rnn(self.log_norm_step,
                            y_pred[:, 1:],
                            init_states,
                            input_length=input_length)  # log Z vector at the last step
     log_norm = K.logsumexp(log_norm, 1)  # logsumexp down to a scalar per sample
     # loss = -log p = log Z - target path score
     return log_norm - target_score
Example #16
def build_transformer_model_with_mlm():
    """带mlm的bert模型
    """
    bert = build_transformer_model(
        config_path,
        with_mlm='linear',
        #         with_nsp=True,
        model='bert',
        return_keras_model=False,
        #         keep_tokens=keep_tokens
    )
    proba = bert.model.output
    #     print(proba)
    # Auxiliary inputs
    token_ids = Input(shape=(None, ), dtype='int64', name='token_ids')  # target ids
    is_masked = Input(shape=(None, ), dtype=K.floatx(),
                      name='is_masked')  # mask flags

    #     nsp_label = Input(shape=(None, ), dtype='int64', name='nsp')  # nsp

    def mlm_loss(inputs):
        """计算loss的函数,需要封装为一个层
        """
        y_true, y_pred, mask = inputs
        #         _, y_pred = y_pred
        loss = K.sparse_categorical_crossentropy(y_true,
                                                 y_pred,
                                                 from_logits=True)
        loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
        return loss

    def nsp_loss(inputs):
        """计算nsp loss的函数,需要封装为一个层
        """
        y_true, y_pred = inputs
        #         y_pred, _ = y_pred
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.mean(loss)
        return loss

    def mlm_acc(inputs):
        """计算准确率的函数,需要封装为一个层
        """
        y_true, y_pred, mask = inputs
        #         _, y_pred = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
        return acc

    def nsp_acc(inputs):
        """计算准确率的函数,需要封装为一个层
        """
        y_true, y_pred = inputs
        y_pred, _ = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.mean(acc)
        return acc

    mlm_loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    mlm_acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])
    #     nsp_loss = Lambda(nsp_loss, name='nsp_loss')([nsp_label, proba])
    #     nsp_acc = Lambda(nsp_acc, name='nsp_acc')([nsp_label, proba])

    train_model = Model(bert.model.inputs + [token_ids, is_masked],
                        [mlm_loss, mlm_acc])

    loss = {
        'mlm_loss': lambda y_true, y_pred: y_pred,
        'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        #         'nsp_loss': lambda y_true, y_pred: y_pred,
        #         'nsp_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
    }

    return bert, train_model, loss
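A hedged usage sketch (assumed, not from the original): each entry in the returned loss dict is an identity on the model output, so the targets fed at fit time are dummies, matching the zero-valued y dict built in parse_func above.

bert, train_model, loss = build_transformer_model_with_mlm()
train_model.compile(loss=loss, optimizer='adam')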
Example #17
def build_transformer_model_with_mlm(version='pre'):
    """带mlm的bert模型
    """
    assert version in ['pre', 'post', 'rezero']
    if version == 'rezero':
        attention_name = 'Transformer-%d-MultiHeadSelfAttention'
        feed_forward_name = 'Transformer-%d-FeedForward'
        skip_weights = []
        for i in range(12):
            skip_weights.append(feed_forward_name % i + '-Norm')
            skip_weights.append(feed_forward_name % i + '-ReWeight')
            skip_weights.append(attention_name % i + '-Norm')
            skip_weights.append(attention_name % i + '-ReWeight')

        bert = build_transformer_model(
            config_path,
            with_mlm='linear',
            model='rezero',
            return_keras_model=False,
            skip_weights_from_checkpoints=skip_weights,
            use_layernorm=None,
            reweight_trainable=True,
            init_reweight=0.,
        )
    else:
        bert = build_transformer_model(
            config_path,
            with_mlm='linear',
            model='rezero',
            return_keras_model=False,
            #         skip_weights_from_checkpoints=skip_weights,
            use_layernorm=version,
            reweight_trainable=False,
            init_reweight=1.,
        )

    proba = bert.model.output
    #     print(proba)
    # Auxiliary inputs
    token_ids = Input(shape=(None, ), dtype='int64', name='token_ids')  # target ids
    is_masked = Input(shape=(None, ), dtype=K.floatx(),
                      name='is_masked')  # mask flags

    #     nsp_label = Input(shape=(None, ), dtype='int64', name='nsp')  # nsp

    def mlm_loss(inputs):
        """计算loss的函数,需要封装为一个层
        """
        y_true, y_pred, mask = inputs
        #         _, y_pred = y_pred
        loss = K.sparse_categorical_crossentropy(y_true,
                                                 y_pred,
                                                 from_logits=True)
        loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
        return loss

    def nsp_loss(inputs):
        """计算nsp loss的函数,需要封装为一个层
        """
        y_true, y_pred = inputs
        #         y_pred, _ = y_pred
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.mean(loss)
        return loss

    def mlm_acc(inputs):
        """计算准确率的函数,需要封装为一个层
        """
        y_true, y_pred, mask = inputs
        #         _, y_pred = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
        return acc

    def nsp_acc(inputs):
        """计算准确率的函数,需要封装为一个层
        """
        y_true, y_pred = inputs
        y_pred, _ = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.mean(acc)
        return acc

    mlm_loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    mlm_acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])
    #     nsp_loss = Lambda(nsp_loss, name='nsp_loss')([nsp_label, proba])
    #     nsp_acc = Lambda(nsp_acc, name='nsp_acc')([nsp_label, proba])

    train_model = Model(bert.model.inputs + [token_ids, is_masked],
                        [mlm_loss, mlm_acc])

    loss = {
        'mlm_loss': lambda y_true, y_pred: y_pred,
        'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        #         'nsp_loss': lambda y_true, y_pred: y_pred,
        #         'nsp_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
    }

    return bert, train_model, loss
Example #18
def normal_shannon_entropy(p, labels_num=num_classes):
    # normalized entropy
    p = K.cast(p, K.floatx())
    norm = K.log(1. / labels_num)
    s = K.sum(p * K.log(p), axis=-1, keepdims=True)
    return s / norm
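A quick NumPy check (hypothetical, labels_num=4; note p must be strictly positive to avoid log(0)): the uniform distribution has maximal entropy, so its normalized value is 1, while a peaked one approaches 0.

import numpy as np

p = np.array([[0.25, 0.25, 0.25, 0.25],
              [0.97, 0.01, 0.01, 0.01]])
norm = np.log(1. / 4)
s = np.sum(p * np.log(p), axis=-1, keepdims=True)
print(s / norm)  # ~[[1.0], [0.121]]: both sums are negative, ratio in [0, 1]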
Example #19
import os

os.environ['TF_KERAS'] = '1'  # tf.keras must be used

import glob

import tensorflow as tf
from toolkit4nlp.backend import K, keras
from toolkit4nlp.models import build_transformer_model
from toolkit4nlp.optimizers import Adam, extend_with_gradient_accumulation, extend_with_wight_decay
from toolkit4nlp.optimizers import extend_with_piecewise_linear_lr
from keras.models import Model
from toolkit4nlp.layers import Input, Lambda
from preprocess import TrainingDataSetRoBERTa

floatx = K.floatx()

config = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_config.json'
ckpt = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_model.ckpt'
vocab = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/vocab.txt'

model_save_path = '../saved_model/bert_model.ckpt'

file_names = glob.glob('../corpus_record/*')

seq_length = 512
batch_size = 8
learning_rate = 0.00176
weight_decay_rate = 0.01
num_warmup_steps = 3125
num_train_steps = 125000
Example #20
 def compute_classification_acc(self, inputs, mask=None):
     _, _, y_pred, _, y_true = inputs
     equal = K.equal(K.cast(K.argmax(y_pred, axis=-1), 'int32'),
                     K.cast(y_true, 'int32'))
     # per-sample correctness divided by the batch size; summing the result
     # over the batch yields the classification accuracy
     return K.cast(equal, K.floatx()) / K.cast(
         K.shape(y_true)[0], K.floatx())
Example #21
    def call(self, inputs, mask=None):
        # only computes a loss; the inputs pass through unchanged
        if mask is not None:
            mask = K.cast(mask, K.floatx())

        return sequence_masking(inputs, mask, 1, 1)
Example #22
 def call(self, inputs, mask=None):
     if mask is not None:
         mask = K.cast(mask, K.floatx())
         mask = K.expand_dims(mask, 2)
         inputs = inputs - (1.0 - mask) * 1e12
     return K.softmax(inputs, 1)
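A toy NumPy check of the masked softmax (hypothetical input; the last timestep is padding, so its weight collapses to ~0 and the remaining steps renormalize):

import numpy as np

inputs = np.array([[[1.], [2.], [3.]]])
mask = np.array([[1., 1., 0.]])[..., None]
scores = inputs - (1.0 - mask) * 1e12
e = np.exp(scores - scores.max(axis=1, keepdims=True))
print(e / e.sum(axis=1, keepdims=True))  # ~[[[0.269], [0.731], [0.]]]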