Example #1
 def call(self, inputs):
     source, target = inputs
     mask = K.random_binomial(shape=[1], p=0.5)
     output = mask * source + (1 - mask) * target
     return K.in_train_phase(output, target)
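For context, a minimal self-contained wrapper around this call() might look like the sketch below; the class name and surrounding definition are assumptions, only the call() body comes from the example. During training the layer passes through a whole-tensor coin flip between source and target; at inference it always returns target.

# Hypothetical wrapper; only call() is taken from the example above.
from toolkit4nlp.backend import keras, K

class RandomSwitch(keras.layers.Layer):
    def call(self, inputs):
        source, target = inputs
        # single Bernoulli draw broadcast over the whole tensor: 0. or 1.
        mask = K.random_binomial(shape=[1], p=0.5)
        output = mask * source + (1 - mask) * target
        # training: the random mix; inference: always target
        return K.in_train_phase(output, target)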
Example #2
 def _resource_apply_sparse(self, grad, var, indices):
     grad = tf.IndexedSlices(grad, indices, K.shape(var))
     grad = tf.convert_to_tensor(grad)
     return self._resource_apply_dense(grad, var)
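For reference, a hedged standalone sketch (toy values, TF2) of the densify step this method relies on: scatter the sparse gradient rows back into a full-size tensor, then reuse the dense update path.

# Hypothetical illustration of the sparse-to-dense trick used above.
import tensorflow as tf

values = tf.constant([[1., 1.], [2., 2.]])
indices = tf.constant([0, 2], dtype=tf.int64)
sparse = tf.IndexedSlices(values, indices, tf.constant([4, 2], dtype=tf.int64))
dense = tf.convert_to_tensor(sparse)  # rows 1 and 3 are all zeros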
Example #3
        def _decayed_lr(self, var_dtypes):
            """重写获取decayed learning rate 方法"""

            lr_t = super(NewOptimzer, self)._decayed_lr(var_dtypes)
            lr_rate = piecewise_linear(self.iterations, self.lr_schedule)
            return lr_t * K.cast(lr_rate, var_dtypes)
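Presumably lr_schedule is a {step: multiplier} map consumed by piecewise_linear, as in similar toolkits; the exact format is an assumption. A hypothetical schedule:

# Hypothetical schedule (assumption): warm up to 1x over the first
# 1000 steps, then decay to 0.1x by step 10000.
lr_schedule = {1000: 1.0, 10000: 0.1}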
Example #4
def normal_shannon_entropy(p, labels_num=num_classes):
    # normalized Shannon entropy: 1 for a uniform distribution, ~0 for one-hot
    p = K.cast(p, K.floatx())
    norm = K.log(1. / labels_num)
    s = K.sum(p * K.log(p + K.epsilon()), axis=-1, keepdims=True)  # epsilon guards log(0)
    return s / norm
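A quick sanity check (sketch; assumes TF2 eager execution and a num_classes value): the uniform distribution has normalized entropy 1, a near-one-hot distribution is close to 0.

# Sketch: uniform distribution should give ~1.0.
import numpy as np
num_classes = 4  # assumption for this check
p = K.constant(np.full((1, num_classes), 1. / num_classes))
print(K.eval(normal_shannon_entropy(p, num_classes)))  # ~1.0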
Example #5
 def call(self, inputs):
     clf, x_pre, x_next = inputs
     # normalized entropy of the classifier output measures prediction uncertainty
     uncertain = normal_shannon_entropy(clf, num_classes)
     # FastBERT-style early exit: low uncertainty keeps the shallow output x_pre
     cond = K.greater(self.speed, uncertain)
     x = K.switch(cond, x_pre, x_next)
     # training always uses the deeper branch
     return K.in_train_phase(x_next, x)
Example #6
"""
import json, os

import numpy as np
import tensorflow as tf
from toolkit4nlp.backend import keras, K
from toolkit4nlp.models import build_transformer_model
from toolkit4nlp.tokenizers import Tokenizer
from toolkit4nlp.optimizers import Adam, extend_with_gradient_accumulation, extend_with_weight_decay
from toolkit4nlp.utils import pad_sequences, DataGenerator
from toolkit4nlp.layers import Layer, Dense, Permute, Input, Lambda, Dropout
from toolkit4nlp.layers import AttentionPooling1D, DGCNN, SinCosPositionEmbedding
from toolkit4nlp.models import Model
from tqdm import tqdm

K.clear_session()
# basic settings
maxlen = 512
epochs = 5
batch_size = 4
learning_rate = 2e-5

# BERT config
config_path = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/vocab.txt'


def load_data(filename):
    D = []
    for d in json.load(open(filename))['data'][0]['paragraphs']:
Example #7
 def call(self, inputs, mask=None):
     if mask is not None:
         mask = K.cast(mask, K.floatx())
         mask = K.expand_dims(mask, 2)
         inputs = inputs - (1.0 - mask) * 1e12
     return K.softmax(inputs, 1)
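The mask trick above drives padded logits to -1e12 so softmax assigns them essentially zero weight; a tiny sketch:

# Sketch: the additive mask sends padded positions to ~0 probability.
scores = K.constant([[2., 1., 3.]])
pad_mask = K.constant([[1., 1., 0.]])   # last position is padding
masked = scores - (1.0 - pad_mask) * 1e12
print(K.eval(K.softmax(masked)))        # third weight is ~0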
Example #8
 def call(self, inputs):
     maxlen = K.shape(inputs)[-1]
     token_emb = self.token_emb(inputs)
     pos = tf.range(start=0, limit=maxlen, delta=1)
     pos_emb = self.pos_emb(pos)
     return token_emb + pos_emb
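This matches the familiar token-plus-position embedding pattern; a self-contained version under an assumed constructor (the __init__ is not shown in the example):

# Hypothetical constructor to go with the call() above.
from toolkit4nlp.backend import keras, K
import tensorflow as tf

class TokenAndPositionEmbedding(keras.layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super(TokenAndPositionEmbedding, self).__init__(**kwargs)
        self.token_emb = keras.layers.Embedding(vocab_size, embed_dim)
        self.pos_emb = keras.layers.Embedding(maxlen, embed_dim)

    def call(self, inputs):
        maxlen = K.shape(inputs)[-1]
        token_emb = self.token_emb(inputs)           # (batch, seq, dim)
        pos = tf.range(start=0, limit=maxlen, delta=1)
        pos_emb = self.pos_emb(pos)                  # (seq, dim), broadcasts
        return token_emb + pos_emb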
Example #9
 def call(self, inputs, **kwargs):
     return K.bias_add(inputs, self.bias)
 def compute_loss_of_classification(self, inputs, mask=None):
     _, _, y_pred, _, y_true = inputs
     return K.sparse_categorical_crossentropy(y_true, y_pred)
 def compute_classification_acc(self, inputs, mask=None):
     _, _, y_pred, _, y_true = inputs
     equal = K.equal(K.cast(K.argmax(y_pred, axis=-1), 'int32'),
                     K.cast(y_true, 'int32'))
     # per-sample correctness divided by batch size; summing over the batch
     # yields the batch accuracy
     return K.cast(equal, K.floatx()) / K.cast(
         K.shape(y_true)[0], K.floatx())
            if label > 0:
                if label % 3 == 1:
                    starting = True
                    entities.append([[i], id2label[(label - 1) // 3]])
                elif starting:
                    entities[-1][0].append(i)
                else:
                    starting = False
            else:
                starting = False

        return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
                for w, l in entities]


NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])


def evaluate(data):
    """评测函数
    """
    X, Y, Z = 1e-10, 1e-10, 1e-10
    for d in tqdm(data):
        text = ''.join([i[0] for i in d])
        R = set(NER.recognize(text))
        T = set([tuple(i) for i in d if i[1] != 'O'])
        X += len(R & T)
        Y += len(R)
        Z += len(T)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall
def normal_noise(label, scale=0.1):
    # add Gaussian noise to create fake softened labels
    normal_noise = np.random.normal(scale=scale, size=(num_classes,))
    new_label = label + normal_noise
    new_label = K.softmax(new_label / Temperature).numpy()
    return new_label
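A hypothetical usage check (assumes the num_classes and Temperature globals and TF2 eager execution, since K.softmax(...).numpy() is called):

# Sketch: the noisy soft label usually keeps the original argmax.
num_classes, Temperature = 4, 10   # assumed globals
label = np.eye(num_classes)[1]     # one-hot class 1
print(normal_noise(label))         # soft distribution, slightly peaked at class 1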
        epochs=5,
        callbacks=[teacher_evaluator]
    )

    # create softened labels
    teacher_soften.load_weights('best_teacher.weights')

    y_train_logits = []
    y_train = []
    for x, label in tqdm(train_generator):
        y_train_logits.append(teacher_logits.predict(x))
        y_train.append(label)

    y_train_logits = np.concatenate(y_train_logits)
    y_train = np.concatenate(y_train)
    y_soften = K.softmax(y_train_logits / Temperature).numpy()
    new_y_train = np.concatenate([y_train, y_soften], axis=-1)

    # create fake softened-label datasets from Gaussian noise
    # new_data = [[d[0], d[1], normal_noise(d[1])] for d in train_data]
    # student_data_generator = StudentDataGenerator(new_data, batch_size)

    # create new datasets
    new_data = [[d[0], d[1], y_soften[i].tolist()] for i, d in enumerate(train_data)]
    student_data_generator = StudentDataGenerator(new_data, batch_size)

    # check softened-label accuracy
    if_correct = [np.array(d[1]).argmax() == np.array(d[2]).argmax() for d in new_data]
    correct = [t for t in if_correct if t]
    print('soften labels acc is: ', float(len(correct)) / len(if_correct))
Example #15
def build_transformer_model_with_mlm():
    """带mlm的bert模型
    """
    bert = build_transformer_model(
        config_path,
        with_mlm='linear',
        #         with_nsp=True,
        model='bert',
        return_keras_model=False,
        #         keep_tokens=keep_tokens
    )
    proba = bert.model.output
    #     print(proba)
    # auxiliary inputs
    token_ids = Input(shape=(None, ), dtype='int64', name='token_ids')  # target token ids
    is_masked = Input(shape=(None, ), dtype=K.floatx(),
                      name='is_masked')  # mask flags

    #     nsp_label = Input(shape=(None, ), dtype='int64', name='nsp')  # nsp

    def mlm_loss(inputs):
        """计算loss的函数,需要封装为一个层
        """
        y_true, y_pred, mask = inputs
        #         _, y_pred = y_pred
        loss = K.sparse_categorical_crossentropy(y_true,
                                                 y_pred,
                                                 from_logits=True)
        loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
        return loss

    def nsp_loss(inputs):
        """计算nsp loss的函数,需要封装为一个层
        """
        y_true, y_pred = inputs
        #         y_pred, _ = y_pred
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.mean(loss)
        return loss

    def mlm_acc(inputs):
        """计算准确率的函数,需要封装为一个层
        """
        y_true, y_pred, mask = inputs
        #         _, y_pred = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
        return acc

    def nsp_acc(inputs):
        """计算准确率的函数,需要封装为一个层
        """
        y_true, y_pred = inputs
        #         y_pred, _ = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.mean(acc)
        return acc

    mlm_loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    mlm_acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])
    #     nsp_loss = Lambda(nsp_loss, name='nsp_loss')([nsp_label, proba])
    #     nsp_acc = Lambda(nsp_acc, name='nsp_acc')([nsp_label, proba])

    train_model = Model(bert.model.inputs + [token_ids, is_masked],
                        [mlm_loss, mlm_acc])

    loss = {
        'mlm_loss': lambda y_true, y_pred: y_pred,
        'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        #         'nsp_loss': lambda y_true, y_pred: y_pred,
        #         'nsp_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
    }

    return bert, train_model, loss
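A hedged usage sketch: each model output already is its loss/metric, so the targets passed to fit can be dummies (assumes config_path is set and the imports above).

# Sketch (assumed training setup).
bert, train_model, loss = build_transformer_model_with_mlm()
train_model.compile(loss=loss, optimizer=Adam(learning_rate))
# the lambda losses ignore y_true, so dummy zero arrays work as fit() targets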
Example #16
    def compute_output_shape(self, input_shape):
        if self._mode == 'embedding':
            return super(Embedding, self).compute_output_shape(input_shape)

        return input_shape[:2] + (K.int_shape(self.embeddings)[0], )
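In the non-embedding mode the layer reuses the (transposed) embedding matrix to project hidden states back onto the vocabulary, hence the output shape (batch, seq_len, vocab_size), where vocab_size = K.int_shape(self.embeddings)[0].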
Example #17
        if self.lr_multiplier != 1:
            return self._kernel * self.lr_multiplier
        return self._kernel

    def call(self, inputs):
        return super(ScaleDense, self).call(inputs)


# load the pretrained model (12 layers)
predecessor = build_transformer_model(config_path=config_path,
                                      checkpoint_path=checkpoint_path,
                                      return_keras_model=False,
                                      prefix='Predecessor-')

# classifier head
x_in = Input(shape=K.int_shape(predecessor.output)[1:])
x = Lambda(lambda x: x[:, 0])(x_in)
x = Dense(units=num_classes, activation='softmax')(x)
classifier = Model(x_in, x)

predecessor_model = Model(predecessor.inputs, classifier(predecessor.output))
predecessor_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
predecessor_model.summary()

# predecessor_model_3
output = predecessor_model.layers[31].output  # output of the 3rd transformer layer
output = Lambda(lambda x: x[:, 0])(output)
Example #18
 def call(self, inputs):
     relative_position_idx = self.compute_position_idx(inputs)
     # look up the relative-position embeddings by index
     return K.gather(self.embeddings, relative_position_idx)
Example #19
 def call(self, x):
     seq, vec = x                                # seq: (batch, T, d1); vec: (batch, d2)
     vec = K.expand_dims(vec, 1)                 # (batch, 1, d2)
     vec = K.tile(vec, [1, K.shape(seq)[1], 1])  # (batch, T, d2)
     return K.concatenate([seq, vec], 2)         # (batch, T, d1 + d2)
Example #20
    def call(self, inputs):
        #     PE_2i(p) = sin(p/10000^(2i/d_pos))
        #     PE_2i+1(p) = cos(p/10000^(2i/d_pos))
        batch_size, seq_len, word_emb_dim = K.shape(inputs)[0], K.shape(
            inputs)[1], K.shape(inputs)[2]
        if not self.embedding_dim or self.method == 'add':
            self.embedding_dim = word_emb_dim
        t = 2 * K.arange(self.embedding_dim / 2, dtype='float32') / K.cast(
            self.embedding_dim, dtype='float32')
        embedding_wise_pos = 1. / K.pow(
            10000., t)  # 1/10000 ^(2i/d_pos) , shape = (p_dim/2, )
        embedding_wise_pos = K.expand_dims(embedding_wise_pos,
                                           0)  # (1, p_dim/2)
        word_wise_pos = K.cumsum(K.ones_like(inputs[:, :, 0]),
                                 axis=1)  # shape = [batch_size, seq_len]
        word_wise_pos = K.expand_dims(word_wise_pos,
                                      2)  # (batch_size, seq_len, 1)
        position_embedding = K.dot(
            word_wise_pos,
            embedding_wise_pos)  # (batch_size, seq_len, p_dim/2)

        position_embedding = K.expand_dims(position_embedding, 3)
        position_embedding = K.reshape(K.concatenate(
            [K.sin(position_embedding),
             K.cos(position_embedding)], axis=-1),
                                       shape=(batch_size, seq_len, -1))

        if self.method == 'add':
            return inputs + position_embedding

        return K.concatenate([inputs, position_embedding], axis=-1)
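A hedged usage sketch; the constructor arguments are inferred from the attributes used in call() above (method, embedding_dim) and may not match the real signature.

# Hypothetical usage: inject sinusoidal positions into a (batch, seq, 128) input.
x_in = Input(shape=(None, 128))
x_add = SinCosPositionEmbedding(method='add')(x_in)                       # same width
x_cat = SinCosPositionEmbedding(method='concat', embedding_dim=64)(x_in)  # width 128 + 64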
Example #21
train_generator = data_generator(data=train_data, batch_size=batch_size)
valid_generator = data_generator(data=valid_data, batch_size=batch_size)
train_transfer_generator = data_generator(data=train_data,
                                          batch_size=batch_size,
                                          transfer=True,
                                          data_augmentation=True)

# load the pretrained model (3 layers)
teacher = build_transformer_model(config_path=config_path,
                                  checkpoint_path=checkpoint_path,
                                  return_keras_model=False,
                                  num_hidden_layers=num_hidden_layers,
                                  model='bert')

# classifier head
x_in = Input(shape=K.int_shape(teacher.output)[1:])
x = Lambda(lambda x: x[:, 0])(x_in)
x = Dense(units=num_classes, activation='softmax')(x)
classifier = Model(x_in, x)

teacher_model = Model(teacher.inputs, classifier(teacher.output))
teacher_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(2e-5),  # use a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)

teacher_model.summary()


class FastbertClassifierLayer(Layer):
Example #22
    def call(self, inputs, mask=None, a_mask=None, position_bias=None):
        """
        多头注意力
        :param inputs: [q, k, v, a_mask, position_bias]
        :param mask: [q_mask, v_mask],
            q_mask 对query序列进行mask,针对padding;v_mask对value序列进行mask,防止看到某些位置value,如padding
        :param a_mask: Boolean,是否对attention进行mask
        :param position_bias: type of position bias, 使用指定类型的位置编码对attention里的位置进行偏移
        :return:
        """
        q, k, v = inputs[:3]
        q_mask, v_mask, idx = None, None, 3
        if mask is not None:
            if mask[0] is not None:
                q_mask = K.cast(mask[0], K.floatx())
            if mask[2] is not None:
                v_mask = K.cast(mask[2], K.floatx())
        if a_mask is not None:
            a_mask = inputs[idx]
            idx += 1

        # linear projections
        qw = self.q_dense(q)
        kw = self.k_dense(k)
        vw = self.v_dense(v)

        # reshape to (batch, seq_len, heads, head_size)
        qw = K.reshape(qw, [-1, K.shape(q)[1], self.head_nums, self.key_size])
        kw = K.reshape(kw, [-1, K.shape(k)[1], self.head_nums, self.key_size])
        vw = K.reshape(vw, [-1, K.shape(v)[1], self.head_nums, self.head_size])
        # attention scores
        att = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
        # add relative-position bias to the scores
        if position_bias == 'relative':
            position_embeddings = inputs[idx]
            att = att + tf.einsum('bjhd,jkd->bhjk', qw, position_embeddings)

        if self.attention_scale:
            att = att / self.key_size**0.5

        # value mask
        att = sequence_masking(att, v_mask, 'add', -1)
        # attention mask
        if a_mask is not None:
            att = att - (1 - a_mask) * 1e12

        att = K.softmax(att)
        output = tf.einsum('bhjk,bkhd->bjhd', att, vw)
        # add relative-position bias to the output
        if position_bias == 'relative':
            output = output + tf.einsum('bhjk,jkd->bjhd', att,
                                        position_embeddings)
        output = K.reshape(output, (-1, K.shape(output)[1], self.output_dim))
        output = self.combine_dense(output)
        # query mask
        output = sequence_masking(output, q_mask, 'mul')
        return output
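A quick shape check for the two einsums above (sketch with toy dimensions):

# Sketch: verify the attention einsum shapes.
import tensorflow as tf

b, q_len, k_len, h, d = 2, 5, 7, 8, 16
qw = tf.random.normal([b, q_len, h, d])
kw = tf.random.normal([b, k_len, h, d])
att = tf.einsum('bjhd,bkhd->bhjk', qw, kw)   # (b, h, q_len, k_len)
vw = tf.random.normal([b, k_len, h, d])
out = tf.einsum('bhjk,bkhd->bjhd', att, vw)  # (b, q_len, h, d)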
Example #23
 def __init__(self, speed=0.1, *args, **kwargs):
     super(SwitchTwo, self).__init__(*args, **kwargs)
     self.supports_masking = True
     self.speed = K.constant(speed, dtype=K.floatx())
Example #24
    def call(self, inputs, mask=None):
        # only computes a loss; the input itself passes through unchanged
        if mask is not None:
            mask = K.cast(mask, K.floatx())

        return sequence_masking(inputs, mask, 1, 1)
Example #25
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.learning_rate
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [
            K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_' + str(i))
            for (i, p) in enumerate(params)
        ]
        vs = [
            K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_' + str(i))
            for (i, p) in enumerate(params)
        ]

        if self.amsgrad:
            vhats = [
                K.zeros(K.int_shape(p),
                        dtype=K.dtype(p),
                        name='vhat_' + str(i)) for (i, p) in enumerate(params)
            ]
        else:
            vhats = [
                K.zeros(1, name='vhat_' + str(i)) for i in range(len(params))
            ]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g - m_t)  # variance centered on m_t (AdaBelief-style), not vanilla Adam
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #26
 def dense_accuracy(self, y_true, y_pred):
     """训练过程中显示逐帧准确率的函数,排除了mask的影响
     此处y_true需要是one hot形式
     """
     y_true = K.argmax(y_true, 2)
     return self.sparse_accuracy(y_true, y_pred)
Example #27
 def new_update(x, new_x):
     new_x = K.switch(cond, new_x, x)
     return old_update(x, new_x)
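This hook matches the conditional-update pattern used by optimizer wrappers such as extend_with_gradient_accumulation. A hedged sketch of how it is typically wired inside get_updates (names like grad_accum_steps are assumptions based on that pattern):

# Hypothetical wiring: parameter updates fire only when cond is true,
# e.g. once every grad_accum_steps iterations.
cond = K.equal(self.iterations % self.grad_accum_steps, 0)
old_update = K.update

def new_update(x, new_x):
    new_x = K.switch(cond, new_x, x)   # keep the old value unless cond holds
    return old_update(x, new_x)

K.update = new_update   # monkey-patch while the update ops are built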
Example #28
 def sparse_accuracy(self, y_true, y_pred):
     """训练过程中显示逐帧准确率的函数,排除了mask的影响
     此处y_true需要是整数形式(非one hot)
     """
     # derive the mask and cast its dtype
     mask = K.all(K.greater(y_pred, -1e6), axis=2)
     mask = K.cast(mask, K.floatx())
     # re-assert y_true's shape and dtype
     y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
     y_true = K.cast(y_true, 'int32')
     # take the per-position argmax as a rough training metric
     y_pred = K.cast(K.argmax(y_pred, 2), 'int32')
     isequal = K.cast(K.equal(y_true, y_pred), K.floatx())
     return K.sum(isequal * mask) / K.sum(mask)
Example #29
        mapping = tokenizer.rematch(data, tokens)
        token_ids = tokenizer.tokens_to_ids(tokens)
        segs = [0] * len(token_ids)
        pre = model.predict([[token_ids], [segs]])[0]
        labels = self.decode(pre)

        words = []
        for i, label in enumerate(labels[1:-1]):
            if label < 2 or len(words) == 0:
                words.append([i + 1])
            else:
                words[-1].append(i + 1)
        return [data[mapping[w[0]][0]:mapping[w[-1]][-1] + 1] for w in words]


wordseg = WordSeg(trans=K.eval(CRF.trans), starts=[0], ends=[0])


def evaluate(data):
    """简单评测"""
    total, right = 1e-10, 1e-10
    for true in tqdm(data):
        pre = wordseg.segment(''.join(true))
        w_pre = set(pre)
        w_true = set(true)
        total += len(w_true)
        right += len(w_pre & w_true)

    return right / total

Example #30
 def call(self, inputs):
     source, target = inputs
     source = source * self.proportion
     target = target * (1 - self.proportion)
     output = (source + target) / 2  # proportion-weighted mix, additionally halved
     return K.in_train_phase(output, target)