def call(self, inputs):
    source, target = inputs
    mask = K.random_binomial(shape=[1], p=0.5)
    output = mask * source + (1 - mask) * target
    return K.in_train_phase(output, target)
def _resource_apply_sparse(self, grad, var, indices):
    grad = tf.IndexedSlices(grad, indices, K.shape(var))
    grad = tf.convert_to_tensor(grad)
    return self._resource_apply_dense(grad, var)
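# For illustration, a standalone sketch (made-up values) of what the conversion
# above does: tf.convert_to_tensor on a tf.IndexedSlices densifies it, filling
# rows that received no gradient with zeros.
import tensorflow as tf

values = tf.constant([[1., 1., 1.], [2., 2., 2.]])
indices = tf.constant([0, 2], dtype=tf.int64)
sparse_grad = tf.IndexedSlices(values, indices, tf.constant([4, 3], dtype=tf.int64))
dense_grad = tf.convert_to_tensor(sparse_grad)  # shape (4, 3); rows 1 and 3 are zeros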
def _decayed_lr(self, var_dtypes):
    """Override the decayed-learning-rate method to apply the schedule."""
    lr_t = super(NewOptimzer, self)._decayed_lr(var_dtypes)
    lr_rate = piecewise_linear(self.iterations, self.lr_schedule)
    return lr_t * K.cast(lr_rate, var_dtypes)
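# A numpy sketch of what a piecewise-linear schedule like the one above might
# compute, assuming the bert4keras-style convention of {step: factor} knots
# joined by straight lines (the exact toolkit4nlp semantics may differ):
import numpy as np

def piecewise_linear_np(step, schedule):
    keys = sorted(schedule)
    xs = [0] + keys
    ys = [0.0] + [schedule[k] for k in keys]
    return np.interp(step, xs, ys)  # flat after the last knot

print(piecewise_linear_np(500, {1000: 1.0, 10000: 0.1}))  # 0.5, mid-warmup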
def normal_shannon_entropy(p, labels_num=num_classes):
    # normalized entropy
    p = K.cast(p, K.floatx())
    norm = K.log(1. / labels_num)
    s = K.sum(p * K.log(p), axis=-1, keepdims=True)
    return s / norm
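# Quick numpy sanity check of the normalization: a uniform distribution has
# normalized entropy 1 (maximally uncertain), a near-one-hot one is close to 0.
import numpy as np

def normal_shannon_entropy_np(p, labels_num):
    return np.sum(p * np.log(p), axis=-1, keepdims=True) / np.log(1. / labels_num)

print(normal_shannon_entropy_np(np.full((1, 4), 0.25), 4))                 # [[1.]]
print(normal_shannon_entropy_np(np.array([[0.97, 0.01, 0.01, 0.01]]), 4))  # [[~0.12]]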
def call(self, inputs):
    clf, x_pre, x_next = inputs
    uncertain = normal_shannon_entropy(clf, num_classes)
    cond = K.greater(self.speed, uncertain)
    x = K.switch(cond, x_pre, x_next)
    return K.in_train_phase(x_next, x)
""" import json, os import numpy as np import tensorflow as tf from toolkit4nlp.backend import keras, K from toolkit4nlp.models import build_transformer_model from toolkit4nlp.tokenizers import Tokenizer from toolkit4nlp.optimizers import Adam, extend_with_gradient_accumulation, extend_with_weight_decay from toolkit4nlp.utils import pad_sequences, DataGenerator from toolkit4nlp.layers import Layer, Dense, Permute, Input, Layer, Lambda, Dropout from toolkit4nlp.layers import AttentionPooling1D, DGCNN, SinCosPositionEmbedding from toolkit4nlp.models import Model from tqdm import tqdm K.clear_session() # 基本信息 maxlen = 512 epochs = 5 batch_size = 4 learning_rate = 2e-5 # bert配置 config_path = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_config.json' checkpoint_path = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_model.ckpt' dict_path = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/vocab.txt' def load_data(filename): D = [] for d in json.load(open(filename))['data'][0]['paragraphs']:
def call(self, inputs, mask=None):
    if mask is not None:
        mask = K.cast(mask, K.floatx())
        mask = K.expand_dims(mask, 2)
        inputs = inputs - (1.0 - mask) * 1e12
    return K.softmax(inputs, 1)
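# Standalone illustration of the masking trick above: subtracting 1e12 from
# padded positions drives their softmax weight to ~0, so pooling ignores them.
import numpy as np

scores = np.array([2.0, 1.0, 3.0])
mask = np.array([1.0, 1.0, 0.0])  # last position is padding
scores = scores - (1.0 - mask) * 1e12
e = np.exp(scores - scores.max())
print((e / e.sum()).round(3))  # [0.731, 0.269, 0.], padding gets zero weight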
def call(self, inputs):
    maxlen = K.shape(inputs)[-1]
    token_emb = self.token_emb(inputs)
    pos = tf.range(start=0, limit=maxlen, delta=1)
    pos_emb = self.pos_emb(pos)
    return token_emb + pos_emb
def call(self, inputs, **kwargs):
    return K.bias_add(inputs, self.bias)
def compute_loss_of_classification(self, inputs, mask=None):
    _, _, y_pred, _, y_true = inputs
    return K.sparse_categorical_crossentropy(y_true, y_pred)
def compute_classification_acc(self, inputs, mask=None):
    _, _, y_pred, _, y_true = inputs
    equal = K.equal(K.cast(K.argmax(y_pred, axis=-1), 'int32'),
                    K.cast(y_true, 'int32'))
    return K.cast(equal, K.floatx()) / K.cast(K.shape(y_true)[0], K.floatx())
            if label > 0:
                if label % 3 == 1:
                    starting = True
                    entities.append([[i], id2label[(label - 1) // 3]])
                elif starting:
                    entities[-1][0].append(i)
                else:
                    starting = False
            else:
                starting = False

        return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
                for w, l in entities]


NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])


def evaluate(data):
    """Evaluation: micro precision/recall/F1 over predicted entity sets."""
    X, Y, Z = 1e-10, 1e-10, 1e-10
    for d in tqdm(data):
        text = ''.join([i[0] for i in d])
        R = set(NER.recognize(text))
        T = set([tuple(i) for i in d if i[1] != 'O'])
        X += len(R & T)
        Y += len(R)
        Z += len(T)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall
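# Worked example of the set-based micro-F1 above, with hypothetical entities:
R = {('北京', 'LOC'), ('张三', 'PER')}  # predicted
T = {('北京', 'LOC'), ('李四', 'PER')}  # gold
X, Y, Z = len(R & T), len(R), len(T)  # 1 hit, 2 predicted, 2 gold
print(2 * X / (Y + Z), X / Y, X / Z)  # f1=0.5, precision=0.5, recall=0.5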
def normal_noise(label, scale=0.1):
    # add Gaussian noise to create fake soften labels
    noise = np.random.normal(scale=scale, size=(num_classes,))
    new_label = label + noise
    new_label = K.softmax(new_label / Temperature).numpy()
    return new_label
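# How the Temperature division softens a distribution (made-up logits):
import numpy as np

def softmax_np(x):
    e = np.exp(x - x.max())
    return e / e.sum()

logits = np.array([4.0, 1.0, 0.5])
print(softmax_np(logits).round(3))        # [0.926, 0.046, 0.028], sharp
print(softmax_np(logits / 4.0).round(3))  # [0.529, 0.25, 0.221], softened (T=4)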
    epochs=5,
    callbacks=[teacher_evaluator]
)

# create soften labels
teacher_soften.load_weights('best_teacher.weights')
y_train_logits = []
y_train = []
for x, label in tqdm(train_generator):
    y_train_logits.append(teacher_logits.predict(x))
    y_train.append(label)

y_train_logits = np.concatenate(y_train_logits)
y_train = np.concatenate(y_train)
y_soften = K.softmax(y_train_logits / Temperature).numpy()
new_y_train = np.concatenate([y_train, y_soften], axis=-1)

# create a normal-noise fake soften-labels dataset
# new_data = [[d[0], d[1], normal_noise(d[1])] for d in train_data]
# student_data_generator = StudentDataGenerator(new_data, batch_size)

# create the new dataset
new_data = [[d[0], d[1], y_soften[i].tolist()] for i, d in enumerate(train_data)]
student_data_generator = StudentDataGenerator(new_data, batch_size)

# check soften-labels accuracy
if_correct = [np.array(d[1]).argmax() == np.array(d[2]).argmax() for d in new_data]
correct = [t for t in if_correct if t]
print('soften labels acc is: ', float(len(correct)) / len(if_correct))
def build_transformer_model_with_mlm():
    """BERT model with an MLM head."""
    bert = build_transformer_model(
        config_path,
        with_mlm='linear',
        # with_nsp=True,
        model='bert',
        return_keras_model=False,
        # keep_tokens=keep_tokens
    )
    proba = bert.model.output

    # auxiliary inputs
    token_ids = Input(shape=(None,), dtype='int64', name='token_ids')  # target ids
    is_masked = Input(shape=(None,), dtype=K.floatx(), name='is_masked')  # mask flags
    # nsp_label = Input(shape=(None,), dtype='int64', name='nsp')  # nsp

    def mlm_loss(inputs):
        """Computes the loss; must be wrapped as a layer."""
        y_true, y_pred, mask = inputs
        # _, y_pred = y_pred
        loss = K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
        loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
        return loss

    def nsp_loss(inputs):
        """Computes the NSP loss; must be wrapped as a layer."""
        y_true, y_pred = inputs
        # y_pred, _ = y_pred
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.mean(loss)
        return loss

    def mlm_acc(inputs):
        """Computes the accuracy; must be wrapped as a layer."""
        y_true, y_pred, mask = inputs
        # _, y_pred = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
        return acc

    def nsp_acc(inputs):
        """Computes the accuracy; must be wrapped as a layer."""
        y_true, y_pred = inputs
        y_pred, _ = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.mean(acc)
        return acc

    mlm_loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    mlm_acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])
    # nsp_loss = Lambda(nsp_loss, name='nsp_loss')([nsp_label, proba])
    # nsp_acc = Lambda(nsp_acc, name='nsp_acc')([nsp_label, proba])

    train_model = Model(bert.model.inputs + [token_ids, is_masked],
                        [mlm_loss, mlm_acc])

    loss = {
        'mlm_loss': lambda y_true, y_pred: y_pred,
        'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        # 'nsp_loss': lambda y_true, y_pred: y_pred,
        # 'nsp_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
    }

    return bert, train_model, loss
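# Hypothetical usage sketch (names follow the snippet above): the model's
# outputs already *are* the loss and the metric, so the identity losses make
# Keras minimize them directly, and the fit targets are just dummy arrays.
bert, train_model, loss = build_transformer_model_with_mlm()
train_model.compile(loss=loss, optimizer=Adam(learning_rate))
# train_model.fit(...) then feeds dummy zero targets for 'mlm_loss'/'mlm_acc'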
def compute_output_shape(self, input_shape):
    if self._mode == 'embedding':
        return super(Embedding, self).compute_output_shape(input_shape)
    return input_shape[:2] + (K.int_shape(self.embeddings)[0],)
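# The two modes this Embedding layer switches between, sketched in numpy with
# made-up sizes: id lookup vs. a tied projection back onto the vocabulary,
# which is why the non-embedding output shape above ends in vocab_size.
import numpy as np

embeddings = np.random.randn(100, 8)  # (vocab_size, hidden)
ids = np.array([[3, 7]])              # (batch, seq)
lookup = embeddings[ids]              # 'embedding' mode -> (1, 2, 8)
hidden = np.random.randn(1, 2, 8)
logits = hidden @ embeddings.T        # tied projection -> (1, 2, 100)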
        if self.lr_multiplier != 1:
            return self._kernel * self.lr_multiplier
        return self._kernel

    def call(self, inputs):
        return super(ScaleDense, self).call(inputs)


# load the pretrained model (12 layers)
predecessor = build_transformer_model(config_path=config_path,
                                      checkpoint_path=checkpoint_path,
                                      return_keras_model=False,
                                      prefix='Predecessor-')

# classifier model
x_in = Input(shape=K.int_shape(predecessor.output)[1:])
x = Lambda(lambda x: x[:, 0])(x_in)
x = Dense(units=num_classes, activation='softmax')(x)
classifier = Model(x_in, x)

predecessor_model = Model(predecessor.inputs, classifier(predecessor.output))
predecessor_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
predecessor_model.summary()

# predecessor_model_3
output = predecessor_model.layers[31].output  # output of the 3rd transformer layer
output = Lambda(lambda x: x[:, 0])(output)
def call(self, inputs):
    relative_position_idx = self.compute_position_idx(inputs)
    return K.gather(self.embeddings, relative_position_idx)
def call(self, x):
    seq, vec = x
    vec = K.expand_dims(vec, 1)
    vec = K.tile(vec, [1, K.shape(seq)[1], 1])
    return K.concatenate([seq, vec], 2)
def call(self, inputs):
    # PE_2i(p)   = sin(p / 10000^(2i/d_pos))
    # PE_2i+1(p) = cos(p / 10000^(2i/d_pos))
    batch_size, seq_len, word_emb_dim = K.shape(inputs)[0], K.shape(inputs)[1], K.shape(inputs)[2]
    if not self.embedding_dim or self.method == 'add':
        self.embedding_dim = word_emb_dim
    t = 2 * K.arange(self.embedding_dim / 2, dtype='float32') / K.cast(self.embedding_dim, dtype='float32')
    embedding_wise_pos = 1. / K.pow(10000., t)  # 1 / 10000^(2i/d_pos), shape (p_dim/2,)
    embedding_wise_pos = K.expand_dims(embedding_wise_pos, 0)  # (1, p_dim/2)
    word_wise_pos = K.cumsum(K.ones_like(inputs[:, :, 0]), axis=1)  # (batch_size, seq_len)
    word_wise_pos = K.expand_dims(word_wise_pos, 2)  # (batch_size, seq_len, 1)
    position_embedding = K.dot(word_wise_pos, embedding_wise_pos)  # (batch_size, seq_len, p_dim/2)
    position_embedding = K.expand_dims(position_embedding, 3)
    position_embedding = K.reshape(
        K.concatenate([K.sin(position_embedding), K.cos(position_embedding)], axis=-1),
        shape=(batch_size, seq_len, -1))
    if self.method == 'add':
        return inputs + position_embedding
    return K.concatenate([inputs, position_embedding], axis=-1)
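# numpy reference for the PE formulas in the comments above; positions start at
# 1 because the layer builds them with a cumulative sum of ones, and sin/cos are
# interleaved along the embedding axis by the concatenate-then-reshape.
import numpy as np

def sincos_position_embedding(seq_len, dim):
    pos = np.arange(1, seq_len + 1)[:, None].astype('float64')  # (seq_len, 1)
    i = np.arange(dim // 2)[None, :]                            # (1, dim/2)
    angle = pos / np.power(10000., 2. * i / dim)                # (seq_len, dim/2)
    return np.stack([np.sin(angle), np.cos(angle)], -1).reshape(seq_len, dim)

print(sincos_position_embedding(4, 6).shape)  # (4, 6)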
train_generator = data_generator(data=train_data, batch_size=batch_size)
valid_generator = data_generator(data=valid_data, batch_size=batch_size)
train_transfer_generator = data_generator(data=train_data,
                                          batch_size=batch_size,
                                          transfer=True,
                                          data_augmentation=True)

# load the pretrained model (3 layers)
teacher = build_transformer_model(config_path=config_path,
                                  checkpoint_path=checkpoint_path,
                                  return_keras_model=False,
                                  num_hidden_layers=num_hidden_layers,
                                  model='bert')

# classifier model
x_in = Input(shape=K.int_shape(teacher.output)[1:])
x = Lambda(lambda x: x[:, 0])(x_in)
x = Dense(units=num_classes, activation='softmax')(x)
classifier = Model(x_in, x)

teacher_model = Model(teacher.inputs, classifier(teacher.output))
teacher_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(2e-5),  # use a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
teacher_model.summary()


class FastbertClassifierLayer(Layer):
def call(self, inputs, mask=None, a_mask=None, position_bias=None):
    """Multi-head attention.
    :param inputs: [q, k, v, a_mask, position_bias]
    :param mask: [q_mask, v_mask]; q_mask masks padding in the query sequence,
        v_mask masks positions of the value sequence that must not be attended
        to, such as padding
    :param a_mask: Boolean, whether to apply an attention mask
    :param position_bias: type of position bias; offsets the attention with the
        specified kind of positional encoding
    :return:
    """
    q, k, v = inputs[:3]
    q_mask, v_mask, idx = None, None, 3
    if mask is not None:
        if mask[0] is not None:
            q_mask = K.cast(mask[0], K.floatx())
        if mask[2] is not None:
            v_mask = K.cast(mask[2], K.floatx())
    if a_mask is not None:
        a_mask = inputs[idx]
        idx += 1
    # linear projections
    qw = self.q_dense(q)
    kw = self.k_dense(k)
    vw = self.v_dense(v)
    # reshape to (batch, seq_len, heads, size_per_head)
    qw = K.reshape(qw, [-1, K.shape(q)[1], self.head_nums, self.key_size])
    kw = K.reshape(kw, [-1, K.shape(k)[1], self.head_nums, self.key_size])
    vw = K.reshape(vw, [-1, K.shape(v)[1], self.head_nums, self.head_size])
    # attention scores
    att = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
    # positional encoding applied to the scores
    if position_bias == 'relative':
        position_embeddings = inputs[idx]
        att = att + tf.einsum('bjhd,jkd->bhjk', qw, position_embeddings)
    if self.attention_scale:
        att = att / self.key_size**0.5
    # value mask
    att = sequence_masking(att, v_mask, 'add', -1)
    # attention mask
    if a_mask is not None:
        att = att - (1 - a_mask) * 1e12
    att = K.softmax(att)
    output = tf.einsum('bhjk,bkhd->bjhd', att, vw)
    # positional encoding applied on the output side
    if position_bias == 'relative':
        output = output + tf.einsum('bhjk,jkd->bjhd', att, position_embeddings)
    output = K.reshape(output, (-1, K.shape(output)[1], self.output_dim))
    output = self.combine_dense(output)
    # query mask
    output = sequence_masking(output, q_mask, 'mul')
    return output
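# Shape sanity check for the two attention einsums above, with made-up sizes
# (batch=2, query len=5, key len=7, heads=4, size per head=8):
import numpy as np

qw = np.random.randn(2, 5, 4, 8)
kw = np.random.randn(2, 7, 4, 8)
vw = np.random.randn(2, 7, 4, 8)
att = np.einsum('bjhd,bkhd->bhjk', qw, kw)   # (2, 4, 5, 7): per-head scores
out = np.einsum('bhjk,bkhd->bjhd', att, vw)  # (2, 5, 4, 8): weighted values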
def __init__(self, speed=0.1, *args, **kwargs):
    super(SwitchTwo, self).__init__(*args, **kwargs)
    self.supports_masking = True
    self.speed = K.constant(speed, dtype=K.floatx())
def call(self, inputs, mask=None):
    # only computes the loss; the inputs pass through unchanged
    if mask is not None:
        mask = K.cast(mask, K.floatx())
    return sequence_masking(inputs, mask, 1, 1)
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.learning_rate
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_' + str(i))
          for (i, p) in enumerate(params)]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_' + str(i))
          for (i, p) in enumerate(params)]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='vhat_' + str(i))
                 for (i, p) in enumerate(params)]
    else:
        vhats = [K.zeros(1, name='vhat_' + str(i)) for i in range(len(params))]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g - m_t)
        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # apply constraints
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
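# Note the second-moment line above: it uses K.square(g - m_t), the gradient's
# deviation from its running mean, rather than vanilla Adam's K.square(g) --
# an AdaBelief-style update. Tiny numeric illustration of the difference:
beta_1, beta_2 = 0.9, 0.999
m, v, g = 0.0, 0.0, 1.0
m_t = beta_1 * m + (1 - beta_1) * g                    # 0.1
v_adam = beta_2 * v + (1 - beta_2) * g ** 2            # 0.001
v_belief = beta_2 * v + (1 - beta_2) * (g - m_t) ** 2  # 0.00081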
def dense_accuracy(self, y_true, y_pred):
    """Per-token accuracy shown during training, with masked positions excluded.
    Here y_true must be one-hot.
    """
    y_true = K.argmax(y_true, 2)
    return self.sparse_accuracy(y_true, y_pred)
def new_update(x, new_x):
    new_x = K.switch(cond, new_x, x)
    return old_update(x, new_x)
def sparse_accuracy(self, y_true, y_pred):
    """Per-token accuracy shown during training, with masked positions excluded.
    Here y_true must be integer ids (not one-hot).
    """
    # derive the mask and cast its dtype
    mask = K.all(K.greater(y_pred, -1e6), axis=2)
    mask = K.cast(mask, K.floatx())
    # re-assert the shape and dtype of y_true
    y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
    y_true = K.cast(y_true, 'int32')
    # per-label argmax as a rough measure of training progress
    y_pred = K.cast(K.argmax(y_pred, 2), 'int32')
    isequal = K.cast(K.equal(y_true, y_pred), K.floatx())
    return K.sum(isequal * mask) / K.sum(mask)
        mapping = tokenizer.rematch(data, tokens)
        token_ids = tokenizer.tokens_to_ids(tokens)
        segs = [0] * len(token_ids)
        pre = model.predict([[token_ids], [segs]])[0]
        labels = self.decode(pre)
        words = []
        for i, label in enumerate(labels[1:-1]):
            if label < 2 or len(words) == 0:
                words.append([i + 1])
            else:
                words[-1].append(i + 1)
        return [data[mapping[w[0]][0]:mapping[w[-1]][-1] + 1] for w in words]


wordseg = WordSeg(trans=K.eval(CRF.trans), starts=[0], ends=[0])


def evaluate(data):
    """Simple evaluation: word-set recall against the gold segmentation."""
    total, right = 1e-10, 1e-10
    for true in tqdm(data):
        pre = wordseg.segment(''.join(true))
        w_pre = set(pre)
        w_true = set(true)
        total += len(w_true)
        right += len(w_pre & w_true)
    return right / total
def call(self, inputs):
    source, target = inputs
    # convex combination during training: proportion * source + (1 - proportion) * target;
    # at inference, pass the target branch through unscaled
    output = source * self.proportion + target * (1 - self.proportion)
    return K.in_train_phase(output, target)