def bert_builder():
    bert = build_transformer_model(
        config_path=config_path,
        checkpoint_path=checkpoint_path,
        model='bert',
        with_mlm=True,
    )
    return bert
    np.random.shuffle(sentences)
    return '。'.join(sentences)


# build the datasets
train_generator = data_generator(data=train_data, batch_size=batch_size)
valid_generator = data_generator(data=valid_data, batch_size=batch_size)
train_transfer_generator = data_generator(data=train_data,
                                          batch_size=batch_size,
                                          transfer=True,
                                          data_augmentation=True)

# load the pretrained model (3 layers)
teacher = build_transformer_model(config_path=config_path,
                                  checkpoint_path=checkpoint_path,
                                  return_keras_model=False,
                                  num_hidden_layers=num_hidden_layers,
                                  model='bert')

# classifier head
x_in = Input(shape=K.int_shape(teacher.output)[1:])
x = Lambda(lambda x: x[:, 0])(x_in)
x = Dense(units=num_classes, activation='softmax')(x)
classifier = Model(x_in, x)

teacher_model = Model(teacher.inputs, classifier(teacher.output))
teacher_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(2e-5),  # use a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
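# The two lines at the top of this excerpt are the tail of the
# sentence-shuffling augmentation used by train_transfer_generator
# (data_augmentation=True). A self-contained sketch of that transform;
# the wrapping function's name here is assumed:
import numpy as np

def shuffle_sentences(text):
    sentences = text.split('。')
    np.random.shuffle(sentences)
    return '。'.join(sentences)

print(shuffle_sentences(u'天气很好。我们去公园。然后回家'))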
from toolkit4nlp.tokenizers import Tokenizer
from toolkit4nlp.utils import AutoRegressiveDecoder

config_path = 'D:/pretrain/GPT_LCCC-base-tf/gpt_config.json'
checkpoint_path = 'D:/pretrain/GPT_LCCC-base-tf/gpt_model.ckpt'
dict_path = 'D:/pretrain/GPT_LCCC-base-tf/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)
speakers = [
    tokenizer.token_to_id('[speaker1]'),
    tokenizer.token_to_id('[speaker2]')
]

model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='gpt',
)
model.summary()


class ChatBot(AutoRegressiveDecoder):
    """Generate dialogue by random sampling."""
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states, rtype='probas'):
        token_ids, segment_ids = inputs
        cur_segment_ids = np.zeros_like(output_ids) + token_ids[0, -1]  # which speaker
# warmup and global learning-rate decay
lr_schedule = {
    num_warmup_steps * grad_accum_steps: 1.0,
    num_train_steps * grad_accum_steps: 0.0,
}

# load dataset
dataset = TrainingDataSetRoBERTa.load_tfrecord(record_names=file_names,
                                               seq_length=seq_length,
                                               batch_size=batch_size)

# Train RoBERTa-style: drop the NSP task and keep only the masked language
# model task.

# build model
bert = build_transformer_model(config, ckpt, with_mlm='linear',
                               return_keras_model=False)
proba = bert.model.output

# auxiliary inputs
token_ids = Input(shape=(None, ), dtype='int64', name='token_ids')  # target ids
is_masked = Input(shape=(None, ), dtype=K.floatx(), name='is_masked')  # mask flags


def mlm_loss(inputs):
    """Loss computation; must be wrapped in a layer."""
    y_true, y_pred, mask = inputs
    loss = K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
    loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
    return loss
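# Standalone NumPy sketch of what mlm_loss computes: per-token cross entropy,
# summed over the masked positions only and normalized by the mask count.
import numpy as np

per_token_ce = np.array([2.1, 0.3, 1.7, 0.9])  # sparse CE at each position
mask = np.array([1., 0., 1., 0.])              # is_masked flags
print((per_token_ce * mask).sum() / (mask.sum() + 1e-7))  # -> (2.1 + 1.7) / 2 = 1.9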
        token_ids.append(tokenizer._token_end_id)
        labels.append(0)
        batch_tokens.append(token_ids)
        batch_segs.append([0] * len(token_ids))
        batch_labels.append(labels)
        if len(batch_tokens) >= self.batch_size or is_end:
            batch_tokens = pad_sequences(batch_tokens)
            batch_segs = pad_sequences(batch_segs)
            batch_labels = pad_sequences(batch_labels)
            yield [batch_tokens, batch_segs], batch_labels
            batch_tokens, batch_segs, batch_labels = [], [], []


model = build_transformer_model(config_path=bert_config,
                                checkpoint_path=bert_checkpoint)

output_layer = 'Transformer-%s-FeedForward-Norm' % (bert_layers - 1)
output = model.get_layer(output_layer).output
output = Dense(num_labels)(output)
CRF = ConditionalRandomField(lr_multiplier)
output = CRF(output)

model = Model(model.input, output)
model.summary()


class WordSeg(ViterbiDecoder):
    def segment(self, data):
        tokens = tokenizer.tokenize(data)
        while len(tokens) > 512:
            tokens.pop(-2)  # drop tokens before the final [SEP] to fit BERT's 512 limit
        mapping = tokenizer.rematch(data, tokens)
    for index in range(successor.num_hidden_layers):
        predecessor_outputs = outputs
        for sub_index in range(layers_per_module):
            predecessor_outputs = predecessor.apply_attention_layers(
                predecessor_outputs, layers_per_module * index + sub_index)
        successor_outputs = successor.apply_attention_layers(outputs, index)
        outputs = ProportionalAdd()([predecessor_outputs, successor_outputs])

    # return the model
    outputs = classifier(outputs)
    model = Model(inputs, outputs)
    return model


# load the pretrained model (12 layers)
predecessor = build_transformer_model(config_path=config_path,
                                      checkpoint_path=checkpoint_path,
                                      return_keras_model=False,
                                      prefix='Predecessor-')

# load the pretrained model (3 layers)
successor = build_transformer_model(config_path=config_path,
                                    checkpoint_path=checkpoint_path,
                                    return_keras_model=False,
                                    num_hidden_layers=3,
                                    prefix='Successor-')

# classifier head
x_in = Input(shape=K.int_shape(predecessor.output)[1:])
x = Dense(num_labels)(x_in)
CRF = ConditionalRandomField(lr_multiplier=2)
x = CRF(x)
classifier = Model(x_in, x)
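# ProportionalAdd is not defined in this excerpt. In BERT-of-Theseus the
# predecessor/successor module outputs are mixed by a Bernoulli gate during
# training (modules are randomly "replaced"); the sketch below assumes that
# behavior and is illustrative only, not the library's implementation.
import numpy as np

def proportional_add(pred_out, succ_out, replace_rate=0.5, training=True):
    if training:
        gate = np.random.binomial(1, replace_rate)   # 1 -> use the successor module
        return gate * succ_out + (1 - gate) * pred_out
    return succ_out                                  # inference: successor path only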
        idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
        labels = K.equal(idxs_1, idxs_2)
        labels = K.cast(labels, K.floatx())
        return labels

    def compute_kld(self, inputs, alpha=4, mask=None):
        _, _, _, y_pred = inputs
        loss = kld(y_pred[::2], y_pred[1::2]) + kld(y_pred[1::2], y_pred[::2])
        loss = K.mean(loss) / 4 * alpha
        self.add_metric(loss, 'kld')
        return loss


bert = build_transformer_model(checkpoint_path=checkpoint_path,
                               config_path=config_path,
                               keep_tokens=keep_tokens,
                               dropout_rate=0.3,
                               )

label_inputs = Input(shape=(None,), name='label_inputs')
pooler = Lambda(lambda x: x[:, 0])(bert.output)
x = Dense(units=num_classes, activation='softmax', name='classifier')(pooler)
output = TotalLoss(4)(bert.inputs + [label_inputs, pooler, x])

model = Model(bert.inputs + [label_inputs], output)
classifier = Model(bert.inputs, x)
model.compile(optimizer=Adam(2e-5), metrics=['acc'])
model.summary()
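# compute_kld above is the R-Drop regularizer: each example appears twice in
# the batch (two different dropout masks), and the symmetric KL divergence
# between the paired predictions at even/odd positions is added to the loss.
# A NumPy sketch of that pairing:
import numpy as np

def kl(p, q, eps=1e-7):
    return (p * (np.log(p + eps) - np.log(q + eps))).sum(axis=-1)

y_pred = np.array([[.7, .3], [.6, .4],    # example 1, two forward passes
                   [.2, .8], [.3, .7]])   # example 2, two forward passes
alpha = 4
print((kl(y_pred[::2], y_pred[1::2]) +
       kl(y_pred[1::2], y_pred[::2])).mean() / 4 * alpha)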
            return self._bias * self.lr_multiplier
        return self._bias

    @property
    def kernel(self):
        if self.lr_multiplier != 1:
            return self._kernel * self.lr_multiplier
        return self._kernel

    def call(self, inputs):
        return super(ScaleDense, self).call(inputs)


# load the pretrained model (12 layers)
predecessor = build_transformer_model(config_path=config_path,
                                      checkpoint_path=checkpoint_path,
                                      return_keras_model=False,
                                      prefix='Predecessor-')

# classifier head
x_in = Input(shape=K.int_shape(predecessor.output)[1:])
x = Lambda(lambda x: x[:, 0])(x_in)
x = Dense(units=num_classes, activation='softmax')(x)
classifier = Model(x_in, x)

predecessor_model = Model(predecessor.inputs, classifier(predecessor.output))
predecessor_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)
predecessor_model.summary()
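# Why scaling the stored weights acts as a per-layer LR multiplier: the layer
# computes with w_eff = m * w, so grad_w = m * grad_w_eff. With an Adam-style
# (scale-invariant) optimizer the step on w is ~lr regardless of gradient
# scale, so w_eff moves by ~m * lr; with plain SGD the factor is m**2. Toy
# numbers for the SGD case:
m, w, grad_w_eff, lr = 2.0, 1.0, 0.5, 0.1
w -= lr * (m * grad_w_eff)       # SGD step on the stored weight
print(m * w)                     # w_eff moved by m**2 * lr * grad_w_eff = 0.2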
"""在序列长度那一维进行softmax,并mask掉padding部分 """ def compute_mask(self, inputs, mask=None): return None def call(self, inputs, mask=None): if mask is not None: mask = K.cast(mask, K.floatx()) mask = K.expand_dims(mask, 2) inputs = inputs - (1.0 - mask) * 1e12 return K.softmax(inputs, 1) # build model model = build_transformer_model( config_path, checkpoint_path, ) inputs = [ Input(shape=K.int_shape(model.inputs[0])[1:]), Input(shape=K.int_shape(model.inputs[1])[1:]) ] output = model(inputs) output = SinCosPositionEmbedding(K.int_shape(output)[-1])(output) output = Dropout(0.5)(output) output = Dense(384, activation='tanh')(output) att = AttentionPooling1D(name='attention_pooling_1')(output) output = ConcatSeq2Vec()([output, att])
    def compute_loss_of_classification(self, inputs, mask=None):
        _, _, y_pred, _, y_true = inputs
        return K.sparse_categorical_crossentropy(y_true, y_pred)

    def compute_classification_acc(self, inputs, mask=None):
        _, _, y_pred, _, y_true = inputs
        equal = K.equal(K.cast(K.argmax(y_pred, axis=-1), 'int32'),
                        K.cast(y_true, 'int32'))
        return K.cast(equal, K.floatx()) / K.cast(
            K.shape(y_true)[0], K.floatx())


bert = build_transformer_model(checkpoint_path=checkpoint_path,
                               config_path=config_path,
                               with_pool='linear',
                               application='unilm',
                               keep_tokens=keep_tokens,
                               return_keras_model=False)

label_inputs = Input(shape=(None, ), name='label_inputs')
pooler = bert.model.outputs[0]
classification_output = Dense(units=num_classes,
                              activation='softmax',
                              name='classifier')(pooler)
classifier = Model(bert.model.inputs, classification_output)

seq2seq = Model(bert.model.inputs, bert.model.outputs[1])

outputs = TotalLoss([2])(bert.model.inputs + bert.model.outputs)
# outputs = Dense(num_classes, activation='softmax')(outputs)
train_model = Model(bert.model.inputs, [classification_output, outputs])
def build_transformer_model_with_mlm():
    """BERT model with an MLM head."""
    bert = build_transformer_model(
        config_path,
        with_mlm='linear',
        # with_nsp=True,
        model='bert',
        return_keras_model=False,
        # keep_tokens=keep_tokens
    )
    proba = bert.model.output
    # print(proba)

    # auxiliary inputs
    token_ids = Input(shape=(None, ), dtype='int64', name='token_ids')  # target ids
    is_masked = Input(shape=(None, ), dtype=K.floatx(), name='is_masked')  # mask flags
    # nsp_label = Input(shape=(None, ), dtype='int64', name='nsp')  # nsp

    def mlm_loss(inputs):
        """Loss computation; must be wrapped in a layer."""
        y_true, y_pred, mask = inputs
        # _, y_pred = y_pred
        loss = K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
        loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
        return loss

    def nsp_loss(inputs):
        """NSP loss computation; must be wrapped in a layer."""
        y_true, y_pred = inputs
        # y_pred, _ = y_pred
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.mean(loss)
        return loss

    def mlm_acc(inputs):
        """Accuracy computation; must be wrapped in a layer."""
        y_true, y_pred, mask = inputs
        # _, y_pred = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
        return acc

    def nsp_acc(inputs):
        """NSP accuracy computation; must be wrapped in a layer."""
        y_true, y_pred = inputs
        # y_pred, _ = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.mean(acc)
        return acc

    mlm_loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    mlm_acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])
    # nsp_loss = Lambda(nsp_loss, name='nsp_loss')([nsp_label, proba])
    # nsp_acc = Lambda(nsp_acc, name='nsp_acc')([nsp_label, proba])

    train_model = Model(bert.model.inputs + [token_ids, is_masked],
                        [mlm_loss, mlm_acc])

    loss = {
        'mlm_loss': lambda y_true, y_pred: y_pred,
        'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        # 'nsp_loss': lambda y_true, y_pred: y_pred,
        # 'nsp_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
    }

    return bert, train_model, loss
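# Note the compile trick: train_model's outputs are the loss and metric
# tensors themselves, so each entry in `loss` simply returns y_pred (with
# stop_gradient on the accuracy so it is logged but not optimized), and the
# fit() targets are ignored placeholders. A hedged usage sketch:
bert, train_model, loss = build_transformer_model_with_mlm()
train_model.compile(loss=loss, optimizer='adam')
# train_model.fit(...) then feeds dummy targets; the real training signal
# is the model's own mlm_loss output tensor.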
        self.valid_model = self.valid_model or self.model
        val_acc = evaluate(valid_generator, self.valid_model)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            self.model.save_weights(self.savename)
        print(
            u'val_acc: %.5f, best_val_acc: %.5f\n' %
            (val_acc, self.best_val_acc)
        )


# teacher model (12 layers)
teacher = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    return_keras_model=True,
    num_hidden_layers=12,
    prefix='Teacher-'
)

output = Lambda(lambda x: x[:, 0])(teacher.output)
logits = Dense(num_classes)(output)
soften = Activation(activation='softmax')(logits)

teacher_logits = Model(teacher.inputs, logits)
teacher_soften = Model(teacher.inputs, soften)
teacher_soften.compile(loss='categorical_crossentropy',
                       optimizer=Adam(2e-5),
                       metrics=['acc'])
teacher_soften.summary()


class StudentDataGenerator(DataGenerator):
    """Data generator."""
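# The teacher exposes both raw logits and a softmax head. A common
# distillation recipe (a sketch only; the temperature T and the student side
# are assumptions, not shown in this excerpt) trains the student to match
# temperature-softened teacher probabilities:
import numpy as np

def soften_probs(logits, T=2.0):
    z = logits / T
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

print(soften_probs(np.array([[2.0, 0.5, -1.0]])))  # flatter than plain softmax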
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = pad_sequences(batch_token_ids)
                batch_segment_ids = pad_sequences(batch_segment_ids)
                batch_labels = pad_sequences(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []


train_generator = data_generator(data=train_data, batch_size=batch_size)
val_generator = data_generator(valid_data, batch_size)

# build model
bert = build_transformer_model(config_path=config_path,
                               checkpoint_path=checkpoint_path,
                               num_hidden_layers=num_hidden_layers)

output = Lambda(lambda x: x[:, 0])(bert.output)
output = Dense(num_classes, activation='softmax')(output)
model = Model(bert.inputs, output)
model.summary()

model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(lr),
              metrics=['acc'])


def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in tqdm(data):
# -*- coding: utf-8 -*-
# @Date    : 2020/7/16
# @Author  : mingming.xu
# @Email   : [email protected]
# @File    : mask_language_model.py
import numpy as np

from toolkit4nlp.tokenizers import Tokenizer
from toolkit4nlp.models import build_transformer_model

config = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_config.json'
ckpt = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_model.ckpt'
vocab = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(vocab, do_lower_case=True)
model = build_transformer_model(config, checkpoint_path=ckpt, with_mlm=True)

# tokens, segs = tokenizer.encode('北京网聘技术有限公司')
tokens, segs = tokenizer.encode('科学技术是第一生产力')
tokens[3] = tokens[4] = tokenizer._token_dict['[MASK]']

prob = model.predict([np.array([tokens]), np.array([segs])])[0]
print(tokenizer.decode(np.argmax(prob[3:5], axis=1)))
'''
The correct result should be: 技术
'''
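# Optional sanity check: inspect the top-5 candidates at each masked position.
# Assumes a bert4keras-style Tokenizer.decode that accepts a list of ids.
top5 = np.argsort(prob[3:5], axis=1)[:, ::-1][:, :5]
for pos, ids in zip([3, 4], top5):
    print(pos, [tokenizer.decode([i]) for i in ids])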
# -*- coding: utf-8 -*-
# @Date    : 2020/7/15
# @Author  : mingming.xu
# @Email   : [email protected]
# @File    : extract_feature.py
from toolkit4nlp.models import build_transformer_model
from toolkit4nlp.tokenizers import Tokenizer
import numpy as np

config = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_config.json'
ckpt = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_model.ckpt'
vocab = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(vocab, do_lower_case=True)
model = build_transformer_model(config, checkpoint_path=ckpt)

token_ids, segment_ids = tokenizer.encode(u'我爱你中国')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
'''
[[[-0.00827767  0.52711666 -0.2616654  ...  0.7717162   0.6682844
   -0.3481327 ]
  [ 0.3665638   0.35970846  0.0772187  ... -0.5211092  -0.46724823
    0.07845997]
  [ 0.6985213  -0.04391993 -1.3160559  ...  1.061864    0.8293197
    0.07258661]
  ...
  [ 0.25169933  0.3048255  -1.2513847  ...  0.5438095   0.46753633
   -0.61883307]
  [ 0.07904327 -0.08373377 -0.3963912  ...  0.29524678  0.74877214
# loss layer: shift predictions by one position and mask out segment 1
class CrossEntropy(Loss):
    def compute_loss(self, inputs, mask=None):
        y_true, y_mask, y_pred = inputs
        y_true = y_true[:, 1:]  # target token_ids
        y_mask = y_mask[:, 1:]  # segment_ids, which exactly mark the part to predict
        y_pred = y_pred[:, :-1]  # predicted sequence, shifted by one position
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.sum(loss * y_mask) / K.sum(y_mask)
        return loss


# build model
model = build_transformer_model(config_path,
                                checkpoint_path,
                                application='unilm',
                                keep_tokens=keep_tokens)
model.summary()

# train model
o_inputs = Input(shape=(None, ))
train_model = Model(model.inputs + [o_inputs], model.outputs + [o_inputs])

y_true = train_model.inputs[2][:, 1:]
y_mask = train_model.inputs[1][:, 1:]
y_pred = train_model.outputs[0][:, :-1]
cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

train_model.add_loss(cross_entropy)
train_model.compile(Adam(1e-5))
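# The one-position shift in NumPy: position i of the prediction is trained to
# emit token i + 1, and segment_ids (1 on the target segment) double as the
# loss mask, so only the second segment contributes.
import numpy as np

token_ids   = np.array([101, 11, 12, 102, 21, 22, 102])  # [CLS] src [SEP] tgt [SEP]
segment_ids = np.array([0,   0,  0,  0,   1,  1,  1])
y_true = token_ids[1:]                  # next-token targets
y_mask = segment_ids[1:]                # 1 exactly on the tokens to predict
print(y_true * y_mask)                  # loss covers [21, 22, 102] only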
def build_transformer_model_with_mlm(version='pre'):
    """BERT model with an MLM head."""
    assert version in ['pre', 'post', 'rezero']
    if version == 'rezero':
        attention_name = 'Transformer-%d-MultiHeadSelfAttention'
        feed_forward_name = 'Transformer-%d-FeedForward'
        skip_weights = []
        for i in range(12):
            skip_weights.append(feed_forward_name % i + '-Norm')
            skip_weights.append(feed_forward_name % i + '-ReWeight')
            skip_weights.append(attention_name % i + '-Norm')
            skip_weights.append(attention_name % i + '-ReWeight')

        bert = build_transformer_model(
            config_path,
            with_mlm='linear',
            model='rezero',
            return_keras_model=False,
            skip_weights_from_checkpoints=skip_weights,
            use_layernorm=None,
            reweight_trainable=True,
            init_reweight=0.,
        )
    else:
        bert = build_transformer_model(
            config_path,
            with_mlm='linear',
            model='rezero',
            return_keras_model=False,
            # skip_weights_from_checkpoints=skip_weights,
            use_layernorm=version,
            reweight_trainable=False,
            init_reweight=1.,
        )

    proba = bert.model.output
    # print(proba)

    # auxiliary inputs
    token_ids = Input(shape=(None, ), dtype='int64', name='token_ids')  # target ids
    is_masked = Input(shape=(None, ), dtype=K.floatx(), name='is_masked')  # mask flags
    # nsp_label = Input(shape=(None, ), dtype='int64', name='nsp')  # nsp

    def mlm_loss(inputs):
        """Loss computation; must be wrapped in a layer."""
        y_true, y_pred, mask = inputs
        # _, y_pred = y_pred
        loss = K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
        loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
        return loss

    def nsp_loss(inputs):
        """NSP loss computation; must be wrapped in a layer."""
        y_true, y_pred = inputs
        # y_pred, _ = y_pred
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.mean(loss)
        return loss

    def mlm_acc(inputs):
        """Accuracy computation; must be wrapped in a layer."""
        y_true, y_pred, mask = inputs
        # _, y_pred = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
        return acc

    def nsp_acc(inputs):
        """NSP accuracy computation; must be wrapped in a layer."""
        y_true, y_pred = inputs
        # y_pred, _ = y_pred
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.mean(acc)
        return acc

    mlm_loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    mlm_acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])
    # nsp_loss = Lambda(nsp_loss, name='nsp_loss')([nsp_label, proba])
    # nsp_acc = Lambda(nsp_acc, name='nsp_acc')([nsp_label, proba])

    train_model = Model(bert.model.inputs + [token_ids, is_masked],
                        [mlm_loss, mlm_acc])

    loss = {
        'mlm_loss': lambda y_true, y_pred: y_pred,
        'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        # 'nsp_loss': lambda y_true, y_pred: y_pred,
        # 'nsp_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
    }

    return bert, train_model, loss
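# The rezero branch in miniature: y = x + alpha * f(x) with the trainable
# scalar alpha initialized to 0 (init_reweight=0. above), so every block
# starts as the identity map and LayerNorm can be dropped (use_layernorm=None).
import numpy as np

alpha = 0.0                       # the '-ReWeight' scalar, learned during training
x = np.array([1.0, 2.0])
f_x = np.array([0.5, -0.5])       # stand-in for the attention / FFN sublayer
print(x + alpha * f_x)            # == x at initialization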
"""交叉熵作为loss,并mask掉输入部分 """ def compute_loss(self, inputs, mask=None): y_true, y_pred = inputs y_mask = K.cast(K.not_equal(y_true, 0), K.floatx()) accuracy = keras.metrics.sparse_categorical_accuracy(y_true, y_pred) accuracy = K.sum(accuracy * y_mask) / K.sum(y_mask) self.add_metric(accuracy, name='accuracy') loss = K.sparse_categorical_crossentropy(y_true, y_pred) loss = K.sum(loss * y_mask) / K.sum(y_mask) return loss model = build_transformer_model( config_path=config_path, checkpoint_path=checkpoint_path, with_mlm=True, # model='bert', # 加载bert/Roberta/ernie model='nezha') target_in = Input(shape=(None, )) output = CrossEntropy(1)([target_in, model.output]) train_model = Model(model.inputs + [target_in], output) AdamW = extend_with_weight_decay(Adam) AdamWG = extend_with_gradient_accumulation(AdamW) opt = AdamWG(learning_rate=1e-5, exclude_from_weight_decay=['Norm', 'bias'], grad_accum_steps=4) train_model.compile(opt)