def __init__(self, config_path, checkpoint_path, hidden_size, num_classes,
             ls_e=0.1, model_type='bert'):

    def ls_loss(y_true, y_pred, e=ls_e):
        loss1 = K.categorical_crossentropy(y_true, y_pred)
        loss2 = K.categorical_crossentropy(
            K.ones_like(y_pred) / num_classes, y_pred)
        return (1 - e) * loss1 + e * loss2

    self.num_classes = num_classes
    bert = build_transformer_model(config_path=config_path,
                                   checkpoint_path=checkpoint_path,
                                   model=model_type,
                                   return_keras_model=False)
    text_emb = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
    text_emb = Dense(hidden_size, activation='tanh')(text_emb)
    output = Dense(num_classes, activation='softmax')(text_emb)
    self.model = Model(bert.model.input, output)

    AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')
    self.model.compile(loss=ls_loss,
                       optimizer=AdamLR(learning_rate=1e-4,
                                        lr_schedule={
                                            1000: 1,
                                            2000: 0.1
                                        }))

def taggerRewriterModel(model_name, config_path, checkpoint_path,
                        num_classes=5, learning_rate=3e-5):
    # Extra label inputs
    start_labels = Input(shape=(1, ), name='Start-Labels')
    end_labels = Input(shape=(1, ), name='End-Labels')
    insert_pos_labels = Input(shape=(1, ), name='Insert-Pos-Labels')
    start_ner_labels = Input(shape=(1, ), name='Start-NER-Labels')
    end_ner_labels = Input(shape=(1, ), name='End-NER-Labels')

    # Load the pretrained model
    bert = build_transformer_model(
        config_path=config_path,
        checkpoint_path=checkpoint_path,
        model='albert',
        return_keras_model=False,
    )

    output = bert.model.output
    output = Dense(units=num_classes,
                   activation='linear',
                   kernel_initializer=bert.initializer)(output)
    start_pred = Lambda(lambda x: x[:, :, 0], name='start')(output)
    end_pred = Lambda(lambda x: x[:, :, 1], name='end')(output)
    insert_pos_pred = Lambda(lambda x: x[:, :, 2], name='insert_pos')(output)
    start_ner_pred = Lambda(lambda x: x[:, :, 3], name='start_ner')(output)
    end_ner_pred = Lambda(lambda x: x[:, :, 4], name='end_ner')(output)

    start_pred, end_pred, insert_pos_pred, start_ner_pred, end_ner_pred = PointerLoss(
        [5, 6, 7, 8, 9])([
            start_labels, end_labels, insert_pos_labels, start_ner_labels,
            end_ner_labels, start_pred, end_pred, insert_pos_pred,
            start_ner_pred, end_ner_pred
        ])

    model = keras.models.Model(
        bert.model.inputs + [
            start_labels, end_labels, insert_pos_labels, start_ner_labels,
            end_ner_labels
        ],
        [start_pred, end_pred, insert_pos_pred, start_ner_pred, end_ner_pred])
    model.summary()

    # Derive an optimizer with a piecewise-linear learning-rate schedule.
    # The name argument is optional, but it is best to set it so that
    # different derived optimizers can be told apart.
    AdamLR = extend_with_piecewise_linear_lr(Adam)
    model.compile(
        # optimizer=Adam(1e-5),  # use a sufficiently small learning rate
        optimizer=AdamLR(learning_rate=learning_rate,
                         lr_schedule={
                             1000: 1,
                             2000: 0.1
                         }),
        metrics=None,
    )
    return model

def fit(self, train_filepath, valid_filepath, temp_save_path, maxlen=128,
        learning_rate=1e-4, epochs=5, batch_size=32):
    train_data = load_data(train_filepath)
    train_generator = CmtDataGenerator(train_data, batch_size, self.tokenizer)

    callbacks = None
    if valid_filepath != "" and valid_filepath is not None \
            and temp_save_path != "" and temp_save_path is not None:
        valid_data = load_data(valid_filepath)
        valid_generator = CmtDataGenerator(valid_data, batch_size,
                                           self.tokenizer)
        evaluator = Evaluator(self.model, valid_generator, temp_save_path)
        callbacks = [evaluator]

    AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')
    self.model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=AdamLR(learning_rate=learning_rate,
                         lr_schedule={
                             1000: 1,
                             2000: 0.1
                         }),
        metrics=['accuracy'],
    )
    self.model.fit(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=epochs,
        callbacks=callbacks
    )
    if callbacks is not None:
        self.model.load_weights(temp_save_path)

def build_transformer_model_for_pretraining():
    """Build the training model; works on both TPU and GPU.

    Note: stick to standard Keras layer-style code throughout. More
    flexible "cut-and-graft" style code may fail to train on TPU.
    Also note that TPU does not support every TensorFlow op, in
    particular dynamic (variable-length) ops, so write the
    corresponding computations with extra care.
    """
    bert, train_model, loss = build_transformer_model_with_mlm()

    # Optimizer
    optimizer = extend_with_weight_decay(Adam)
    if which_optimizer == 'lamb':
        optimizer = extend_with_layer_adaptation(optimizer)
    optimizer = extend_with_piecewise_linear_lr(optimizer)
    optimizer_params = {
        'learning_rate': learning_rate,
        'lr_schedule': lr_schedule,
        'weight_decay_rate': weight_decay_rate,
        'exclude_from_weight_decay': exclude_from_weight_decay,
        'bias_correction': False,
    }
    if grad_accum_steps > 1:
        optimizer = extend_with_gradient_accumulation(optimizer)
        optimizer_params['grad_accum_steps'] = grad_accum_steps
    optimizer = optimizer(**optimizer_params)

    # Finalize the model
    train_model.compile(loss=loss, optimizer=optimizer)

    # Load weights if a checkpoint is given. Note: they must be loaded
    # here to avoid errors.
    if checkpoint_path is not None:
        bert.load_weights_from_checkpoint(checkpoint_path)

    return train_model

def build(self):
    bert_model, _ = load_bert(
        config_path=os.path.join(self.config['pretrained_model_dir'],
                                 'bert_config.json'),
        checkpoint_path=os.path.join(self.config['pretrained_model_dir'],
                                     'bert_model.ckpt'),
    )
    text_mask = L.Lambda(
        lambda x: K.cast(K.expand_dims(K.greater(x, 0), 2), K.floatx()))(
            bert_model.input[0])

    # GI
    gi_in = L.Input(name="gi", shape=(self.config["max_len"], ),
                    dtype="float32")
    gi = gi_in

    # AGN
    X = bert_model.output
    gi = L.Dense(self.config['max_len'], activation='tanh')(gi)  # (B, L)
    gi = L.Lambda(lambda x: K.expand_dims(x, 2))(gi)  # (B, L, 1)
    X, attn_weight = AGN(epsilon=self.config['epsilon'])([X, gi])
    X = L.Lambda(lambda x: x[0] - 1e10 * (1.0 - x[1]))([X, text_mask])
    output = L.Lambda(lambda x: K.max(x, 1))(X)
    # output = L.Dense(128, activation='relu')(output)
    output = L.Dropout(self.config.get('dropout', 0.2))(output)
    output = L.Dense(self.config['output_size'],
                     activation='softmax')(output)

    self.model = keras.Model(inputs=(*bert_model.input, gi_in),
                             outputs=output)
    self.attn_model = keras.Model(inputs=(*bert_model.input, gi_in),
                                  outputs=attn_weight)

    optimizer = extend_with_weight_decay(Adam)
    optimizer = extend_with_piecewise_linear_lr(optimizer)
    optimizer_params = {
        'learning_rate': self.config['learning_rate'],
        'lr_schedule': {
            self.config['steps_per_epoch'] * 2: 1,
            self.config['steps_per_epoch'] * 3: 0.2,
            self.config['steps_per_epoch'] * self.config['epochs']: 0.1
        },
        'weight_decay_rate': 0.01,
        'exclude_from_weight_decay': ['Norm', 'bias'],
        'bias_correction': False,
    }
    self.model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=optimizer(**optimizer_params),
    )
    self.model.summary()

    if self.config.get('apply_fgm', True):
        print('apply fgm')
        fgm(self.model, 'Embedding-Token',
            self.config.get('fgm_epsilon', 0.2))

def compile_model(self):
    # Derive an optimizer with a piecewise-linear learning-rate schedule.
    # The name argument is optional, but it is best to set it so that
    # different derived optimizers can be told apart.
    AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')
    self.model.compile(
        loss=self.loss,
        optimizer=AdamLR(lr=self.learning_rate,
                         lr_schedule={
                             1000: 1,
                             2000: 0.1
                         }),
        metrics=self.metrics,
    )

def get_suggested_optimizer(init_lr=5e-5, total_steps=None):
    lr_schedule = {1000: 1, 10000: 0.01}
    if total_steps is not None:
        lr_schedule = {total_steps // 10: 1, total_steps: 0.1}
    optimizer = extend_with_weight_decay(Adam)
    optimizer = extend_with_piecewise_linear_lr(optimizer)
    optimizer_params = {
        'learning_rate': init_lr,
        'lr_schedule': lr_schedule,
        'weight_decay_rate': 0.01,
        'exclude_from_weight_decay': ['Norm', 'bias'],
        'bias_correction': False,
    }
    optimizer = optimizer(**optimizer_params)
    return optimizer

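# Minimal usage sketch (not part of the snippet above): the helper name
# compile_with_suggested_optimizer and the loss/metrics choices are
# hypothetical; it only shows how the returned optimizer plugs into
# model.compile. With lr_schedule = {total_steps // 10: 1, total_steps: 0.1},
# the learning rate ramps linearly from 0 to init_lr over the first 10% of
# steps, then decays linearly to 0.1 * init_lr by the last step.
def compile_with_suggested_optimizer(model, epochs, steps_per_epoch,
                                     init_lr=5e-5):
    total_steps = epochs * steps_per_epoch
    optimizer = get_suggested_optimizer(init_lr=init_lr,
                                        total_steps=total_steps)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model
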
def get_optimizer_cls(self, optimizer_cls):
    optimizer_list = [
        "Adam",
        "Adamax",
        "Adagrad",
        "Nadam",
        "Adadelta",
        "SGD",
        "RMSprop",
    ]
    if isinstance(optimizer_cls, str):
        if optimizer_cls and optimizer_cls in optimizer_list:
            optimizer_cls = getattr(tf.keras.optimizers, optimizer_cls)
    # Guard against unknown strings as well as non-optimizer classes.
    if not (isinstance(optimizer_cls, type)
            and issubclass(optimizer_cls, tf.keras.optimizers.Optimizer)):
        raise Exception(f"Invalid optimizer class: {optimizer_cls}")
    if self.optimize_with_piecewise_linear_lr:
        optimizer_cls = extend_with_piecewise_linear_lr(optimizer_cls)
    return optimizer_cls

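# Illustrative sketch only (the surrounding class is not shown, and
# self.learning_rate is an assumed attribute): once
# extend_with_piecewise_linear_lr has wrapped the base class, the derived
# optimizer accepts an extra lr_schedule argument at construction time.
optimizer_cls = self.get_optimizer_cls("Adam")
if self.optimize_with_piecewise_linear_lr:
    optimizer = optimizer_cls(learning_rate=self.learning_rate,
                              lr_schedule={1000: 1, 2000: 0.1})
else:
    optimizer = optimizer_cls(learning_rate=self.learning_rate)
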
def __init_model(self, config_path, checkpoint_path):
    bert = build_transformer_model(config_path=config_path,
                                   checkpoint_path=checkpoint_path,
                                   model='electra',
                                   return_keras_model=False)
    output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
    output = Dense(units=self.num_classes,
                   activation='softmax',
                   kernel_initializer=bert.initializer)(output)

    AdamLR = extend_with_piecewise_linear_lr(Adam)
    model = keras.models.Model(bert.model.input, output)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=AdamLR(learning_rate=1e-3,
                         lr_schedule={
                             1000: 1,
                             2000: 0.1
                         }),
        metrics=['accuracy'],
    )
    return model

bert = build_bert_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    with_pool=True,
    albert=True,
    return_keras_model=False,
)

output = Dropout(rate=0.1)(bert.model.output)
output = Dense(units=2,
               activation='softmax',
               kernel_initializer=bert.initializer)(output)

model = keras.models.Model(bert.model.input, output)
model.summary()

AdamLR = extend_with_piecewise_linear_lr(Adam)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=AdamLR(learning_rate=1e-4,
                     lr_schedule={
                         1000: 1,
                         2000: 0.1
                     }),
    metrics=['accuracy'],
)

# Convert the datasets
train_generator = data_generator(train_data)
valid_generator = data_generator(valid_data)
test_generator = data_generator(test_data)

elif config.lstm:
    output = Bidirectional(LSTM(256))(output)
else:
    pass
# output = GlobalAvgPool1D()(output)  # average pooling
output = Dropout(0.15)(output)
output = Dense(units=num_classes,
               activation='softmax',
               kernel_initializer=bert.initializer)(output)

model = keras.models.Model(bert.model.input, output)
model.summary()

# Derive an optimizer with a piecewise-linear learning-rate schedule.
# The name argument is optional, but it is best to set it so that
# different derived optimizers can be told apart.
AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')

if config.addadv:
    # add adversarial perturbation
    loss = loss_with_gradient_penalty()
else:
    loss = 'categorical_crossentropy'

model.compile(
    loss=loss,  # use the loss selected above
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    # optimizer=AdamLR(learning_rate=1e-4, lr_schedule={
    #     1000: 1,
    #     2000: 0.1
    # }),
    # metrics=['accuracy'],
)

def build_train_bert_model():
    """Build the training model; works on both TPU and GPU.

    Note: stick to standard Keras layer-style code throughout. More
    flexible "cut-and-graft" style code may fail to train on TPU.
    Also note that TPU does not support every TensorFlow op, in
    particular dynamic (variable-length) ops, so write the
    corresponding computations with extra care.
    """
    bert = build_bert_model(config_path,
                            with_mlm='linear',
                            application='lm',
                            return_keras_model=False)
    token_ids = bert.model.input[0]
    proba = bert.model.output

    def lm_loss(inputs):
        """Loss computation; must be wrapped as a layer.
        """
        y_true, y_pred, mask = inputs
        y_true = y_true[:, 1:]
        y_pred = y_pred[:, :-1]
        mask = mask[:, 1:]
        loss = K.sparse_categorical_crossentropy(y_true,
                                                 y_pred,
                                                 from_logits=True)
        loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
        return loss

    def lm_acc(inputs):
        """Accuracy computation; must be wrapped as a layer.
        """
        y_true, y_pred, mask = inputs
        y_true = K.cast(y_true, K.floatx())
        y_true = y_true[:, 1:]
        y_pred = y_pred[:, :-1]
        mask = mask[:, 1:]
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
        return acc

    mask = bert.model.get_layer('Sequence-Mask').output
    loss = Lambda(lm_loss, name='lm_loss')([token_ids, proba, mask])
    acc = Lambda(lm_acc, name='lm_acc')([token_ids, proba, mask])
    train_model = Model(bert.model.inputs, [loss, acc])

    # Optimizer
    optimizer = extend_with_weight_decay(Adam)
    if which_optimizer == 'lamb':
        optimizer = extend_with_layer_adaptation(optimizer)
    optimizer = extend_with_piecewise_linear_lr(optimizer)
    optimizer_params = {
        'learning_rate': learning_rate,
        'lr_schedule': lr_schedule,
        'weight_decay_rate': weight_decay_rate,
        'exclude_from_weight_decay': exclude_from_weight_decay,
        'bias_correction': False,
    }
    if grad_accum_steps > 1:
        optimizer = extend_with_gradient_accumulation(optimizer)
        optimizer_params['grad_accum_steps'] = grad_accum_steps
    optimizer = optimizer(**optimizer_params)

    # Finalize the model
    train_model.compile(
        loss={
            'lm_loss': lambda y_true, y_pred: y_pred,
            'lm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        },
        optimizer=optimizer,
    )

    # Load weights if a checkpoint is given. Note: they must be loaded
    # here to avoid errors.
    if checkpoint_path is not None:
        bert.load_weights_from_checkpoint(checkpoint_path)

    return train_model

def build_train_bert_model():
    """Build the training model; works on both TPU and GPU.

    Note: stick to standard Keras layer-style code throughout. More
    flexible "cut-and-graft" style code may fail to train on TPU.
    Also note that TPU does not support every TensorFlow op, in
    particular dynamic (variable-length) ops, so write the
    corresponding computations with extra care.
    """
    bert = build_bert_model(config_path,
                            with_mlm='linear',
                            return_keras_model=False)
    bert_model = bert.model
    proba = bert_model.output

    # Auxiliary inputs
    token_ids = Input(shape=(None, ), dtype='int64',
                      name='token_ids')  # target ids
    is_masked = Input(shape=(None, ), dtype='bool',
                      name='is_masked')  # mask flags

    def mlm_loss(inputs):
        """Loss computation; must be wrapped as a layer.
        """
        y_true, y_pred, is_masked = inputs
        is_masked = K.cast(is_masked, K.floatx())
        loss = K.sparse_categorical_crossentropy(y_true,
                                                 y_pred,
                                                 from_logits=True)
        loss = K.sum(loss * is_masked) / (K.sum(is_masked) + K.epsilon())
        return loss

    def mlm_acc(inputs):
        """Accuracy computation; must be wrapped as a layer.
        """
        y_true, y_pred, is_masked = inputs
        is_masked = K.cast(is_masked, K.floatx())
        y_true = K.cast(y_true, K.floatx())
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * is_masked) / (K.sum(is_masked) + K.epsilon())
        return acc

    loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])
    train_model = Model(bert_model.inputs + [token_ids, is_masked],
                        [loss, acc])

    # Optimizer
    optimizer = extend_with_weight_decay(Adam)
    if which_optimizer == 'lamb':
        optimizer = extend_with_layer_adaptation(optimizer)
    optimizer = extend_with_piecewise_linear_lr(optimizer)
    optimizer_params = {
        'learning_rate': learning_rate,
        'lr_schedule': lr_schedule,
        'weight_decay_rate': weight_decay_rate,
        'exclude_from_weight_decay': exclude_from_weight_decay,
        'bias_correction': False,
    }
    if grad_accum_steps > 1:
        optimizer = extend_with_gradient_accumulation(optimizer)
        optimizer_params['grad_accum_steps'] = grad_accum_steps
    optimizer = optimizer(**optimizer_params)

    # Finalize the model
    train_model.compile(
        loss={
            'mlm_loss': lambda y_true, y_pred: y_pred,
            'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
        },
        optimizer=optimizer,
    )

    # Load weights if a checkpoint is given. Note: they must be loaded
    # here to avoid errors.
    if checkpoint_path is not None:
        bert.load_weights_from_checkpoint(checkpoint_path)

    return train_model

def optimizer(self):
    AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')
    _optimizer = AdamLR(lr=1e-5, lr_schedule={1000: 1, 2000: 0.1})
    return _optimizer

Author : chenhao
date: 2021/3/30
-------------------------------------------------
   Change Activity:
                   2021/3/30:
-------------------------------------------------
"""
from tensorflow.keras.optimizers import (Adadelta, Adagrad, Adamax, Nadam,
                                         RMSprop, SGD, Adam)
from bert4keras.optimizers import extend_with_exponential_moving_average, \
    extend_with_piecewise_linear_lr, extend_with_gradient_accumulation

# Adam with an exponential moving average of the weights
AdamEMA = extend_with_exponential_moving_average(Adam, name='AdamEMA')
# Adam with a piecewise-linear learning-rate schedule
AdamLR = extend_with_piecewise_linear_lr(Adam, 'AdamLR')
# Adam with gradient accumulation
AdamAcc = extend_with_gradient_accumulation(Adam, 'AdamAcc')
# Gradient-accumulating Adam with a piecewise-linear learning-rate schedule
AdamAccLR = extend_with_piecewise_linear_lr(AdamAcc, 'AdamAccLR')


class OptimizerFactory:
    _BUILDERS = {
        'sgd': SGD,
        'rmsprop': RMSprop,
        'adagrad': Adagrad,
        'adadelta': Adadelta,
        'adam': Adam,
        'adamax': Adamax,
        'nadam': Nadam,

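# Illustrative sketch, not part of the original module: the derived classes
# above compose, so AdamAccLR both accumulates gradients and follows a
# piecewise-linear schedule. grad_accum_steps and lr_schedule are the extra
# constructor arguments contributed by the two extensions; the values here
# are arbitrary examples.
opt = AdamAccLR(learning_rate=2e-5,
                grad_accum_steps=4,
                lr_schedule={1000: 1, 10000: 0.1})
# model.compile(loss='sparse_categorical_crossentropy', optimizer=opt)
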
def __init__(self, config_path, checkpoint_path, hidden_size, num_classes,
             alpha, wvdim=768, model_type='bert',
             label_embedding_matrix=None):
    self.num_classes = num_classes

    def lcm_loss(y_true, y_pred, alpha=alpha):
        pred_probs = y_pred[:, :num_classes]
        label_sim_dist = y_pred[:, num_classes:]
        simulated_y_true = K.softmax(label_sim_dist + alpha * y_true)
        loss1 = -K.categorical_crossentropy(simulated_y_true,
                                            simulated_y_true)
        loss2 = K.categorical_crossentropy(simulated_y_true, pred_probs)
        return loss1 + loss2

    def ls_loss(y_true, y_pred, e=0.1):
        loss1 = K.categorical_crossentropy(y_true, y_pred)
        loss2 = K.categorical_crossentropy(
            K.ones_like(y_pred) / num_classes, y_pred)
        return (1 - e) * loss1 + e * loss2

    # text_encoder:
    bert = build_transformer_model(config_path=config_path,
                                   checkpoint_path=checkpoint_path,
                                   model=model_type,
                                   return_keras_model=False)
    text_emb = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
    text_emb = Dense(hidden_size, activation='tanh')(text_emb)
    pred_probs = Dense(num_classes, activation='softmax')(text_emb)
    self.basic_predictor = Model(bert.model.input, pred_probs)

    AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')
    self.basic_predictor.compile(loss='categorical_crossentropy',
                                 optimizer=AdamLR(learning_rate=1e-4,
                                                  lr_schedule={
                                                      1000: 1,
                                                      2000: 0.1
                                                  }))

    # label_encoder:
    label_input = Input(shape=(num_classes, ), name='label_input')
    if label_embedding_matrix is None:  # no pretrained embedding
        label_emb = Embedding(num_classes,
                              wvdim,
                              input_length=num_classes,
                              name='label_emb1')(label_input)  # (n, wvdim)
    else:
        label_emb = Embedding(num_classes,
                              wvdim,
                              input_length=num_classes,
                              weights=[label_embedding_matrix],
                              name='label_emb1')(label_input)
    # label_emb = Bidirectional(LSTM(hidden_size, return_sequences=True),
    #                           merge_mode='ave')(label_emb)  # (n, d)
    label_emb = Dense(hidden_size, activation='tanh',
                      name='label_emb2')(label_emb)

    # similarity part:
    doc_product = Dot(axes=(2, 1))([label_emb, text_emb])  # (n,d) dot (d,1) --> (n,1)
    label_sim_dict = Dense(num_classes,
                           activation='softmax',
                           name='label_sim_dict')(doc_product)

    # concat output:
    concat_output = Concatenate()([pred_probs, label_sim_dict])

    # compile:
    AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')
    self.model = Model(bert.model.input + [label_input], concat_output)
    self.model.compile(loss=lcm_loss,
                       optimizer=AdamLR(learning_rate=1e-4,
                                        lr_schedule={
                                            1000: 1,
                                            2000: 0.1
                                        }))