tpu_address = 'grpc://xxx.xxx.xxx.xxx:8470'  # set to None if training on multiple GPUs
which_optimizer = 'lamb'  # 'adam' or 'lamb'; both come with weight decay built in
lr_schedule = {
    num_warmup_steps * grad_accum_steps: 1.,
    num_train_steps * grad_accum_steps: 0.,
}

# Prepare some shorthand aliases
Input = keras.layers.Input
Lambda = keras.layers.Lambda
Model = keras.models.Model

# Read the dataset and build the data tensors
dataset = TrainingDataset.load_tfrecord(
    record_names=corpus_paths,
    sequence_length=sequence_length,
    batch_size=batch_size // grad_accum_steps,
)


def build_train_bert_model():
    """Build the training model; works on both TPU and GPU.

    Note: stick to standard Keras layer usage throughout. Some of the
    more flexible "cut-and-graft" styles of model surgery may fail to
    train on TPU. Also be aware that TPUs do not support every
    TensorFlow op, in particular dynamic (variable-length) ops, so take
    extra care when writing the corresponding computations.
    """
    bert = build_bert_model(
        config_path,
        with_mlm='linear',
        application='lm',
        return_keras_model=False,
    )
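    # (The rest of build_train_bert_model, i.e. the loss, metric and
    # optimizer wiring, is not shown in this excerpt.)

# The excerpt configures tpu_address but never shows how it is consumed.
# Below is a minimal sketch of the usual tf.keras wiring, not the author's
# code: `resolver`, `strategy`, the MirroredStrategy fallback and the
# scope-based call to build_train_bert_model() are all assumptions, and the
# exact module paths vary across TensorFlow versions.
import tensorflow as tf

if tpu_address is not None:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_address)
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)
else:
    strategy = tf.distribute.MirroredStrategy()  # the multi-GPU case (tpu_address = None)

with strategy.scope():
    # Variables must be created inside the strategy scope.
    train_model = build_train_bert_model()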
exclude_from_weight_decay = ['Norm', 'bias']

# Prepare some shorthand aliases
Input = keras.layers.Input
Lambda = keras.layers.Lambda
Model = keras.models.Model
sparse_categorical_accuracy = keras.metrics.sparse_categorical_accuracy
ModelCheckpoint = keras.callbacks.ModelCheckpoint
CSVLogger = keras.callbacks.CSVLogger

# Read the dataset and build the data tensors
dataset = TrainingDataset.load_tfrecord(
    record_names=corpus_path,
    sequence_length=sequence_length,
    batch_size=batch_size,
)


# Build the optimizer
class PiecewiseLinear(keras.optimizers.schedules.LearningRateSchedule):
    """A piecewise-linear learning rate schedule written for tf.keras' OptimizerV2."""

    def __init__(self, schedule, name=None):
        super(PiecewiseLinear, self).__init__()
        self.schedule = {int(i): j for i, j in schedule.items()}
        self.name = name

    def __call__(self, step):
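        # Given the warmup/decay lr_schedule defined above, this method
        # presumably interpolates linearly through the sorted
        # (step, value) knots of self.schedule, starting from (0, 0) and
        # holding the last value once it is passed. In plain Python that
        # would be roughly the sketch below (illustrative only; the real
        # graph version has to use tensor ops such as tf.where, because
        # `step` arrives as a tensor):
        #
        #     knots = sorted(self.schedule.items())
        #     if knots[0][0] != 0:
        #         knots = [(0, 0.0)] + knots
        #     if step >= knots[-1][0]:
        #         return knots[-1][1]
        #     for (t0, v0), (t1, v1) in zip(knots, knots[1:]):
        #         if step <= t1:
        #             return v0 + (v1 - v0) * (step - t0) / (t1 - t0)
        #
        # e.g. with lr_schedule = {10000: 1., 1000000: 0.} the multiplier
        # rises from 0.0 at step 0 to 1.0 at step 10000, then decays
        # linearly back to 0.0 at step 1000000.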