def get_model_v2(albert_config, max_seq_length, init_checkpoint, learning_rate,
                 start_n_top, end_n_top, dropout, num_train_steps, num_warmup_steps):
    """Returns keras model"""
    squad_model = ALBertQAModel(albert_config, max_seq_length, init_checkpoint,
                                start_n_top, end_n_top, dropout)

    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=learning_rate,
        decay_steps=num_train_steps,
        end_learning_rate=0.0)
    if num_warmup_steps:
        learning_rate_fn = WarmUp(initial_learning_rate=learning_rate,
                                  decay_schedule_fn=learning_rate_fn,
                                  warmup_steps=num_warmup_steps)
    if FLAGS.optimizer == "LAMB":
        optimizer_fn = LAMB
    else:
        optimizer_fn = AdamWeightDecay

    optimizer = optimizer_fn(
        learning_rate=learning_rate_fn,
        weight_decay_rate=FLAGS.weight_decay,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=FLAGS.adam_epsilon,
        exclude_from_weight_decay=['layer_norm', 'bias'])

    squad_model.optimizer = optimizer
    return squad_model
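# The get_model* helpers above and below all build the same schedule: polynomial
# decay wrapped in the repo's WarmUp class. Below is a minimal, self-contained
# sketch of that warmup-then-decay behaviour, assuming WarmUp acts like the
# LinearWarmUp wrapper here; this is an illustration, not the repo's code.
import tensorflow as tf


class LinearWarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Linearly ramps the learning rate, then hands off to a decay schedule."""

    def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps):
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.decay_schedule_fn = decay_schedule_fn
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        warmup_steps = tf.cast(self.warmup_steps, tf.float32)
        warmup_lr = self.initial_learning_rate * (step / warmup_steps)
        return tf.cond(step < warmup_steps,
                       lambda: warmup_lr,
                       lambda: self.decay_schedule_fn(step - warmup_steps))


def _warmup_schedule_demo():
    # Illustrative hyperparameters only.
    decay = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=3e-5, decay_steps=10000, end_learning_rate=0.0)
    schedule = LinearWarmUp(3e-5, decay, warmup_steps=1000)
    # Any Keras optimizer accepts a LearningRateSchedule instance.
    return tf.keras.optimizers.Adam(learning_rate=schedule)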
def get_model(albert_config, max_seq_length, num_labels, init_checkpoint,
              learning_rate, num_train_steps, num_warmup_steps, loss_multiplier):
    """Returns keras functional model"""
    float_type = tf.float32
    hidden_dropout_prob = FLAGS.classifier_dropout  # as per the original code released
    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,),
                                           dtype=tf.int32,
                                           name='input_word_ids')
    input_mask = tf.keras.layers.Input(shape=(max_seq_length,),
                                       dtype=tf.int32,
                                       name='input_mask')
    input_type_ids = tf.keras.layers.Input(shape=(max_seq_length,),
                                           dtype=tf.int32,
                                           name='input_type_ids')

    albert_layer = AlbertModel(config=albert_config, float_type=float_type)
    pooled_output, _ = albert_layer(input_word_ids, input_mask, input_type_ids)

    albert_model = tf.keras.Model(
        inputs=[input_word_ids, input_mask, input_type_ids],
        outputs=[pooled_output])
    albert_model.load_weights(init_checkpoint)

    initializer = tf.keras.initializers.TruncatedNormal(
        stddev=albert_config.initializer_range)
    output = tf.keras.layers.Dropout(rate=hidden_dropout_prob)(pooled_output)
    output = tf.keras.layers.Dense(num_labels,
                                   kernel_initializer=initializer,
                                   name='output',
                                   dtype=float_type)(output)

    model = tf.keras.Model(
        inputs={
            'input_word_ids': input_word_ids,
            'input_mask': input_mask,
            'input_type_ids': input_type_ids,
        },
        outputs=output)

    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=learning_rate,
        decay_steps=num_train_steps,
        end_learning_rate=0.0)
    if num_warmup_steps:
        learning_rate_fn = WarmUp(initial_learning_rate=learning_rate,
                                  decay_schedule_fn=learning_rate_fn,
                                  warmup_steps=num_warmup_steps)
    if FLAGS.optimizer == "LAMB":
        optimizer_fn = LAMB
    else:
        optimizer_fn = AdamWeightDecay

    optimizer = optimizer_fn(learning_rate=learning_rate_fn,
                             weight_decay_rate=FLAGS.weight_decay,
                             beta_1=0.9,
                             beta_2=0.999,
                             epsilon=FLAGS.adam_epsilon,
                             exclude_from_weight_decay=['layer_norm', 'bias'])

    if FLAGS.task_name.lower() == 'sts':
        loss_fct = tf.keras.losses.MeanSquaredError()
        model.compile(optimizer=optimizer, loss=loss_fct, metrics=['mse'])
    else:
        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        model.compile(optimizer=optimizer, loss=loss_fct, metrics=['accuracy'])
    return model
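# get_model builds a functional model whose inputs are keyed by name, so the
# batches fed to it must be dicts with matching keys. create_classifier_dataset
# (defined elsewhere in the repo) is assumed to yield that layout; the sketch
# below only shows the expected (features, label) structure with synthetic data.
import numpy as np
import tensorflow as tf


def _synthetic_classifier_batches(max_seq_length=128, num_examples=8):
    features = {
        "input_word_ids": np.zeros((num_examples, max_seq_length), dtype=np.int32),
        "input_mask": np.ones((num_examples, max_seq_length), dtype=np.int32),
        "input_type_ids": np.zeros((num_examples, max_seq_length), dtype=np.int32),
    }
    labels = np.zeros((num_examples,), dtype=np.int32)
    return tf.data.Dataset.from_tensor_slices((features, labels)).batch(4)

# A dataset shaped like this can be passed straight to get_model(...).fit(dataset, epochs=...).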
def get_model_v1(albert_config, max_seq_length, init_checkpoint, learning_rate,
                 num_train_steps, num_warmup_steps):
    """Returns keras functional model"""
    float_type = tf.float32
    # hidden_dropout_prob = 0.9  # as per the original code released
    unique_ids = tf.keras.layers.Input(
        shape=(1,), dtype=tf.int32, name='unique_ids')
    input_word_ids = tf.keras.layers.Input(
        shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
    input_mask = tf.keras.layers.Input(
        shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
    input_type_ids = tf.keras.layers.Input(
        shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')

    albert_layer = AlbertModel(config=albert_config, float_type=float_type)
    _, sequence_output = albert_layer(input_word_ids, input_mask, input_type_ids)

    albert_model = tf.keras.Model(
        inputs=[input_word_ids, input_mask, input_type_ids],
        outputs=[sequence_output])
    if init_checkpoint is not None:
        albert_model.load_weights(init_checkpoint)

    initializer = tf.keras.initializers.TruncatedNormal(
        stddev=albert_config.initializer_range)

    squad_logits_layer = ALBertSquadLogitsLayer(
        initializer=initializer, float_type=float_type, name='squad_logits')
    start_logits, end_logits = squad_logits_layer(sequence_output)

    squad_model = tf.keras.Model(
        inputs={
            'unique_ids': unique_ids,
            'input_ids': input_word_ids,
            'input_mask': input_mask,
            'segment_ids': input_type_ids,
        },
        outputs=[unique_ids, start_logits, end_logits],
        name='squad_model')

    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=learning_rate,
        decay_steps=num_train_steps,
        end_learning_rate=0.0)
    if num_warmup_steps:
        learning_rate_fn = WarmUp(initial_learning_rate=learning_rate,
                                  decay_schedule_fn=learning_rate_fn,
                                  warmup_steps=num_warmup_steps)
    if FLAGS.optimizer == "LAMB":
        optimizer_fn = LAMB
    else:
        optimizer_fn = AdamWeightDecay

    optimizer = optimizer_fn(
        learning_rate=learning_rate_fn,
        weight_decay_rate=FLAGS.weight_decay,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=FLAGS.adam_epsilon,
        exclude_from_weight_decay=['layer_norm', 'bias'])

    squad_model.optimizer = optimizer
    return squad_model
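# ALBertSquadLogitsLayer is defined elsewhere in the repo; the sketch below shows
# the standard BERT-style SQuAD head it is assumed to implement (one Dense(2)
# over the sequence output, unstacked into start/end logits). The shapes and the
# 0.02 stddev are illustrative only.
import tensorflow as tf


def _squad_logits_sketch(sequence_output, initializer):
    # sequence_output: [batch, seq_len, hidden]
    logits = tf.keras.layers.Dense(2, kernel_initializer=initializer)(sequence_output)
    start_logits, end_logits = tf.unstack(logits, axis=-1)  # each [batch, seq_len]
    return start_logits, end_logits


def _squad_logits_demo():
    seq_out = tf.random.uniform((2, 384, 768))
    init = tf.keras.initializers.TruncatedNormal(stddev=0.02)
    start, end = _squad_logits_sketch(seq_out, init)
    return start.shape, end.shape  # (2, 384), (2, 384)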
def run_customized_training(strategy, albert_config, max_seq_length,
                            max_predictions_per_seq, model_dir, steps_per_epoch,
                            steps_per_loop, epochs, initial_lr, warmup_steps,
                            input_files, train_batch_size):
    """Run ALBERT pretraining using the low-level custom training loop."""
    train_input_fn = functools.partial(get_pretrain_input_data, input_files,
                                       max_seq_length, max_predictions_per_seq,
                                       train_batch_size, strategy)

    with strategy.scope():
        pretrain_model, core_model = albert_model.pretrain_model(
            albert_config, max_seq_length, max_predictions_per_seq)

        if FLAGS.init_checkpoint:
            logging.info(f"pre-trained weights loaded from {FLAGS.init_checkpoint}")
            pretrain_model.load_weights(FLAGS.init_checkpoint)

        learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=initial_lr,
            decay_steps=int(steps_per_epoch * epochs),
            end_learning_rate=0.0)
        if warmup_steps:
            learning_rate_fn = WarmUp(initial_learning_rate=initial_lr,
                                      decay_schedule_fn=learning_rate_fn,
                                      warmup_steps=warmup_steps)
        if FLAGS.optimizer == "lamp":
            optimizer_fn = LAMB
        else:
            optimizer_fn = AdamWeightDecay

        optimizer = optimizer_fn(
            learning_rate=learning_rate_fn,
            weight_decay_rate=FLAGS.weight_decay,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=FLAGS.adam_epsilon,
            exclude_from_weight_decay=['layer_norm', 'bias'])
        pretrain_model.optimizer = optimizer

        trained_model = run_customized_training_loop(
            strategy=strategy,
            model=pretrain_model,
            loss_fn=get_loss_fn(loss_factor=1.0 / strategy.num_replicas_in_sync),
            model_dir=model_dir,
            train_input_fn=train_input_fn,
            steps_per_epoch=steps_per_epoch,
            steps_per_loop=steps_per_loop,
            epochs=epochs)

    # Creates the ALBERT core model outside distribution strategy scope.
    _, core_model = albert_model.pretrain_model(albert_config, max_seq_length,
                                                max_predictions_per_seq)

    # Restores the core model from the training checkpoints and saves weights
    # that contain only the core model.
    checkpoint = tf.train.Checkpoint(model=core_model)
    latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
    assert latest_checkpoint_file
    logging.info('Checkpoint file %s found and restoring from checkpoint',
                 latest_checkpoint_file)
    status = checkpoint.restore(latest_checkpoint_file)
    status.assert_existing_objects_matched().expect_partial()
    core_model.save_weights(f"{model_dir}/tf2_model.h5")
    return trained_model
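# A hypothetical call site for run_customized_training. Every value below is
# illustrative (in the script itself they come from absl FLAGS), and the paths
# do not refer to real files; only the parameter names are taken from the
# function signature above.
def _example_pretraining_run():
    strategy = tf.distribute.MirroredStrategy()
    albert_config = AlbertConfig.from_json_file("albert_config.json")  # illustrative path
    return run_customized_training(
        strategy=strategy,
        albert_config=albert_config,
        max_seq_length=512,
        max_predictions_per_seq=20,
        model_dir="pretrain_output",
        steps_per_epoch=1000,
        steps_per_loop=100,
        epochs=3,
        initial_lr=1e-4,
        warmup_steps=100,
        input_files="pretrain_data/*.tfrecord",
        train_batch_size=32)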
def main(_):
    logging.set_verbosity(logging.INFO)
    if FLAGS.enable_xla:
        set_config_v2(FLAGS.enable_xla)

    processors = {
        "cola": classifier_data_lib.ColaProcessor,
        "sts": classifier_data_lib.StsbProcessor,
        "sst": classifier_data_lib.Sst2Processor,
        "mnli": classifier_data_lib.MnliProcessor,
        "qnli": classifier_data_lib.QnliProcessor,
        "qqp": classifier_data_lib.QqpProcessor,
        "rte": classifier_data_lib.RteProcessor,
        "mrpc": classifier_data_lib.MrpcProcessor,
        "wnli": classifier_data_lib.WnliProcessor,
        "xnli": classifier_data_lib.XnliProcessor,
    }
    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    logging.info("processor is: %s", FLAGS.task_name)

    strategy = None
    if FLAGS.strategy_type == "one":
        strategy = tf.distribute.OneDeviceStrategy("GPU:0")
    elif FLAGS.strategy_type == "mirror":
        strategy = tf.distribute.MirroredStrategy()
    else:
        raise ValueError('The distribution strategy type is not supported: %s' %
                         FLAGS.strategy_type)

    with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
        input_meta_data = json.loads(reader.read().decode('utf-8'))
    num_labels = input_meta_data["num_labels"]
    FLAGS.max_seq_length = input_meta_data["max_seq_length"]
    processor_type = input_meta_data['processor_type']

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

    albert_config = AlbertConfig.from_json_file(FLAGS.albert_config_file)
    tinybert_config = TinybertConfig.from_json_file(FLAGS.tinybert_config_file)
    if FLAGS.max_seq_length > albert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the ALBERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, albert_config.max_position_embeddings))
    tf.io.gfile.makedirs(FLAGS.output_dir)

    num_train_steps = None
    num_warmup_steps = None
    steps_per_epoch = None
    if FLAGS.do_train:
        len_train_examples = input_meta_data['train_data_size']
        steps_per_epoch = int(len_train_examples / FLAGS.train_batch_size)
        num_train_steps = int(steps_per_epoch * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    if FLAGS.do_eval:
        eval_input_fn = functools.partial(create_classifier_dataset,
                                          FLAGS.eval_data_path,
                                          seq_length=FLAGS.max_seq_length,
                                          batch_size=FLAGS.eval_batch_size,
                                          is_training=False,
                                          drop_remainder=False)
        len_eval_examples = input_meta_data['eval_data_size']
        eval_steps = int(len_eval_examples / FLAGS.eval_batch_size)

    loss_multiplier = 1.0 / strategy.num_replicas_in_sync

    model = None
    if FLAGS.do_train:
        logging.info("***** Running training *****")
        logging.info("  Num examples = %d", len_train_examples)
        logging.info("  Batch size = %d", FLAGS.train_batch_size)
        logging.info("  Num steps = %d", num_train_steps)
        # Bind the fixed argument values for create_classifier_dataset.
        train_input_fn = functools.partial(create_classifier_dataset,
                                           FLAGS.train_data_path,
                                           seq_length=FLAGS.max_seq_length,
                                           batch_size=FLAGS.train_batch_size,
                                           drop_remainder=False)
        with strategy.scope():
            summary_dir = os.path.join(FLAGS.output_dir, 'summaries')
            summary_callback = tf.keras.callbacks.TensorBoard(summary_dir)
            checkpoint_path = os.path.join(FLAGS.output_dir, 'checkpoint')
            checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
                checkpoint_path, save_weights_only=False)
            custom_callbacks = [summary_callback, checkpoint_callback]

            def metric_fn():
                if FLAGS.task_name.lower() == "sts":
                    return tf.keras.metrics.MeanSquaredError(dtype=tf.float32)
                else:
                    return tf.keras.metrics.SparseCategoricalAccuracy(dtype=tf.float32)

            if FLAGS.custom_training_loop:
                if FLAGS.task_name.lower() == "sts":
                    loss_fn = get_loss_fn_v2(loss_factor=loss_multiplier)
                else:
                    loss_fn = get_loss_fn(num_labels, loss_factor=loss_multiplier)
                tinybert_config = TinybertConfig.from_json_file(
                    FLAGS.tinybert_config_file)
                train_model, albert, tinybert = tinybert_model.get_fine_tune_model(
                    tinybert_config, albert_config, FLAGS.max_seq_length)
                albert.summary()
                tinybert.summary()
                train_model.summary()
                model = train_model

                if FLAGS.optimizer == "LAMB":
                    optimizer_fn = LAMB
                else:
                    optimizer_fn = AdamWeightDecay

                learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
                    initial_learning_rate=FLAGS.learning_rate,
                    decay_steps=num_train_steps,
                    end_learning_rate=0.0)
                if num_warmup_steps:
                    learning_rate_fn = WarmUp(
                        initial_learning_rate=FLAGS.learning_rate,
                        decay_schedule_fn=learning_rate_fn,
                        warmup_steps=num_warmup_steps)
                optimizer = optimizer_fn(
                    learning_rate=learning_rate_fn,
                    weight_decay_rate=FLAGS.weight_decay,
                    beta_1=0.9,
                    beta_2=0.999,
                    epsilon=FLAGS.adam_epsilon,
                    exclude_from_weight_decay=['layer_norm', 'bias'])
                train_model.optimizer = optimizer

                run_customized_training_loop(
                    strategy=strategy,
                    models=[albert, tinybert, train_model],
                    model=train_model,
                    albert=albert,
                    tinybert=tinybert,
                    loss_fn=get_loss_fn_v3(loss_factor=1.0 /
                                           strategy.num_replicas_in_sync),
                    model_dir=FLAGS.output_dir,
                    train_input_fn=train_input_fn,
                    steps_per_epoch=steps_per_epoch,
                    epochs=FLAGS.num_train_epochs,
                    metric_fn=metric_fn,
                    custom_callbacks=custom_callbacks)
            else:
                model = get_model(albert_config=albert_config,
                                  max_seq_length=FLAGS.max_seq_length,
                                  num_labels=num_labels,
                                  init_checkpoint=FLAGS.init_checkpoint,
                                  learning_rate=FLAGS.learning_rate,
                                  num_train_steps=num_train_steps,
                                  num_warmup_steps=num_warmup_steps,
                                  loss_multiplier=loss_multiplier)
                model.summary()
                training_dataset = train_input_fn()
                evaluation_dataset = eval_input_fn()
                model.fit(x=training_dataset,
                          epochs=FLAGS.num_train_epochs,
                          callbacks=custom_callbacks)

    if FLAGS.do_eval:
        if not model:
            raise ValueError("model not initialized")
        len_eval_examples = input_meta_data['eval_data_size']
        logging.info("***** Running evaluation *****")
        logging.info("  Num examples = %d", len_eval_examples)
        logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        evaluation_dataset = eval_input_fn()
        loss, accuracy = model.evaluate(evaluation_dataset)
        print(f"loss : {loss} , Accuracy : {accuracy}")

    if FLAGS.do_predict:
        logging.info("***** Running prediction *****")
        flags.mark_flag_as_required("input_data_dir")
        flags.mark_flag_as_required("predict_data_path")
        tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file)
        predict_examples = processor.get_test_examples(FLAGS.input_data_dir)
        label_list = processor.get_labels()
        label_map = {i: label for i, label in enumerate(label_list)}
        classifier_data_lib.file_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
            FLAGS.predict_data_path)
        predict_input_fn = functools.partial(create_classifier_dataset,
                                             FLAGS.predict_data_path,
                                             seq_length=FLAGS.max_seq_length,
                                             batch_size=FLAGS.eval_batch_size,
                                             is_training=False,
                                             drop_remainder=False)
        prediction_dataset = predict_input_fn()
        with strategy.scope():
            logits = model.predict(prediction_dataset)
        if FLAGS.task_name.lower() == "sts":
            predictions = logits
            probabilities = logits
        else:
            predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
            probabilities = tf.nn.softmax(logits, axis=-1)

        output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        output_submit_file = os.path.join(FLAGS.output_dir, "submit_results.tsv")
        with tf.io.gfile.GFile(output_predict_file, "w") as pred_writer, \
                tf.io.gfile.GFile(output_submit_file, "w") as sub_writer:
            logging.info("***** Predict results *****")
            for (example, probability, prediction) in zip(predict_examples,
                                                          probabilities,
                                                          predictions):
                output_line = "\t".join(
                    str(class_probability.numpy())
                    for class_probability in probability) + "\n"
                pred_writer.write(output_line)
                actual_label = label_map[int(prediction)]
                sub_writer.write(
                    six.ensure_str(example.guid) + "\t" + actual_label + "\n")
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-cased, bert-large-cased")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.")

    # Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev/test set.")
    parser.add_argument("--eval_on",
                        default="dev",
                        type=str,
                        help="Evaluation set, dev: Development, test: Test")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="Weight decay to apply, if any.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    # training strategy arguments
    parser.add_argument(
        "--multi_gpu",
        action='store_true',
        help="Set this flag to enable multi-gpu training using MirroredStrategy; "
        "otherwise single-gpu training is used.")
    parser.add_argument(
        "--gpus",
        default='0',
        type=str,
        help="Comma separated list of gpu devices. "
        "For single gpu, pass the gpu id (default '0'). "
        "For multi gpu, if gpus is not specified all available gpus will be used.")

    args = parser.parse_args()

    processor = NerProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if args.do_train:
        tokenizer = FullTokenizer(os.path.join(args.bert_model, "vocab.txt"),
                                  args.do_lower_case)

    if args.multi_gpu:
        if len(args.gpus.split(',')) == 1:
            strategy = tf.distribute.MirroredStrategy()
        else:
            gpus = [f"/gpu:{gpu}" for gpu in args.gpus.split(',')]
            strategy = tf.distribute.MirroredStrategy(devices=gpus)
    else:
        gpu = args.gpus.split(',')[0]
        strategy = tf.distribute.OneDeviceStrategy(device=f"/gpu:{gpu}")

    train_examples = None
    optimizer = None
    num_train_optimization_steps = 0
    ner = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size) * args.num_train_epochs
        warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
        learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=args.learning_rate,
            decay_steps=num_train_optimization_steps,
            end_learning_rate=0.0)
        if warmup_steps:
            learning_rate_fn = WarmUp(initial_learning_rate=args.learning_rate,
                                      decay_schedule_fn=learning_rate_fn,
                                      warmup_steps=warmup_steps)
        optimizer = AdamWeightDecay(
            learning_rate=learning_rate_fn,
            weight_decay_rate=args.weight_decay,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=args.adam_epsilon,
            exclude_from_weight_decay=['layer_norm', 'bias'])
        with strategy.scope():
            ner = BertNer(args.bert_model, tf.float32, num_labels,
                          args.max_seq_length)
            loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(
                reduction=tf.keras.losses.Reduction.NONE)

    label_map = {i: label for i, label in enumerate(label_list, 1)}

    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        all_input_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.input_ids for f in train_features], dtype=np.int32))
        all_input_mask = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.input_mask for f in train_features], dtype=np.int32))
        all_segment_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.segment_ids for f in train_features], dtype=np.int32))
        all_valid_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.valid_ids for f in train_features], dtype=np.int32))
        all_label_mask = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.label_mask for f in train_features]))
        all_label_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.label_id for f in train_features], dtype=np.int32))

        # Dataset using tf.data
        train_data = tf.data.Dataset.zip(
            (all_input_ids, all_input_mask, all_segment_ids, all_valid_ids,
             all_label_ids, all_label_mask))
        shuffled_train_data = train_data.shuffle(
            buffer_size=int(len(train_features) * 0.1),
            seed=args.seed,
            reshuffle_each_iteration=True)
        batched_train_data = shuffled_train_data.batch(args.train_batch_size)
        # Distributed dataset
        dist_dataset = strategy.experimental_distribute_dataset(batched_train_data)
        loss_metric = tf.keras.metrics.Mean()

        epoch_bar = master_bar(range(args.num_train_epochs))
        pb_max_len = math.ceil(
            float(len(train_features)) / float(args.train_batch_size))

        def train_step(input_ids, input_mask, segment_ids, valid_ids, label_ids,
                       label_mask):
            def step_fn(input_ids, input_mask, segment_ids, valid_ids, label_ids,
                        label_mask):
                with tf.GradientTape() as tape:
                    logits = ner(input_ids, input_mask, segment_ids, valid_ids,
                                 training=True)
                    label_mask = tf.reshape(label_mask, (-1,))
                    logits = tf.reshape(logits, (-1, num_labels))
                    logits_masked = tf.boolean_mask(logits, label_mask)
                    label_ids = tf.reshape(label_ids, (-1,))
                    label_ids_masked = tf.boolean_mask(label_ids, label_mask)
                    cross_entropy = loss_fct(label_ids_masked, logits_masked)
                    loss = tf.reduce_sum(cross_entropy) * (1.0 / args.train_batch_size)
                grads = tape.gradient(loss, ner.trainable_variables)
                optimizer.apply_gradients(list(zip(grads, ner.trainable_variables)))
                return cross_entropy

            per_example_losses = strategy.experimental_run_v2(
                step_fn,
                args=(input_ids, input_mask, segment_ids, valid_ids, label_ids,
                      label_mask))
            mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                        per_example_losses, axis=0)
            return mean_loss

        for epoch in epoch_bar:
            with strategy.scope():
                for (input_ids, input_mask, segment_ids, valid_ids, label_ids,
                     label_mask) in progress_bar(dist_dataset,
                                                 total=pb_max_len,
                                                 parent=epoch_bar):
                    loss = train_step(input_ids, input_mask, segment_ids,
                                      valid_ids, label_ids, label_mask)
                    loss_metric(loss)
                    epoch_bar.child.comment = f'loss : {loss_metric.result()}'
            loss_metric.reset_states()

        # model weight save
        ner.save_weights(os.path.join(args.output_dir, "model.h5"))
        # copy vocab to output_dir
        shutil.copyfile(os.path.join(args.bert_model, "vocab.txt"),
                        os.path.join(args.output_dir, "vocab.txt"))
        # copy bert config to output_dir
        shutil.copyfile(os.path.join(args.bert_model, "bert_config.json"),
                        os.path.join(args.output_dir, "bert_config.json"))
        # save label_map and max_seq_length of trained model
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": args.do_lower_case,
            "max_seq_length": args.max_seq_length,
            "num_labels": num_labels,
            "label_map": label_map
        }
        json.dump(model_config,
                  open(os.path.join(args.output_dir, "model_config.json"), "w"),
                  indent=4)

    if args.do_eval:
        # load tokenizer
        tokenizer = FullTokenizer(os.path.join(args.output_dir, "vocab.txt"),
                                  args.do_lower_case)
        # model build hack: run the model once on dummy inputs so the saved
        # weights can be loaded
        config = json.load(open(os.path.join(args.output_dir, "bert_config.json")))
        ner = BertNer(config, tf.float32, num_labels, args.max_seq_length)
        ids = tf.ones((1, 128), dtype=tf.int32)
        _ = ner(ids, ids, ids, ids, training=False)
        ner.load_weights(os.path.join(args.output_dir, "model.h5"))

        # load the test or development set based on args
        if args.eval_on == "dev":
            eval_examples = processor.get_dev_examples(args.data_dir)
        elif args.eval_on == "test":
            eval_examples = processor.get_test_examples(args.data_dir)
        else:
            raise ValueError("eval_on must be 'dev' or 'test'")

        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_input_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.input_ids for f in eval_features], dtype=np.int32))
        all_input_mask = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.input_mask for f in eval_features], dtype=np.int32))
        all_segment_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.segment_ids for f in eval_features], dtype=np.int32))
        all_valid_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.valid_ids for f in eval_features], dtype=np.int32))
        all_label_ids = tf.data.Dataset.from_tensor_slices(
            np.asarray([f.label_id for f in eval_features], dtype=np.int32))

        eval_data = tf.data.Dataset.zip(
            (all_input_ids, all_input_mask, all_segment_ids, all_valid_ids,
             all_label_ids))
        batched_eval_data = eval_data.batch(args.eval_batch_size)

        loss_metric = tf.keras.metrics.Mean()
        epoch_bar = master_bar(range(1))
        pb_max_len = math.ceil(
            float(len(eval_features)) / float(args.eval_batch_size))

        y_true = []
        y_pred = []
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for epoch in epoch_bar:
            for (input_ids, input_mask, segment_ids, valid_ids,
                 label_ids) in progress_bar(batched_eval_data,
                                            total=pb_max_len,
                                            parent=epoch_bar):
                logits = ner(input_ids, input_mask, segment_ids, valid_ids,
                             training=False)
                logits = tf.argmax(logits, axis=2)
                for i, label in enumerate(label_ids):
                    temp_1 = []
                    temp_2 = []
                    for j, m in enumerate(label):
                        if j == 0:
                            continue
                        elif label_ids[i][j].numpy() == len(label_map):
                            y_true.append(temp_1)
                            y_pred.append(temp_2)
                            break
                        else:
                            temp_1.append(label_map[label_ids[i][j].numpy()])
                            temp_2.append(label_map[logits[i][j].numpy()])

        report = classification_report(y_true, y_pred, digits=4)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)
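# A tiny standalone demo of the masking used in train_step above: positions
# where label_mask is False are dropped with tf.boolean_mask before the loss is
# computed, so padding never contributes to the gradient. All values below are
# synthetic, and from_logits=True is used here only because the scores are
# random (the training loop above uses its own loss_fct settings).
import tensorflow as tf


def _masked_token_loss_demo():
    num_labels = 5
    logits = tf.random.uniform((2, 4, num_labels))   # [batch, seq_len, num_labels]
    label_ids = tf.constant([[1, 2, 0, 0], [3, 1, 4, 0]])
    label_mask = tf.constant([[True, True, False, False],
                              [True, True, True, False]])

    flat_logits = tf.reshape(logits, (-1, num_labels))
    flat_labels = tf.reshape(label_ids, (-1,))
    flat_mask = tf.reshape(label_mask, (-1,))

    loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
    per_token = loss_fct(tf.boolean_mask(flat_labels, flat_mask),
                         tf.boolean_mask(flat_logits, flat_mask))
    return per_token  # shape (5,): one loss per unmasked token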
def run_customized_training(strategy, albert_config, tinybert_config,
                            max_seq_length, max_predictions_per_seq, model_dir,
                            steps_per_epoch, steps_per_loop, epochs, initial_lr,
                            warmup_steps, input_files, train_batch_size,
                            use_mlm_loss):
    """Run ALBERT-to-TinyBERT distillation pretraining using the low-level API."""
    train_input_fn = functools.partial(get_pretrain_input_data, input_files,
                                       max_seq_length, max_predictions_per_seq,
                                       train_batch_size, strategy)

    with strategy.scope():
        # albert, albert_encoder = albert_model.pretrain_model(
        #     albert_config, max_seq_length, max_predictions_per_seq)
        train_model, albert, tinybert = tinybert_model.train_tinybert_model(
            tinybert_config, albert_config, max_seq_length,
            max_predictions_per_seq)
        albert.summary()
        tinybert.summary()
        train_model.summary()

        if FLAGS.init_checkpoint:
            logging.info(
                f"model pre-trained weights loaded from {FLAGS.init_checkpoint}")
            train_model.load_weights(FLAGS.init_checkpoint)

        learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=initial_lr,
            decay_steps=int(steps_per_epoch * epochs),
            end_learning_rate=0.0)
        if warmup_steps:
            learning_rate_fn = WarmUp(initial_learning_rate=initial_lr,
                                      decay_schedule_fn=learning_rate_fn,
                                      warmup_steps=warmup_steps)
        if FLAGS.optimizer == "lamp":
            optimizer_fn = LAMB
        else:
            optimizer_fn = AdamWeightDecay

        optimizer = optimizer_fn(
            learning_rate=learning_rate_fn,
            weight_decay_rate=FLAGS.weight_decay,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=FLAGS.adam_epsilon,
            exclude_from_weight_decay=['layer_norm', 'bias'])
        train_model.optimizer = optimizer

        # Note: model_dir is shared by albert and tinybert here; this needs to be changed.
        if FLAGS.do_train:
            trained_model = run_customized_training_loop(
                strategy=strategy,
                models=[albert, tinybert, train_model],
                model=train_model,
                albert=albert,
                tinybert=tinybert,
                start_wtih_trained_model=FLAGS.start_with_train_model,
                loss_fn=get_loss_fn(loss_factor=1.0 /
                                    strategy.num_replicas_in_sync),
                model_dir=model_dir,
                train_input_fn=train_input_fn,
                steps_per_epoch=steps_per_epoch,
                steps_per_loop=steps_per_loop,
                epochs=epochs,
            )

    # Creates the core models outside distribution strategy scope.
    training, albert, tinybert = tinybert_model.train_tinybert_model(
        tinybert_config, albert_config, max_seq_length, max_predictions_per_seq)

    # Restores the core models from the training checkpoints and saves weights
    # that contain only the core models.
    # Checkpoints are written during training; after training finishes, the
    # model is read back from the checkpoint and stored as .h5 files.
    # Locate the albert (full training model) checkpoint.
    checkpoint_model = tf.train.Checkpoint(model=training)
    latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
    assert latest_checkpoint_file
    logging.info('Checkpoint file %s found and restoring from checkpoint',
                 latest_checkpoint_file)
    status = checkpoint_model.restore(latest_checkpoint_file)
    status.assert_existing_objects_matched().expect_partial()

    # Locate the tinybert checkpoint.
    # checkpoint_tinybert = tf.train.Checkpoint(model=tinybert)
    # latest_tinybert_checkpoint_file = tf.train.latest_checkpoint(tinybert_model_dir)
    # assert latest_tinybert_checkpoint_file
    # logging.info('Checkpoint_Tinybert file %s found and restoring from '
    #              'checkpoint', latest_tinybert_checkpoint_file)
    # status_tinybert = checkpoint_albert.restore(latest_tinybert_checkpoint_file)
    # status_tinybert.assert_existing_objects_matched().expect_partial()

    # Create the output directory for the exported weights.
    if not os.path.exists(model_dir + '/models/'):
        os.makedirs(model_dir + '/models/')
    albert.save_weights(f"{model_dir}/models/albert_model.h5")
    tinybert.save_weights(f"{model_dir}/models/tinybert_model.h5")
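# The distillation objective built by get_loss_fn above lives elsewhere in the
# repo and is not shown here. A common TinyBERT-style choice, given only as a
# hedged sketch and not as this repo's actual loss, matches student layers to
# teacher layers with mean-squared error:
import tensorflow as tf


def _layer_distillation_loss_sketch(teacher_hidden, student_hidden, loss_factor=1.0):
    """teacher_hidden / student_hidden: equal-length lists of [batch, seq, hidden]
    tensors, already mapped so each student layer has a teacher target."""
    losses = [tf.reduce_mean(tf.square(t - s))
              for t, s in zip(teacher_hidden, student_hidden)]
    return tf.add_n(losses) * loss_factor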